Example #1
def run():
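    # Poll MongoDB for report requests with processStatus 0, export each to
    # Excel via data_code.run2, upload the file to the "xiniudata-report" OSS
    # bucket, and mark the request done with a download link; sleep 30 seconds
    # between polls.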
    collection = mongo.task.data_report

    while True:
        items = collection.find({'processStatus': 0}).limit(10)

        for item in items:
            try:
                startDate = item['param']['startDate']
                endDate = item['param']['endDate']
                logger.info('processing %s ~ %s.xlsx' % (startDate, endDate))
                df, columns = data_code.run2(conn,
                                             mongo,
                                             startDate=startDate,
                                             endDate=endDate,
                                             param=item['param'])
                df.to_excel('test.xlsx',
                            index=False,
                            columns=columns,
                            encoding="utf-8")
                path = os.path.join(sys.path[0], 'test.xlsx')

                fileid = util.get_uuid()

                oss = oss2_helper.Oss2Helper("xiniudata-report")
                fp = file(path, "rb")
                oss.put(
                    fileid,
                    fp,
                    headers={
                        "Content-Type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                        "x-oss-meta-filename": 'funding_news_report_%s~%s.xlsx' % (startDate, endDate)
                    })
                fp.close()

                logger.info('uploaded funding_news_report_%s ~ %s.xlsx' %
                            (startDate, endDate))
                collection.update_one(
                    {'_id': item['_id']},
                    {'$set': {
                        'processStatus': 1,
                        'link': 'http://www.xiniudata.com/file/report/%s' % fileid
                    }})
            except Exception as e:
                logger.info(e)

        logger.info('sleep')
        time.sleep(30)
Example #2
def process(rep):
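    # Download the PDF at rep["durl"], read its page count and creation date,
    # skip duplicates by md5/title, upload it to the "xiniudata-report" OSS
    # bucket, and insert a metadata record into mongo.article.report. The res
    # counter caps the loop at 20 iterations.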
    res = 0
    while True:
        delete()
        res += 1
        if res > 20:
            return False
        run(rep["durl"])
        logger.info("saving done")
        file_path = "download.pdf"
        if not os.path.isfile(file_path):
            return False

        # logger.info(file_path)
        # try:
        #     fp = open(file_path, "rb")
        #     pdfReader = PdfFileReader(fp)
        #     logger.info("read done")
        #     if pdfReader.isEncrypted:
        #         return False
        #
        # except:
        #     continue
        # pages = pdfReader.getNumPages()
        pages, pdfcreationDate = getPage(file_path)
        if pdfcreationDate is None:
            return False
        # fp.close()
        size = os.path.getsize(file_path)

        md5 = util.get_file_md5(file_path)

        if check_file_exists(md5, rep["title"]):
            return False

        fileid = util.get_uuid()
        logger.info("%s, %s, %s, %s, %s, %s", rep["title"], size,
                    pdfcreationDate, pages, md5, fileid)

        oss = oss2_helper.Oss2Helper("xiniudata-report")
        fp = file(file_path, "rb")
        oss.put(fileid,
                fp,
                headers={
                    "Content-Type": "application/pdf",
                    "x-oss-meta-filename": rep["filename"]
                })
        fp.close()

        mongo = db.connect_mongo()
        mongo.article.report.insert_one({
            "source": rep["source"],
            "description": None,
            "title": rep["title"],
            "filename": rep["filename"],
            "size": size,
            "pdfCreationDate": pdfcreationDate,
            "pages": pages,
            "md5": md5,
            "fileid": fileid,
            "createTime": datetime.datetime.now() - datetime.timedelta(hours=8),
            "modifyTime": datetime.datetime.now() - datetime.timedelta(hours=8),
            "type": 78001
        })
        mongo.close()
        return True
Example #3
import loghelper, config, oss2_helper  # loghelper/config assumed project util modules, as in Example #5
from kafka import KafkaClient, SimpleProducer  # old kafka-python (<1.0) client API used below

# logger
loghelper.init_logger("stock_aggregate", stream=True)
logger = loghelper.get_logger("stock_aggregate")

source_map = {
    13400: "全国中小企业股份转让系统|http://www.neeq.com.cn",
    13401: "上海证券交易所|http://www.sse.com.cn",
    13402: "深圳证券交易所|http://www.szse.cn"
}
round_map = {13400: 1105, 13401: 1110, 13402: 1110}

# kafka
kafkaProducer = None

oss2put = oss2_helper.Oss2Helper()


def init_kafka():
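    # Create the module-level Kafka producer; send_message() calls this lazily
    # on first use.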
    global kafkaProducer
    url = config.get_kafka_config()
    kafka = KafkaClient(url)
    # HashedPartitioner is default
    kafkaProducer = SimpleProducer(kafka)


def send_message(company_id, action):
    if kafkaProducer is None:
        init_kafka()

    #action: create, delete
Example #4
def process(org):
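    # Drain the organization's coldcall IMAP mailbox one message at a time,
    # skip subjects already stored in sourcedeal, insert each new deal, pick an
    # assignee, and upload document attachments to OSS.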
    if org["coldcall_imap_server"] is None:
        return

    logger.info("orgId: %s, orgName: %s", org["id"], org["name"])

    re_name = re.compile(
        r'([\[\(] *)?(RE?S?|FYI|RIF|I|FS|VB|RV|ENC|ODP|PD|YNT|ILT|SV|VS|VL|AW|WG|ΑΠ|ΣΧΕΤ|ΠΡΘ|תגובה|הועבר|主题|转发|FWD?) *([-:;)\]][ :;\])-]*|$)|\]+ *$',
        re.IGNORECASE)

    while True:
        msgs = email_reader.receive(org["coldcall_imap_server"],
                                    org["coldcall_imap_port"],
                                    org["coldcall_username"],
                                    org["coldcall_password"],
                                    one=True)
        if len(msgs) == 0:
            break

        for msg in msgs:
            if msg["html"] is not None:
                parser = html2text.HTML2Text()
                parser.ignore_emphasis = True
                parser.single_line_break = True
                msg["html_text"] = parser.handle(msg["html"])
            else:
                msg["html_text"] = None

            logger.info(msg["subject"])
            logger.info(msg["from"])
            logger.info(msg["to"])
            logger.info(msg["cc"])
            # logger.info(msg["body"])
            # logger.info(msg["html_text"])
            logger.info("attachments=%d" % len(msg["attachments"]))
            for attach in msg["attachments"]:
                logger.info(attach.name)

            title = re_name.sub('', msg["subject"]).strip()
            title_md5 = util.md5str(title)

            #insert
            conn = db.connect_torndb()
            cc = conn.get(
                "select * from sourcedeal where orgId=%s and titleMd5=%s and origin=%s limit 1",
                org["id"], title_md5, msg["from"])
            conn.close()
            if cc is not None:
                logger.info("%s Exists!" % title)
                continue

            content = msg["html_text"]
            if content is None:
                content = msg["body"]
            if content is None:
                content = ""
            content = content.strip()
            if len(content) > 20000:
                content = content[0:20000]

            sponsor_id = find_user(org["id"], msg["from"])
            logger.info("sponsor_id=%s" % sponsor_id)
            assignee_id = find_user(org["id"], msg["cc"])
            logger.info("assignee_id=%s" % assignee_id)

            conn = db.connect_torndb()
            cc_id = conn.insert(
                "insert sourcedeal(title,titleMd5,content,orgId,createTime,origin,assignee,sponsor) \
                                                values(%s,%s,%s,%s,%s,%s,%s,%s)",
                title, title_md5, content, org["id"], msg["date"], msg["from"],
                assignee_id, sponsor_id)

            if assignee_id is None:
                ids = get_investment_manager_ids(org["id"])
                assignee_id = choice(ids)
                conn.update("update sourcedeal set assignee=%s where id=%s",
                            assignee_id, cc_id)
                conn.insert(
                    "insert sourcedeal_forward(sourcedealId,toUserId,createTime) "
                    "values(%s,%s,%s)", cc_id, assignee_id, msg["date"])
            else:
                conn.insert(
                    "insert sourcedeal_forward(sourcedealId,fromUserId,toUserId,createTime) "
                    "values(%s,%s,%s,%s)", cc_id, sponsor_id, assignee_id,
                    msg["date"])

            for attach in msg["attachments"]:
                if attach.name is not None and attach.name.strip() != "":
                    name = attach.name.strip()
                    if not name.lower().endswith("pdf") and \
                            not name.lower().endswith("rar") and \
                            not name.lower().endswith("zip") and \
                            not name.lower().endswith("7z") and \
                            not name.lower().endswith("ppt") and \
                            not name.lower().endswith("pptx") and \
                            not name.lower().endswith("doc") and \
                            not name.lower().endswith("docx") and \
                            not name.lower().endswith("xls") and \
                            not name.lower().endswith("xlsx"):
                        continue

                    (content_type, encoding) = mimetypes.guess_type(name)
                    if content_type is None:
                        content_type = "application/octet-stream"
                    data = attach.getvalue()
                    # mongo = db.connect_mongo()
                    # imgfs = gridfs.GridFS(mongo.gridfs)
                    # logo_id = imgfs.put(data, content_type=content_type, filename=name)
                    # mongo.close()
                    logo_id = util.get_uuid()
                    logger.info("gridfs logo_id=%s" % logo_id)

                    oss2 = oss2_helper.Oss2Helper()
                    headers = {"Content-Type": content_type}
                    oss2.put(str(logo_id), data, headers=headers)

                    conn.insert(
                        "insert sourcedeal_file(sourcedealId,filename,fileId,createTime) "
                        "values(%s,%s,%s,%s)", cc_id, name, logo_id,
                        msg["date"])
            conn.close()
Example #5
import os
import sys

from bson import ObjectId
from gridfs import GridFS

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper, db, util, oss2_helper

#logger
loghelper.init_logger("migrate_file", stream=True)
logger = loghelper.get_logger("migrate_file")

mongo = db.connect_mongo()
grid = GridFS(mongo.gridfs)
oss2 = oss2_helper.Oss2Helper()


def save_oss2_image(grid_id):
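    # Copy an image out of GridFS to OSS under the same id, converting it with
    # util.convert_image first; ids already listed in mongo.temp.gridid are
    # skipped.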
    if grid_id is None or grid_id.strip() == "":
        return

    item = mongo.temp.gridid.find_one({"gridid": grid_id})
    if item is not None:
        return

    out = grid.get(ObjectId(grid_id))
    logger.info(out.name)
    img, xsize, ysize = util.convert_image(out, out.name)
    headers = {"Content-Type": "image/jpeg"}
    oss2.put(grid_id, img, headers=headers)
Example #6
def process(dir_path, filename):
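    # Validate that the path is an existing .pdf, decrypt it if it is
    # encrypted, collect page count, creation time, size and md5, then upload
    # it to the "xiniudata-report" OSS bucket and save a record.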
    # logger.info(filename)
    file_path = os.path.join(dir_path, filename)
    if not os.path.isfile(file_path):
        return False
    if not filename.lower().endswith(".pdf"):
        return False
    # logger.info(file_path)

    fp = file(file_path, "rb")
    pdfReader = PdfFileReader(fp)
    if pdfReader.isEncrypted:
        fp.close()
        logger.info("File encrypted! filename: %s", filename)
        decrypt_pdf(file_path)
        fp = file(file_path, "rb")
        pdfReader = PdfFileReader(fp)

    # creationDate = pdfReader.documentInfo.get("/CreationDate")
    # if not isinstance(creationDate, str):
    #     try:
    #         creationDate = creationDate.getObject()
    #     except:
    #         traceback.print_exc()
    #         return False

    pages = pdfReader.getNumPages()
    fp.close()

    # try:
    #     datestring = creationDate[2:-7]
    #     ts = strptime(datestring, "%Y%m%d%H%M%S")
    # except:
    #     traceback.print_exc()
    #     return False
    # dt = datetime.fromtimestamp(mktime(ts)) - timedelta(hours=8)
    ts = os.path.getctime(file_path)
    dt = datetime.fromtimestamp(ts) - timedelta(hours=8)

    size = os.path.getsize(file_path)
    title = filename[0:-4].strip()
    source = None
    if u":" in title:
        strs = title.split(u":", 1)
        source = strs[0]
        title = strs[1]

    md5 = util.get_file_md5(file_path)

    if check_file_exists(md5, title):
        return True

    fileid = util.get_uuid()
    logger.info("%s, %s, %s, %s, %s, %s", title, size, dt, pages, md5, fileid)

    oss = oss2_helper.Oss2Helper("xiniudata-report")
    fp = file(file_path, "rb")
    oss.put(fileid,
            fp,
            headers={
                "Content-Type": "application/pdf",
                "x-oss-meta-filename": filename.strip()
            })
    fp.close()

    save(source, filename, title, size, dt, pages, md5, fileid)
    return True
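
All six examples use the helper the same way: construct Oss2Helper (optionally with a bucket name), then call put() with an object key, a file object or bytes, and a headers dict that at least sets Content-Type (several examples also pass an x-oss-meta-filename entry). A minimal sketch of that pattern, assuming the same oss2_helper and util modules; upload_report is a hypothetical wrapper written for illustration, not part of the project:

import util
import oss2_helper


def upload_report(path, display_name):
    # Store the object under a random UUID, as the examples above do.
    fileid = util.get_uuid()
    oss = oss2_helper.Oss2Helper("xiniudata-report")
    with open(path, "rb") as fp:
        oss.put(fileid,
                fp,
                headers={
                    "Content-Type": "application/pdf",
                    "x-oss-meta-filename": display_name
                })
    return fileid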