Ejemplo n.º 1
0
    def clear_items(self):
        """Set the status of today's saoanzi items and e-mail a CSV summary.

        For every row returned by ``dbutil.get_daily_saoanzi_sources`` the
        item status is updated to:

        * ``'P'`` when ``self.tcg.need_verify`` is truthy or the company is
          not active (a task is also generated via ``self.tcg.generate_tc``);
        * ``'N'`` when ``self.__valid_message`` rejects the row;
        * ``'Y'`` otherwise.

        One report row is collected per source row. When nothing was
        collected the method returns without writing the CSV or sending mail.
        """
        file_path = os.path.join(
            os.path.split(os.path.realpath(__file__))[0], u'dumps/saoanzi.csv')
        data = []
        for anzi in dbutil.get_daily_saoanzi_sources(self.db, self.today):
            cactive = dbutil.get_company_active(self.db, anzi.companyId)
            need_verify = self.tcg.need_verify(anzi.companyId)
            # Evaluated once; drives both the stored status and the CSV label.
            to_check = need_verify or (cactive != 'Y')
            if to_check:
                self.tcg.generate_tc(
                    json.dumps({
                        'id': anzi.companyId,
                        'source': 'track_saoanzi'
                    }))
                status = 'P'
            elif not self.__valid_message(anzi):
                status = 'N'
            else:
                status = 'Y'
            dbutil.update_saoanzi_item_status(self.db, anzi.saoanziItemId,
                                              status)
            url = "http://pro.xiniudata.com/validator/#/company/%s/overview" \
                  % dbutil.get_company_code(self.db, anzi.companyId)
            data.append([
                dbutil.get_company_name(self.db, anzi.companyId), url,
                u'需要检查' if to_check else u'不需要检查', anzi.createTime,
                anzi.source
            ])
        if not data:
            return

        # Columns: 0 company name, 1 url, 2 check label, 3 create time,
        # 4 source.
        data = pandas.DataFrame(data)
        data.to_csv(file_path, encoding='utf_8_sig')
        # Distinct company names per check label.
        stat_verify = '<br/>'.join([
            '%s\t%s' % (title, len(set(detail[0])))
            for title, detail in data.groupby(2)
        ])
        # Number of rows per source.
        stat_source = '<br/>'.join([
            '%s\t%s' % (title, len(detail))
            for title, detail in data.groupby(4)
        ])
        stat = u'去重公司数<br/>%s<br/>每个源下的公司数<br/>%s\n' % (stat_verify,
                                                        stat_source)
        receivers = ['victor', 'erin', 'weiguangxiao', 'gewei']
        # NOTE(review): the previous format string contained no %s
        # placeholder, so the % operator raised TypeError. The domain used
        # here follows the other report mails in this file -- confirm.
        receivers = ';'.join(['%s@xiniudata.com' % r for r in receivers])
        title = u'扫案子项目列表 %s' % self.current_check_time.strftime('%Y-%m-%d %H')
        content = u'%s检查,今天共有%s个扫案子条目<br/>%s' % \
                  (self.current_check_time.strftime('%Y-%m-%d %H:%M'), len(data), stat)
        send_mail_file(u'烯牛扫案子后台', u'烯牛扫案子后台', "*****@*****.**",
                       receivers, title, content, file_path)
Ejemplo n.º 2
0
def send_qmp_email():
    """Mail an .xls attachment listing financing events from raw.qmp_rz_incr.

    When the local hour is 8, the 50 documents with the highest
    ``createtime`` are sent; at any other hour, the documents whose ``date``
    field equals today's date (``%Y-%m-%d``).
    """
    print('this time:%s to send email' % datetime.datetime.now())
    current_hour = time.localtime()[3]
    mongo = db.connect_mongo()
    incr = mongo.raw.qmp_rz_incr
    if current_hour != 8:
        today = datetime.date.today().strftime('%Y-%m-%d')
        items = list(incr.find({'date': today}))
    else:
        items = list(incr.find().sort('createtime', -1).limit(50))
    mongo.close()

    sender_alias = 'Hush'
    reply_to_alias = 'Hush'
    reply_to = '*****@*****.**'
    recipients = '[email protected];[email protected];[email protected];[email protected];[email protected];[email protected]'
    print('*******')
    mail_subject = '企名片日常融资事件'
    mail_body = '<html>共<b>%d</b>起融资事件,请查看附件</html>' % len(items)
    xls_name = 'qmp_rz_day.xls'

    book = xlwt.Workbook()
    sheet = book.add_sheet('A Work Sheet')
    # Header row.
    for col, header in enumerate(
            ('Product', 'Lunci', 'Date', 'Source', 'Jianjie')):
        sheet.write(0, col, header)

    # One data row per event, starting right below the header.
    for row, item in enumerate(items, 1):
        product = item.get('product')
        lunci = item.get('lunci')
        news_time = item.get('news_time')
        summary = item.get('weiyu').decode('utf-8')
        url = item.get('qmp_url').decode('utf-8')
        if len(url) <= 255:
            # Written as a HYPERLINK formula pointing at the URL itself.
            link = xlwt.Formula("HYPERLINK" + '("%s";"%s")' % (url, url))
        else:
            # URLs longer than 255 characters are written as plain text.
            link = url
        for col, value in enumerate(
                (product, lunci, news_time, link, summary)):
            sheet.write(row, col, value)

    book.save(xls_name)
    email_helper.send_mail_file(sender_alias, reply_to_alias, reply_to,
                                recipients, mail_subject, mail_body, xls_name)
    print('done')
Ejemplo n.º 3
0
def corp_merge3():
    """Auto-merge corporates that share an alias name and mail a report.

    Every name carried by more than one active ``corporate_alias`` row is
    examined. For each alias of that name the active corporate is loaded;
    a corporate with a ``corporate_stock_exchange_rel`` row sets a flag that
    prevents any merge for the name, and the remaining corporates are
    collected when they own an active company. When more than one corporate
    was collected, ``corporate_util.autoMerge`` is called and one
    ``+++``-separated line is added to the report. The report is written to
    ``me.txt`` next to the script (``sys.path[0]``) and sent as an
    attachment.
    """
    report_lines = []
    n = n1 = n2 = n3 = n4 = n5 = n6 = n7 = 0
    # The report is written to the very path that is attached to the mail;
    # previously it was written relative to the current working directory.
    path = os.path.join(sys.path[0], "me.txt")

    conn = db.connect_torndb()
    try:
        cnames = conn.query(
            "select name,count(*) as cnt from corporate_alias where (active is null or active !='N') "
            "and name is not null and name!=''  group by name having cnt>1")
        logger.info("total names: %s", len(cnames))

        for cname in cnames:
            pnames = []
            fundingFlag = False
            cfullFlag = True
            full_name = cname["name"]
            corporate_ids = []
            # Subset of corporate_ids whose fullName equals the alias name.
            corporate_ids_f = []
            stockFlag = False

            if full_name is None or full_name.strip() in ("", "-", "个人",
                                                          "扎堆"):
                continue

            corporate_aliases = conn.query(
                "select * from corporate_alias where name=%s and (active is null or active !='N')",
                full_name)
            for caa in corporate_aliases:
                ca = conn.get(
                    "select * from corporate where (active is null or active !='N') and id=%s",
                    caa["corporateId"])
                if ca is None:
                    continue

                c_stock = conn.get(
                    "select * from corporate_stock_exchange_rel where corporateId=%s limit 1",
                    ca["id"])
                if c_stock is not None:
                    stockFlag = True
                    continue

                company = conn.get(
                    "select * from company where corporateId=%s and (active is null or active='Y') limit 1",
                    ca["id"])
                if company is None or ca["id"] in corporate_ids:
                    continue

                corporate_ids.append(int(ca["id"]))
                if ca["fullName"] != full_name:
                    cfullFlag = False
                elif ca["id"] not in corporate_ids_f:
                    corporate_ids_f.append(int(ca["id"]))

                funding = conn.get(
                    "select * from funding where corporateId=%s and (active is null or active='Y') "
                    "order by fundingDate desc limit 1",
                    caa["corporateId"])
                if funding is not None:
                    fundingFlag = True

                pnames.append(company["name"])

            if len(corporate_ids) <= 1 or stockFlag is not False:
                continue

            vv = compare(pnames) if len(pnames) >= 2 else 0

            chinese, _ = name_helper.name_check(full_name)
            if chinese is True:
                chinese_type = "Y"
                n5 += 1
                if fundingFlag is True:
                    n3 += 1
                if cfullFlag is True:
                    n4 += 1
                if vv <= 0.75:
                    n7 += 1
            else:
                chinese_type = "N"
                n6 += 1

            n += 1
            logger.info("merge:%s %s-> %s", full_name, corporate_ids,
                        chinese_type)
            mflag = corporate_util.autoMerge(corporate_ids, full_name)
            if mflag == 1:
                n1 += 1
            else:
                n2 += 1

            # c1: exactly one corporate has the name as fullName;
            # c2: all of them do; c3: none of them does.
            c1 = "是" if len(corporate_ids_f) == 1 else "否"
            c2 = "是" if len(corporate_ids_f) == len(corporate_ids) else "否"
            c3 = "是" if len(corporate_ids_f) == 0 else "否"

            # NOTE(review): the "highly similar" label is chosen when
            # compare() <= 0.75 -- confirm compare() returns a distance
            # rather than a similarity score.
            report_lines.append(
                "%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s+++%s\n" % (
                    full_name,
                    ";".join([str(cid) for cid in corporate_ids]),
                    get_links(corporate_ids),
                    "中文名" if chinese_type == 'Y' else "英文名",
                    "有融资" if fundingFlag is True else "无融资",
                    "公司主要名称一致" if cfullFlag is True else "公司别名一致",
                    "短名高度相似" if vv <= 0.75 else "短名不相似",
                    "可以根据verify自动聚合" if mflag == 1 else " ",
                    c1, c2, c3))
    finally:
        # Release the connection even when a query or the merge raises.
        conn.close()

    with open(path, "w") as fp2:
        fp2.write("".join(report_lines))
    logger.info("merge num %s/%s/%s/%s/%s/%s/%s/%s", n, n1, n2, n3, n4, n5, n6,
                n7)
    content = '''<div>Dears,    <br /><br />

        附件是目前系统中存在重复的公司,请在后台搜索
        </div>
        '''
    logger.info(path)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in ["bamy"]]), "重复公司检索--人工审查",
        content, path)
Ejemplo n.º 4
0
def corp_merge2():
    """Auto-merge corporates with an identical fullName and mail a report.

    Every ``fullName`` shared by more than one active corporate is examined
    through its active aliases. Only corporates whose ``fullName`` equals
    the name, that have no ``corporate_stock_exchange_rel`` row and that own
    a company not marked inactive are collected; any stock-listed corporate
    prevents the merge for that name. ``corporate_util.autoMerge`` is called
    for each collected group, and groups whose result is not 1, 2 or 3 are
    written to ``me.txt`` next to the script (``sys.path[0]``), which is
    then sent as an attachment.

    The process exits via ``exit()`` when ``autoMerge`` returns ``None``.
    """
    n = n1 = n2 = n3 = n4 = 0
    # Write the report to the very path that is attached to the mail;
    # previously it was written relative to the current working directory.
    path = os.path.join(sys.path[0], "me.txt")

    conn = db.connect_torndb()
    try:
        # The with-block closes the file on every exit path, including the
        # SystemExit raised by exit() below.
        with open(path, "w") as fp2:
            cnames = conn.query(
                "select fullName,count(*) as cnt from corporate where (active is null or active !='N') "
                "and fullName is not null and fullName!='' group by fullName having cnt>1"
            )
            for cname in cnames:
                full_name = cname["fullName"]
                corporate_ids = []
                stockFlag = False

                if full_name is None or full_name.strip() in ("", "-", "个人",
                                                              "扎堆"):
                    continue

                corporate_aliases = conn.query(
                    "select * from corporate_alias where name=%s and (active is null or active !='N')",
                    full_name)
                for caa in corporate_aliases:
                    ca = conn.get(
                        "select * from corporate where (active is null or active !='N') and id=%s",
                        caa["corporateId"])
                    if ca is None or ca["fullName"] != full_name:
                        continue

                    c_stock = conn.get(
                        "select * from corporate_stock_exchange_rel where corporateId=%s limit 1",
                        ca["id"])
                    if c_stock is not None:
                        stockFlag = True
                        continue

                    company = conn.get(
                        "select * from company where corporateId=%s and (active is null or active!='N') limit 1",
                        ca["id"])
                    if company is not None and ca["id"] not in corporate_ids:
                        corporate_ids.append(int(ca["id"]))

                if len(corporate_ids) <= 1 or stockFlag is not False:
                    continue

                logger.info("merge:%s-> %s", full_name, corporate_ids)
                n += 1
                mflag = corporate_util.autoMerge(corporate_ids, full_name)

                if mflag is None:
                    logger.info("wrong")
                    exit()
                if mflag == 1:
                    n1 += 1
                elif mflag == 2:
                    n2 += 1
                elif mflag == 3:
                    n3 += 1
                else:
                    # Result 4 and any other result are both reported for
                    # manual review; only result 4 is counted in n4.
                    if mflag == 4:
                        n4 += 1
                    fp2.write("%s+++%s+++%s\n" % (
                        full_name,
                        ";".join([str(cid) for cid in corporate_ids]),
                        get_links(corporate_ids)))
    finally:
        # Release the connection even when a query or the merge raises.
        conn.close()

    logger.info("merge num %s/%s/%s/%s/%s", n4, n3, n2, n1, n)
    content = '''<div>Dears,    <br /><br />

        附件是目前系统中存在重复的公司,请在后台搜索
        </div>
        '''
    logger.info(path)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in ["celine", "zhlong", "bamy"]]),
        "重复公司检索--人工审查", content, path)
Ejemplo n.º 5
0
def run_week():
    """Build last week's search report and mail it as an Excel attachment.

    Queries ``mongo.log.user_log`` for the previous week, joins user and
    organization names from the SQL connection, writes
    ``search_weekly_report.xlsx`` and mails it together with two HTML tables
    (top 10 organizations, top 100 keywords).

    NOTE(review): as written this function cannot run to completion:
    ``result`` is read before it is assigned (see the note further down).
    """
    mongo = db.connect_mongo()
    conn = db.connect_torndb()

    # Exclusive upper bound: 00:00 of the current week's Monday (tm_wday is 0
    # on Monday). The mail subject subtracts one day to display last Sunday.
    endDate = (datetime.datetime.today() -
               datetime.timedelta(days=time.localtime().tm_wday))
    endDate = datetime.datetime(endDate.year, endDate.month, endDate.day)
    # Lower bound: 00:00 of last week's Monday.
    startDate = (datetime.datetime.today() -
                 datetime.timedelta(days=time.localtime().tm_wday + 7))
    startDate = datetime.datetime(startDate.year, startDate.month,
                                  startDate.day)

    # Logged-in and non-logged-in users are queried separately.
    # NOTE(review): the "$regex" value is a JavaScript-style regex literal
    # inside a plain string, so the surrounding slashes are part of the
    # pattern; presumably "^/search" was intended -- confirm.
    # NOTE(review): this query is named "login" but selects documents WITHOUT
    # a "user" field, while the "tourist" query below selects documents WITH
    # one; the two conditions look swapped -- confirm.

    result_login = list(
        mongo.log.user_log.find(
            {
                '$and': [{
                    'url_type': 'front'
                }, {
                    "requestURL": {
                        "$regex": "/^\/search/"
                    }
                }, {
                    "user": {
                        "$exists": False
                    }
                }, {
                    'time': {
                        '$gt': startDate
                    }
                }, {
                    'time': {
                        '$lt': endDate
                    }
                }]
            }, {'_id': 0}))

    #TODO

    result_tourist = list(
        mongo.log.user_log.find(
            {
                '$and': [{
                    'url_type': 'front'
                }, {
                    "requestURL": {
                        "$regex": "/^\/search/"
                    }
                }, {
                    "user": {
                        "$exists": True
                    }
                }, {
                    'time': {
                        '$gt': startDate
                    }
                }, {
                    'time': {
                        '$lt': endDate
                    }
                }]
            }, {'_id': 0}))

    #TODO

    import pandas as pd
    # NOTE(review): `result` is only bound further down (by conn.query), so it
    # is a local name that is still unassigned here and this line raises
    # UnboundLocalError. result_login / result_tourist above are never used.
    # The intended input of this DataFrame must be decided by the author.
    df = pd.DataFrame(result)
    uids = [i['userId'] for i in result]

    # Look up user name and organization name for the collected user ids.
    result = conn.query(
        '''select u.id userId,u.username userName,o.name orgName
    from user u 
    left join user_organization_rel r on r.userId=u.id
    left join organization o on r.organizationId=o.id
    where (r.active='Y' or r.active is null)
    and u.id in %s''', uids)

    df2 = pd.DataFrame(result)
    # Left merge keeps every log row, including those without a user match.
    df3 = pd.merge(df, df2, on='userId', how='left')

    def keyword(x):
        # Take the part of visitURL after the last 'open/' (empty string when
        # 'open/' is absent) and pass it through unquote().
        if x.visitURL.find('open/') >= 0:
            keyword = x.visitURL.split('open/')[-1].strip()
        else:
            keyword = ''
        keyword = unquote(keyword.encode())
        return keyword.decode()

    df3['keyword'] = df3.apply(keyword, axis=1)

    # Comma-joined occurrences of the two special organization names found in
    # orgName.
    # NOTE(review): after the left merge orgName may be missing for some
    # rows; presumably a null guard is needed before re.findall -- confirm.
    df3['specialOrg'] = df3.apply(
        lambda x: ','.join(re.findall(u'烯牛|以太', x.orgName)), axis=1)

    # The attachment excludes rows whose specialOrg is exactly '烯牛'.
    df3 = df3[df3.specialOrg != '烯牛']
    fileName = 'search_weekly_report.xlsx'
    df3.to_excel(
        fileName,
        index=0,
        columns=['visitURL', 'userName', 'orgName', 'ip', 'time', 'keyword'])

    # The statistics in the mail body use only rows with no special
    # organization match at all.
    df3 = df3[df3.specialOrg == '']

    content = df3.orgName.value_counts().to_frame()[:10].to_html()
    content = '''<div>Dears,    <br /><br />

    附件是上周的用户搜索记录,搜索量前10的机构为:
    </div>
    ''' + content

    content2 = df3.keyword.value_counts().to_frame()[:100].to_html()
    content2 = '''
    <div>
    <br />
    前100名的搜索词为(统计已过滤掉以太和烯牛成员的搜索数据):
    </div>
    
    ''' + content2

    content = content + content2

    # send_mail_file(from_alias, reply_alias, reply_email, to, subject, content, file)
    # '[email protected];[email protected]',
    recieveList = [
        'zhlong', 'jiaojunpeng', 'avery', 'arthur', 'bamy', 'celine', 'marchy',
        'haiming'
    ]
    # recieveList = ['zhlong']

    # NOTE(review): the workbook is written relative to the current working
    # directory, but the attachment path is built from sys.path[0]; the two
    # only coincide when those directories are the same.
    path = os.path.join(sys.path[0], fileName)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com'
                  for i in recieveList]), "上周搜索周报(%s ~ %s)" %
        (startDate.strftime('%Y-%m-%d'),
         (endDate + datetime.timedelta(days=-1)).strftime('%Y-%m-%d')),
        content, path)

    mongo.close()
    conn.close()
Ejemplo n.º 6
0
def send_tzj_email():
    """Mail an .xls attachment listing financing events from raw.tzj_rz_incr.

    When the local hour is 8, the 50 documents with the highest
    ``createtime`` are sent; at any other hour, the documents whose ``date``
    field equals today's date (``%Y-%m-%d``).
    """
    print('this time:%s to send email' % datetime.datetime.now())
    current_hour = time.localtime()[3]
    mongo = db.connect_mongo()
    incr = mongo.raw.tzj_rz_incr
    if current_hour != 8:
        today = datetime.date.today().strftime('%Y-%m-%d')
        items = list(incr.find({'date': today}))
    else:
        items = list(incr.find().sort('createtime', -1).limit(50))
    mongo.close()

    sender_alias = 'Hush'
    reply_to_alias = 'Hush'
    reply_to = '*****@*****.**'
    recipients = '[email protected];[email protected];[email protected];[email protected];[email protected];[email protected]'
    print('*******')
    mail_subject = '投资界日常融资事件'
    mail_body = '<html>共<b>%d</b>起融资事件,请查看附件</html>' % len(items)
    xls_name = 'tzj_rz_day.xls'

    def as_cell(url):
        # URLs longer than 255 characters are written as plain text; shorter
        # ones become a HYPERLINK formula pointing at the URL itself.
        if len(url) > 255:
            return url
        return xlwt.Formula("HYPERLINK" + '("%s";"%s")' % (url, url))

    book = xlwt.Workbook()
    sheet = book.add_sheet('A Work Sheet', cell_overwrite_ok=True)
    # Header row.
    headers = ('Product', 'Lunci', 'Date', 'Pro_Source', 'Invest_Source',
               'Investment')
    for col, header in enumerate(headers):
        sheet.write(0, col, header)

    # One data row per event, starting right below the header.
    for row, item in enumerate(items, 1):
        product = item.get('product')
        lunci = item.get('lunci')
        event_date = item.get('date')
        project_url = item.get('project_url').decode('utf-8')
        invest_url = item.get('invest_url').decode('utf-8')
        investor = item.get('investr')
        values = (product, lunci, event_date, as_cell(project_url),
                  as_cell(invest_url), investor)
        for col, value in enumerate(values):
            sheet.write(row, col, value)

    book.save(xls_name)
    email_helper.send_mail_file(sender_alias, reply_to_alias, reply_to,
                                recipients, mail_subject, mail_body, xls_name)
    print('done')
Ejemplo n.º 7
0
def extract_data(investorId):
    """Write a review report on one investor's funds, GPs and aliases, and mail it.

    The report (``me.txt`` next to the script, ``sys.path[0]``) has two
    sections separated by blank lines:

    1. one line per active ``investor_fund`` and ``investor_gp`` row:
       fullName, memo, whether an AMAC id is set, whether the name is one of
       the investor's type-12010 aliases, and the number of ``invests`` in
       the matching ``info.gongshang`` document;
    2. one line per type-12010 alias not already listed in section 1:
       name, createUser, whether ``amac_util`` finds it as manager or fund,
       legal person name, number of invests, the invested names, and those
       invested names that also appear in section 1.

    :param investorId: id of the investor whose rows are selected.
    """
    # Write the report to the very path that is attached to the mail;
    # previously it was written relative to the current working directory.
    path = os.path.join(sys.path[0], "me.txt")
    lines = []

    conn = db.connect_torndb()
    mongo = db.connect_mongo()
    try:
        collection_gongshang = mongo.info.gongshang

        oaliases = conn.query(
            "select * from investor_alias where (active is null or active='Y') and "
            "(verify is null or verify !='N') and investorId=%s", investorId)
        oanames = [
            alias["name"] for alias in oaliases
            if alias["name"] is not None and alias["type"] == 12010
        ]
        # Names already covered by the fund / GP section.
        anames = []

        def non_empty_invests(item):
            # The "invests" list of a gongshang document, or None when the
            # document is missing, has no such key, or the list is empty.
            if item is not None and "invests" in item and len(
                    item["invests"]) > 0:
                return item["invests"]
            return None

        def add_entity_rows(rows, amac_key):
            # Shared by funds and GPs, which differ only in the AMAC id column.
            for row in rows:
                full_name = row["fullName"]
                amacf = "是" if row[amac_key] is not None else "否"
                of = "是" if full_name in oanames else "否"
                invests = non_empty_invests(
                    collection_gongshang.find_one({'name': full_name}))
                numiv = len(invests) if invests is not None else "0"
                lines.append("%s+++%s+++%s+++%s+++%s\n" % (
                    full_name, row["memo"], amacf, of, numiv))
                if full_name not in anames:
                    anames.append(full_name)

        investorfs = conn.query(
            "select * from investor_fund where (active is null or active='Y') and "
            "(verify is null or verify !='N') and investorId=%s;", investorId)
        add_entity_rows(investorfs, "amacFundId")

        investorgs = conn.query(
            "select * from investor_gp where (active is null or active='Y') and "
            "(verify is null or verify !='N') and investorId=%s;", investorId)
        add_entity_rows(investorgs, "amacManagerId")

        lines.append("\n\n")

        for oal in oaliases:
            name = oal["name"]
            if name is None or name in anames or oal["type"] != 12010:
                continue

            if amac_util.find_amac_manager(name) is not None \
                    or amac_util.find_amac_fund(name) is not None:
                amacf = "是"
            else:
                amacf = "否"
            createUser = oal["createUser"] if oal["createUser"] is not None else " "

            item = collection_gongshang.find_one({'name': name})
            if item is not None and "legalPersonName" in item \
                    and item["legalPersonName"].strip() not in ["", "-", "—"]:
                lp = item["legalPersonName"]
            else:
                lp = " "

            invests = non_empty_invests(item)
            if invests is not None:
                numiv = len(invests)
                ivnamesstr = ";".join([inv["name"] for inv in invests])
                ivnamesnn = [
                    inv["name"] for inv in invests if inv["name"] in anames
                ]
                ivnamesnnstr = ";".join(ivnamesnn) if len(ivnamesnn) > 0 else "无"
            else:
                numiv = "0"
                ivnamesstr = "无"
                ivnamesnnstr = "无"

            lines.append("%s+++%s+++%s+++%s+++%s+++%s+++%s\n" % (
                name, createUser, amacf, lp, numiv, ivnamesstr, ivnamesnnstr))

        logger.info("%s - %s - %s", investorId, len(oanames),
                    len(investorfs) + len(investorgs))
    finally:
        # Both connections are closed exactly once, after all lookups. The
        # Mongo client used to be closed inside the alias loop, i.e. after
        # every processed alias and never when no alias was processed.
        mongo.close()
        conn.close()

    with open(path, "w") as fp2:
        fp2.write("".join(lines))
    content = '''<div>Dears,    <br /><br />

                附件是目前系统中存在重复的公司,请在后台搜索
                </div>
                '''
    logger.info(path)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in ["bamy"]]), "重复机构检索--人工审查",
        content, path)
Ejemplo n.º 8
0
def dup_alias():
    """Deactivate duplicated type-12010 investor aliases and mail a report.

    Every name carried by more than one active type-12010
    ``investor_alias`` row is examined. When more than one of those rows
    belongs to an active investor, up to three independent clean-ups run:

    * all rows belong to the same investor: keep one alias, deactivate the
      other type-12010 rows of that name;
    * exactly one row has ``verify == 'Y'``: keep it, deactivate the others;
    * no row is verified: keep the rows of the investor with the highest
      ``fundingCntFrom2017`` (the first investor when none is above 0) and
      deactivate the rows of every other investor.

    Each examined name yields one ``+++``-separated report line. The report
    is written to ``me.txt`` next to the script (``sys.path[0]``) and mailed.
    """
    lines = []
    n = n1 = n2 = n3 = n4 = 0
    # Write the report to the very path that is attached to the mail;
    # previously it was written relative to the current working directory.
    path = os.path.join(sys.path[0], "me.txt")

    conn = db.connect_torndb()
    try:
        cnames = conn.query(
            "select name,count(*) as cnt from investor_alias where (active is null or active !='N') "
            "and name is not null and name!='' and type=12010 group by name having cnt>1"
        )
        logger.info(len(cnames))
        for cname in cnames:
            investor_ids = []       # one entry per alias row (may repeat)
            investor_ids_un = []    # distinct investor ids
            investor_aids_ver = []  # ids of alias rows with verify == 'Y'
            investor_as = conn.query(
                "select * from investor_alias where name=%s and (active is null or active !='N') and type=12010",
                cname["name"])
            for ia in investor_as:
                investor = conn.get(
                    "select * from investor where (active is null or active !='N') and id=%s",
                    ia["investorId"])
                if investor is None:
                    continue
                investor_ids.append(investor["id"])
                if investor["id"] not in investor_ids_un:
                    investor_ids_un.append(investor["id"])
                if ia["verify"] == "Y":
                    investor_aids_ver.append(ia["id"])

            if len(investor_ids) <= 1:
                continue

            n += 1
            logger.info("dup:%s -> %s", cname["name"], investor_ids)
            aa = "否"
            ab = "否"
            ac = "否"

            if len(investor_ids_un) == 1:
                logger.info("dup:%s -> %s -- %s", cname["name"], investor_ids,
                            "for same investor")
                aa = "是"
                # NOTE(review): this lookup is not restricted to type=12010
                # while the update below is; confirm the kept alias is
                # always a type-12010 row.
                ssinv = conn.get(
                    "select * from investor_alias where investorId=%s and name=%s and (active is null or active !='N') limit 1",
                    investor_ids_un[0], cname["name"])
                logger.info("here we want to save: %s", ssinv["id"])
                conn.update(
                    "update investor_alias set active='N', modifyUser=-571 where id!=%s and type=12010 and name=%s",
                    ssinv["id"], cname["name"])
                n1 += 1

            if len(investor_aids_ver) == 1:
                logger.info("dup:%s -> %s -- %s %s", cname["name"],
                            investor_ids, "for one verify",
                            investor_aids_ver[0])
                ab = "是"
                conn.update(
                    "update investor_alias set active='N', modifyUser=-571 where id!=%s and type=12010 and name=%s",
                    investor_aids_ver[0], cname["name"])
                n2 += 1

            if len(investor_aids_ver) == 0:
                logger.info("dup:%s -> %s -- %s", cname["name"], investor_ids,
                            "for None verify")
                ac = "是"
                # Keep the investor with the largest fundingCntFrom2017;
                # default to the first one.
                sid = investor_ids[0]
                f = 0
                for iid in investor_ids:
                    iinv = conn.get(
                        "select * from investor where (active is null or active !='N') and id=%s",
                        iid)
                    if iinv["fundingCntFrom2017"] > f:
                        f = iinv["fundingCntFrom2017"]
                        sid = iid
                logger.info("here we want to save: %s", sid)
                conn.update(
                    "update investor_alias set active='N', modifyUser=-571 where investorId!=%s and type=12010 and name=%s",
                    sid, cname["name"])
                n3 += 1

            lines.append("%s+++%s+++%s+++%s+++%s+++%s\n" % (
                cname["name"],
                ";".join([str(iid) for iid in investor_ids]),
                get_links(investor_ids), aa, ab, ac))
    finally:
        # Release the connection even when a query raises.
        conn.close()

    logger.info("%s - %s - %s - %s - %s", n, n1, n2, n3, n4)
    with open(path, "w") as fp2:
        fp2.write("".join(lines))
    content = '''<div>Dears,    <br /><br />

            附件是目前系统中存在重复的公司,请在后台搜索
            </div>
            '''
    logger.info(path)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in ["bamy"]]), "重复机构检索--人工审查",
        content, path)
Ejemplo n.º 9
0
def kuohao_alias():
    """Clean up investor aliases whose name contains parentheses; mail a report.

    For every active alias name matching ``%(%`` or ``%)%``, each type-12010
    alias row of an active investor is compared against the active aliases
    that carry the normalized name:

    * the normalized name belongs only to the same investor: the row is
      deactivated;
    * it belongs to other investors (too): a ``+++``-separated line is added
      to the report for manual review;
    * it belongs to nobody and ``name_helper.name_check`` reports a Chinese
      name: the row is renamed to the normalized name.

    The report is written to ``me.txt`` next to the script
    (``sys.path[0]``) and mailed.
    """
    lines = []
    n = n1 = n2 = n3 = n4 = 0
    # Write the report to the very path that is attached to the mail;
    # previously it was written relative to the current working directory.
    path = os.path.join(sys.path[0], "me.txt")

    conn = db.connect_torndb()
    try:
        cnames = conn.query(
            "select name,count(*) as cnt from investor_alias where (active is null or active !='N') "
            "and (name like %s or name like %s) group by name", '%(%', '%)%')

        for cname in cnames:
            wname = cname["name"]
            investors = conn.query(
                "select * from investor_alias where (active is null or active !='N') and name=%s",
                wname)
            for inv in investors:
                if inv["type"] != 12010:
                    continue
                wid = inv["investorId"]
                investor = conn.get(
                    "select * from investor where (active is null or active !='N') and id=%s",
                    wid)
                if investor is None:
                    continue
                n1 += 1

                # NOTE(review): as written both replace() calls substitute a
                # character with itself, so only strip() has an effect;
                # presumably one side of each pair was meant to be a
                # full-width parenthesis -- confirm against the original.
                mname = wname.replace("(", "(").replace(")", ")").strip()

                # Distinct active investors owning an active alias that is
                # named exactly like the normalized name.
                investor_ids = []
                i1s = conn.query(
                    "select * from investor_alias where name=%s and (active is null or active !='N')",
                    mname)
                for i1 in i1s:
                    iv1 = conn.get(
                        "select * from investor where (active is null or active !='N') and id=%s",
                        i1["investorId"])
                    if iv1 is not None and iv1["id"] not in investor_ids:
                        investor_ids.append(iv1["id"])

                if investor_ids:
                    if wid in investor_ids and len(investor_ids) == 1:
                        csameiid = "同一机构"
                        n2 += 1
                        conn.update(
                            "update investor_alias set active='N',modifyUser=-561 where id=%s",
                            inv["id"])
                    else:
                        csameiid = "多个机构"
                        n3 += 1
                        all_ids = [str(wid)] + investor_ids
                        lines.append("%s+++%s+++%s\n" % (
                            cname["name"],
                            ";".join([str(iid) for iid in all_ids]),
                            get_links(all_ids)))
                    logger.info("%s - %s - %s - %s", wname, str(wid),
                                ";".join([str(iid) for iid in investor_ids]),
                                csameiid)
                    n += 1
                else:
                    (chinese, cccompany) = name_helper.name_check(mname)
                    if chinese is True:
                        n4 += 1
                        logger.info("update!!!!!")
                        conn.update(
                            "update investor_alias set name=%s,modifyUser=-561 where id=%s",
                            mname, inv["id"])
        logger.info("%s - %s - %s - %s - %s", n, n1, n2, n3, n4)
    finally:
        # Release the connection even when a query raises.
        conn.close()

    with open(path, "w") as fp2:
        fp2.write("".join(lines))
    content = '''<div>Dears,    <br /><br />

                附件是目前系统中存在重复的公司,请在后台搜索
                </div>
                '''
    logger.info(path)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com' for i in ["bamy"]]), "重复机构检索--人工审查",
        content, path)
Ejemplo n.º 10
0
def _send_pro_search_report(mongo, conn):
    """Collect, export and e-mail last week's pro-edition search log.

    Reads the ``router == 'search'`` documents of ``mongo.log.page_view``
    for the previous Monday-to-Sunday week, joins them with user and
    organisation names from MySQL, writes the rows to
    ``search_weekly_report.xlsx`` and mails that workbook together with the
    top-10 organisations and the top-100 keywords.

    Args:
        mongo: connection returned by ``db.connect_mongo()``.
        conn: connection returned by ``db.connect_torndb()``.

    Returns:
        None. Returns early, without writing or mailing anything, when the
        week has no search page views.
    """
    import pandas as pd

    # Report window: [Monday 00:00 of last week, Monday 00:00 of this week).
    # The clock is read once so both bounds derive from the same instant.
    now = datetime.datetime.today()
    this_monday = (datetime.datetime(now.year, now.month, now.day) -
                   datetime.timedelta(days=now.weekday()))
    last_monday = this_monday - datetime.timedelta(days=7)
    # NOTE(review): the 8-hour shifts look like a UTC+8 <-> UTC conversion
    # for the stored ``time`` field -- confirm against the log writer.
    tz_shift = datetime.timedelta(hours=8)

    views = list(
        mongo.log.page_view.find(
            {
                '$and': [{
                    'router': 'search'
                }, {
                    'time': {
                        '$gt': last_monday - tz_shift
                    }
                }, {
                    'time': {
                        '$lt': this_monday - tz_shift
                    }
                }]
            }, {'_id': 0}))
    if not views:
        # An empty id list would be rendered as ``in ()``, which is not
        # valid SQL, and there is nothing to report anyway.
        return

    df = pd.DataFrame(views)
    df['time2'] = df['time'] + tz_shift
    # Each user id only needs to appear once in the IN (...) list.
    uids = list(set(view['userId'] for view in views))

    users = conn.query(
        '''select u.id userId,u.username userName,o.name orgName
    from user u 
    left join user_organization_rel r on r.userId=u.id
    left join organization o on r.organizationId=o.id
    where (r.active='Y' or r.active is null)
    and u.id in %s''', uids)

    report = pd.merge(df, pd.DataFrame(users), on='userId', how='left')

    def extract_keyword(url):
        """Return the URL-decoded text following ``open/`` in *url*."""
        raw = url.split('open/')[-1].strip() if 'open/' in url else ''
        decoded = unquote(raw.encode())
        if isinstance(decoded, bytes):
            try:
                decoded = decoded.decode()
            except UnicodeError:
                # One malformed URL must not abort the whole weekly report.
                decoded = ''
        return decoded

    report['keyword'] = report['visitURL'].apply(extract_keyword)

    report['specialOrg'] = report['orgName'].apply(
        lambda name: ','.join(re.findall(u'烯牛|以太', name))
        if pd.notnull(name) else '')

    report = report[report.specialOrg != '烯牛']

    # Compiled once instead of once per cell.
    illegal_chars = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')

    def strip_illegal(value):
        """Remove ASCII control characters from string cells.

        Presumably needed so the xlsx writer does not reject the cell;
        non-string values are returned unchanged.
        """
        if value is None:
            return value
        try:
            return illegal_chars.sub(r'', value)
        except TypeError:
            return value

    for column in report.columns:
        report[column] = report[column].apply(strip_illegal)

    fileName = 'search_weekly_report.xlsx'
    # Write to the same location the attachment is read from, so the mail
    # is correct even when the working directory is not the script folder.
    path = os.path.join(sys.path[0], fileName)
    report.to_excel(
        path,
        index=False,
        columns=['visitURL', 'userName', 'orgName', 'ip', 'time2', 'keyword'])

    # The rankings exclude every row that matched a special organisation.
    report = report[report.specialOrg == '']

    top_orgs = report.orgName.value_counts().to_frame()[:10].to_html()
    content = '''<div>Dears,    <br /><br />

    附件是上周的用户搜索记录,搜索量前10的机构为:
    </div>
    ''' + top_orgs

    top_keywords = report.keyword.value_counts().to_frame()[:100].to_html()
    content2 = '''
    <div>
    <br />
    前100名的搜索词为(统计已过滤掉以太和烯牛成员的搜索数据):
    </div>
    
    ''' + top_keywords

    content = content + content2

    # send_mail_file(from_alias, reply_alias, reply_email, to, subject,
    #                content, file)
    recieveList = [
        'avery', 'arthur', 'marchy', 'weiguangxiao', 'jiaojunpeng',
        'charlotte', 'erin', 'jinglei', 'zhlong', 'bamy'
    ]

    # The subject shows the inclusive range Monday ~ Sunday.
    last_sunday = this_monday + datetime.timedelta(days=-1)
    email_helper.send_mail_file(
        "烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
        ';'.join([i + '@xiniudata.com'
                  for i in recieveList]), "机构版(pro)上周搜索周报(%s ~ %s)" %
        (last_monday.strftime('%Y-%m-%d'), last_sunday.strftime('%Y-%m-%d')),
        content, path)


def run_week():
    """Send the weekly pro-edition search report.

    Opens the MongoDB and MySQL connections, delegates the work to
    ``_send_pro_search_report`` and closes both connections afterwards,
    including when the report step raises.
    """
    mongo = db.connect_mongo()
    conn = db.connect_torndb()
    try:
        _send_pro_search_report(mongo, conn)
    finally:
        mongo.close()
        conn.close()
Ejemplo n.º 11
0
def _send_personal_search_report(mongo, conn):
    """Collect, export and e-mail last week's personal-edition search log.

    Reads the front-end ``/search`` requests of ``mongo.log.user_log`` for
    the previous Monday-to-Sunday week, joins them with user and
    organisation names from MySQL, writes the rows to
    ``personal_search_weekly_report.xlsx`` and mails that workbook together
    with the total search count, the current hot-search words and the
    top-100 keywords.

    Args:
        mongo: connection returned by ``db.connect_mongo()``.
        conn: connection returned by ``db.connect_torndb()``.

    Returns:
        None. Returns early, without writing or mailing anything, when the
        week has no matching requests.
    """
    import pandas as pd

    # Report window: [Monday 00:00 of last week, Monday 00:00 of this week).
    # The clock is read once so both bounds derive from the same instant.
    now = datetime.datetime.today()
    this_monday = (datetime.datetime(now.year, now.month, now.day) -
                   datetime.timedelta(days=now.weekday()))
    last_monday = this_monday - datetime.timedelta(days=7)
    # NOTE(review): the 8-hour shifts look like a UTC+8 <-> UTC conversion
    # for the stored ``time`` field -- confirm against the log writer.
    tz_shift = datetime.timedelta(hours=8)

    requests = list(
        mongo.log.user_log.find(
            {'$and': [{'url_type': 'front'}, {'requestURL': {'$regex': '/search'}},
                      {'time': {'$gt': last_monday - tz_shift}},
                      {'time': {'$lt': this_monday - tz_shift}}]}, {'_id': 0}))
    if not requests:
        # An empty id list would be rendered as ``in ()``, which is not
        # valid SQL, and there is nothing to report anyway.
        return

    df = pd.DataFrame(requests)
    df['time2'] = df['time'] + tz_shift

    # Each user id only needs to appear once in the IN (...) list.
    uids = list(set(req.get('userId') for req in requests))

    users = conn.query('''select u.id userId,u.username userName,o.name orgName
    from user u 
    left join user_organization_rel r on r.userId=u.id
    left join organization o on r.organizationId=o.id
    where (r.active='Y' or r.active is null)
    and u.id in %s''', uids)

    report = pd.merge(df, pd.DataFrame(users), on='userId', how='left')

    def extract_keyword(url):
        """Return the URL-decoded search term carried by *url*.

        Handles the three URL shapes seen in the log: ``search/<term>``,
        ``&&name=<term>`` and ``/search?name=<term>``.  Undecodable terms
        become an empty string.
        """
        if 'search' in url:
            raw = url.split('search/')[-1].split('&&name=')[-1].split('/search?name=')[-1].strip()
        else:
            raw = ''
        decoded = unquote(raw.encode())
        if isinstance(decoded, bytes):
            try:
                decoded = decoded.decode()
            except UnicodeError:
                decoded = ''
        return decoded

    report['keyword'] = report['requestURL'].apply(extract_keyword)

    report['specialOrg'] = report['orgName'].apply(
        lambda name: ','.join(re.findall(u'烯牛|以太', name))
        if pd.notnull(name) else '')

    # Compiled once instead of once per cell.
    illegal_chars = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')

    def strip_illegal(value):
        """Remove ASCII control characters from string cells.

        Presumably needed so the xlsx writer does not reject the cell;
        non-string values are returned unchanged.
        """
        if value is None:
            return value
        try:
            return illegal_chars.sub(r'', value)
        except TypeError:
            return value

    for column in report.columns:
        report[column] = report[column].apply(strip_illegal)

    fileName = 'personal_search_weekly_report.xlsx'
    # Write to the same location the attachment is read from, so the mail
    # is correct even when the working directory is not the script folder.
    path = os.path.join(sys.path[0], fileName)
    report.to_excel(path, index=False,
                    columns=['requestURL', 'userName', 'orgName', 'ip', 'time2', 'keyword'])

    hot_words = conn.query('''select * from hot_search limit 10''')
    hsString = ','.join([row['name'] for row in hot_words])
    # An empty hot_search table must not break the report.
    updateTime = hot_words[0]['modifyTime'] if hot_words else ''

    top_keywords = report.keyword.value_counts().to_frame()[:100].to_html()
    content = '''
    <div>
    <div>Dears,    <br /><br />
    附件是上周个人版的用户搜索记录:
    <br /><br />
    1、上周个人版用户总计搜索了 <b>%s</b> 次
    <br /><br />
    2、这一周的热门搜索词是:<b>%s</b>;更新时间:%s
    <br /><br />
    3、前100名的搜索词为:
    </div>

    ''' % (report['code'].count(), hsString, updateTime) + top_keywords

    # send_mail_file(from_alias, reply_alias, reply_email, to, subject,
    #                content, file)
    recieveList = ['avery', 'arthur', 'marchy', 'weiguangxiao', 'jiaojunpeng', 'charlotte', 'erin', 'jinglei', 'zhlong',
                   'bamy']

    # The subject shows the inclusive range Monday ~ Sunday.
    last_sunday = this_monday + datetime.timedelta(days=-1)
    email_helper.send_mail_file("烯牛数据数据开发组", "烯牛数据数据开发组", "*****@*****.**",
                                ';'.join([i + '@xiniudata.com' for i in recieveList]),
                                "个人版上周搜索周报(%s ~ %s)" % (last_monday.strftime('%Y-%m-%d'),
                                                        last_sunday.strftime('%Y-%m-%d')
                                                        ),
                                content, path)


def run_week():
    """Send the weekly personal-edition search report.

    Opens the MongoDB and MySQL connections, delegates the work to
    ``_send_personal_search_report`` and closes both connections
    afterwards, including when the report step raises.
    """
    mongo = db.connect_mongo()
    conn = db.connect_torndb()
    try:
        _send_personal_search_report(mongo, conn)
    finally:
        mongo.close()
        conn.close()
Ejemplo n.º 12
0
def _send_funding_report(conn, mongo, org_id, start_time, end_time, test):
    """Build the weekly funding workbook and mail it to an organisation.

    Args:
        conn: connection returned by ``db.connect_torndb()``.
        mongo: connection returned by ``db.connect_mongo()``.
        org_id: value matched against ``org_track_user.orgId``.
        start_time: inclusive lower bound on ``publishDateMerge``.
        end_time: exclusive upper bound on ``publishDateMerge``.
        test: when exactly ``True`` only the whitelisted address is mailed.
    """
    from openpyxl import load_workbook
    import pandas as pd

    df, _ = data_code.run(conn, mongo, start_time.strftime("%Y-%m-%d"),
                          end_time.strftime("%Y-%m-%d"))
    df = df[(df.publishDateMerge >= start_time)
            & (df.publishDateMerge < end_time)]

    # (source column, header shown in the workbook), in output order.
    column_titles = [
        ('publishDateMerge', u'首次披露时间'),
        ('companyName', u'项目名称'),
        ('sector', u'领域'),
        ('location', u'是否国内'),
        ('brief', u'一句话简介'),
        ('description', u'完整简介'),
        ('investmentDetail', u'融资详情'),
    ]
    df = df.rename(columns=dict(column_titles))

    title = "烯牛数据融资事件表(%s ~ %s)" % (start_time.strftime("%m-%d"),
                                    end_time.strftime("%m-%d"))
    fileName = "funding (%s ~ %s).xlsx" % (start_time.strftime("%m-%d"),
                                           end_time.strftime("%m-%d"))
    # Write to the same location the attachment is read from, so the mail
    # is correct even when the working directory is not the script folder.
    path = os.path.join(sys.path[0], fileName)

    # ``writer.book`` is replaced with an openpyxl workbook below, so the
    # openpyxl engine has to be selected explicitly (the keyword used to be
    # misspelt ``engin`` and never reached pandas as the engine choice).
    writer = pd.ExcelWriter(path, engine='openpyxl')
    book = load_workbook('template/template.xlsx')

    # Fill the covered date range into the template's active sheet.
    ws = book.active
    ws['b9'] = u'数据包含了%s至%s一周的国内外融资事件。' % (start_time.strftime("%Y年%m月%d日"),
                                           end_time.strftime("%Y年%m月%d日"))
    writer.book = book
    df.to_excel(excel_writer=writer,
                sheet_name=u"数据明细",
                index=False,
                columns=[header for _, header in column_titles])
    writer.save()
    writer.close()

    content = '''<!DOCTYPE html><html xmlns="http://www.w3.org/1999/xhtml" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office"><head>  <title></title>  <!--[if !mso]><!-- -->  <meta http-equiv="X-UA-Compatible" content="IE=edge">  <!--<![endif]--><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><meta name="viewport" content="width=device-width, initial-scale=1.0"><style type="text/css">  #outlook a { padding: 0; }  .ReadMsgBody { width: 100%; }  .ExternalClass { width: 100%; }  .ExternalClass * { line-height:100%; }  body { margin: 0; padding: 0; -webkit-text-size-adjust: 100%; -ms-text-size-adjust: 100%; }  table, td { border-collapse:collapse; mso-table-lspace: 0pt; mso-table-rspace: 0pt; }  img { border: 0; height: auto; line-height: 100%; outline: none; text-decoration: none; -ms-interpolation-mode: bicubic; }  p { display: block; margin: 13px 0; }</style><!--[if !mso]><!--><style type="text/css">  @media only screen and (max-width:480px) {    @-ms-viewport { width:320px; }    @viewport { width:320px; }  }</style><!--<![endif]--><!--[if mso]><xml>  <o:OfficeDocumentSettings>    <o:AllowPNG/>    <o:PixelsPerInch>96</o:PixelsPerInch>  </o:OfficeDocumentSettings></xml><![endif]--><!--[if lte mso 11]><style type="text/css">  .outlook-group-fix {    width:100% !important;  }</style><![endif]--><!--[if !mso]><!-->    <link href="https://fonts.googleapis.com/css?family=Ubuntu:300,400,500,700" rel="stylesheet" type="text/css">    <style type="text/css">        @import url(https://fonts.googleapis.com/css?family=Ubuntu:300,400,500,700);    </style>  <!--<![endif]--><style type="text/css">  @media only screen and (min-width:480px) {    .mj-column-per-100 { width:100%!important; }  }</style></head><body style="background: #FFFFFF;">    <div class="mj-container" style="background-color:#FFFFFF;"><!--[if mso | IE]>      <table role="presentation" border="0" cellpadding="0" cellspacing="0" width="600" align="center" 
style="width:600px;">        <tr>          <td style="line-height:0px;font-size:0px;mso-line-height-rule:exactly;">      <![endif]--><div style="margin:0px auto;max-width:600px;background:&#x4EE5;&#x4E0A;&#x4E3A;&#x672C;&#x6B21;&#x8FFD;&#x8E2A;&#x5185;&#x5BB9;&#x3002;  &#x5982;&#x6709;&#x7591;&#x95EE;&#xFF0C;&#x6B22;&#x8FCE;&#x8054;&#x7CFB;&#x6211;&#x4EEC;&#xFF1A;&#xFF09;  &#x70EF;&#x725B;&#x6570;&#x636E;&#x56E2;&#x961F;  www.xiniudata.com;"><table role="presentation" cellpadding="0" cellspacing="0" style="font-size:0px;width:100%;background:&#x4EE5;&#x4E0A;&#x4E3A;&#x672C;&#x6B21;&#x8FFD;&#x8E2A;&#x5185;&#x5BB9;&#x3002;  &#x5982;&#x6709;&#x7591;&#x95EE;&#xFF0C;&#x6B22;&#x8FCE;&#x8054;&#x7CFB;&#x6211;&#x4EEC;&#xFF1A;&#xFF09;  &#x70EF;&#x725B;&#x6570;&#x636E;&#x56E2;&#x961F;  www.xiniudata.com;" align="center" border="0"><tbody><tr><td style="text-align:center;vertical-align:top;direction:ltr;font-size:0px;padding:9px 0px 9px 0px;"><!--[if mso | IE]>      <table role="presentation" border="0" cellpadding="0" cellspacing="0">        <tr>          <td style="vertical-align:top;width:600px;">      <![endif]--><div class="mj-column-per-100 outlook-group-fix" style="vertical-align:top;display:inline-block;direction:ltr;font-size:13px;text-align:left;width:100%;"><table role="presentation" cellpadding="0" cellspacing="0" width="100%" border="0"><tbody><tr><td style="word-wrap:break-word;font-size:0px;padding:0px 20px 0px 20px;" align="center"><div style="cursor:auto;color:#000000;font-family:Ubuntu, Helvetica, Arial, sans-serif;font-size:11px;line-height:22px;text-align:center;"><p>Hi&#xA0;&#xFF0C;</p><p>&#x9644;&#x4EF6;&#x662F;&#x672C;&#x5468;&#x62AB;&#x9732;&#x7684;&#x56FD;&#x5185;&#x5916;&#x6295;&#x878D;&#x8D44;&#x4E8B;&#x4EF6;&#x5217;&#x8868;&#xFF0C;&#x8BF7;&#x67E5;&#x6536;&#xFF01;</p><p></p><p></p><p>&#x5982;&#x6709;&#x7591;&#x95EE;&#xFF0C;&#x6B22;&#x8FCE;&#x8054;&#x7CFB;&#x6211;&#x4EEC;&#xFF1A;&#xFF09;</p><p>&#x70EF;&#x725B;&#x6570;&#x636E;&#x56E2;&#x961F;</p><p><a 
href="http://sctrack.sc.gg/track/click/eyJtYWlsbGlzdF9pZCI6IDAsICJ0YXNrX2lkIjogIiIsICJlbWFpbF9pZCI6ICIxNTMwMTgzNjU0NDk1XzYwMTE0XzgxNjRfNDczOS5zYy0xMF85XzRfNDAtaW5ib3VuZDAkYXJ0aHVyQHhpbml1ZGF0YS5jb20iLCAic2lnbiI6ICJkNWQ5MjZhM2I3YWM3M2E2NDQwMTMwYzRlZjUzYTg1NiIsICJ1c2VyX2hlYWRlcnMiOiB7fSwgImxhYmVsIjogMCwgImxpbmsiOiAiaHR0cCUzQS8vd3d3Lnhpbml1ZGF0YS5jb20iLCAidXNlcl9pZCI6IDYwMTE0LCAiY2F0ZWdvcnlfaWQiOiAxMTI1OTh9.html" target="_blank">www.xiniudata.com</a></p><p><img src="http://www.xiniudata.com/resources/image/icon-system/verify/ios-normal.jpeg"></p><p></p></div></td></tr></tbody></table></div><!--[if mso | IE]>      </td></tr></table>      <![endif]--></td></tr></tbody></table></div><!--[if mso | IE]>      </td></tr></table>      <![endif]--></div></body></html>
    '''

    users = conn.query(
        "select * from org_track_user "
        "where active='Y' and orgId=%s", org_id)

    for user in users:
        email = user["email"]
        if email is None or email.strip() == "":
            continue
        # In test mode only the whitelisted address receives the mail.
        if test is True and email not in ["*****@*****.**"]:
            continue
        logger.info("%s", email)
        email_helper.send_mail_file("烯牛数据", "烯牛数据", "*****@*****.**",
                                    email, title, content, path)


def process_one(org_id, thedate=None, test=True):
    """E-mail the weekly funding-event workbook to one organisation's users.

    The report covers Saturday 21:00 of the previous week up to (but not
    including) Saturday 21:00 of the week that contains the reference day.

    Args:
        org_id: organisation whose active ``org_track_user`` rows are mailed.
        thedate: reference day; defaults to the current local time.  The
            week is derived from this value's own weekday, so a historical
            date yields that date's week rather than one shifted by the
            weekday of the day the job happens to run.
        test: when exactly ``True`` only the whitelisted address is mailed.

    Both database connections are closed even when a step raises.
    """
    today = datetime.datetime.now() if thedate is None else thedate
    weekday = today.weekday()

    # Saturday of the previous week, 21:00.
    start_day = today - datetime.timedelta(days=weekday + 2)
    start_time = datetime.datetime(start_day.year, start_day.month,
                                   start_day.day, 21)
    # Saturday of the reference week, 21:00.
    end_day = today - datetime.timedelta(days=weekday - 5)
    end_time = datetime.datetime(end_day.year, end_day.month, end_day.day, 21)

    conn = db.connect_torndb()
    mongo = db.connect_mongo()
    try:
        _send_funding_report(conn, mongo, org_id, start_time, end_time, test)
    finally:
        conn.close()
        mongo.close()