def process():
    """Parse 36kr company items: save member relations, mark each item
    processed, and flip the company's processStatus to 1."""
    logger.info("36kr_company_parser begin...")

    offset = 0
    while True:
        batch = parser_db_util.find_process_limit(SOURCE, TYPE, offset, 1000)
        for record in batch:
            company = parser_db_util.get_company(SOURCE, record["key"])
            logger.info("sourcecid: %s", company)
            parseMember_save(company["id"], record, download_crawler)

            parser_db_util.update_processed(record["_id"])
            logger.info("processed %s", record["url"])

            # Mark the company row itself as processed in the relational DB.
            conn = db.connect_torndb()
            conn.update(
                "update source_company set processStatus=1 where id=%s",
                company["id"])
            conn.close()

        if not batch:
            break

    logger.info("36kr_company_parser end.")
Beispiel #2
0
def process():
    """Parse 36kr investor items: save the investor, its contact
    addresses (when present) and member relations, then mark processed."""
    logger.info("36kr_investor_parser begin...")

    offset = 0
    while True:
        batch = parser_db_util.find_process_limit(SOURCE, TYPE, offset, 1000)
        for record in batch:
            parsed = parse_investor(record)
            logger.info(json.dumps(parsed, ensure_ascii=False, cls=util.CJsonEncoder))

            investor_id = parser_db_util.save_investor_standard_new(parsed, download_crawler)

            # Persist contact addresses only when the parser found any.
            if parsed["addresses"]:
                parser_db_util.save_investor_contact_standard(investor_id, parsed["addresses"])

            parseMember_save(investor_id, record, download_crawler)

            parser_db_util.update_processed(record["_id"])
            logger.info("processed %s", record["url"])

        if not batch:
            break

    logger.info("36kr_investor_parser end.")
Beispiel #3
0
def process():
    """Run the itjuzi news parser over all queued items; an item is marked
    processed only when parser() reports success."""
    logger.info("itjuzi_news_parser begin...")
    for item in parser_db_util.find_process(SOURCE, TYPE):
        logger.info(item["key_int"])
        logger.info(item["url"])
        if parser(item):
            parser_db_util.update_processed(item["_id"])
    logger.info("itjuzi_news_parser end.")
Beispiel #4
0
def process():
    """Parse Chuangyepu company items: save the company, its names,
    artifacts and fundings, then mark each item processed.

    NOTE(review): this function contains two bare ``exit()`` calls and an
    unconditional ``break`` after the first batch — they look like debug
    leftovers that stop the whole run on the first problem item; confirm
    before relying on this in production.
    """
    logger.info("Chuangyepu_company_parser begin...")

    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        #items = [parser_db_util.find_process_one(SOURCE, TYPE, 1)]

        for item in items:
            #if item['key_int'] != 1:
            #    continue
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))

            # Unusable page: deactivate the source key and mark processed.
            if r["status"] == "No_Data" or r["status"] == "No_Name":
                parser_db_util.update_active(SOURCE, item["key"], 'N')
                parser_db_util.update_processed(item["_id"])
                logger.info("No infos for %s", item["url"])
                # NOTE(review): exit() terminates the whole process here,
                # making the `continue` below unreachable — likely a debug
                # stop; confirm.
                exit()
                continue

            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            # Rebuild the company's name records from scratch.
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["name"], 12020)

            logger.info("source_company_id=%s", source_company_id)

            artifacts = []
            artifacts.extend(r["artifacts"])
            logger.info(
                json.dumps(artifacts,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id,
                                                   artifacts)

            # Replace any previously stored funding rounds with fresh ones.
            parser_db_util.delete_funding(source_company_id)
            flag = parseFinance_save(source_company_id, r['fundings'],
                                     download_crawler)
            if flag:
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                logger.info("lack something:  %s", item["url"])
                # NOTE(review): hard stop on the first incomplete funding
                # parse — confirm this is intentional.
                exit()

        # NOTE(review): unconditional break — only the first batch of 1000
        # is ever handled per invocation; confirm.
        break

    logger.info("Chuangyepu_company_parser end.")
def process():
    """Parse itjuzi investor-firm items and store each successfully
    parsed firm; items whose parse fails are skipped unmarked."""
    logger.info("itjuzi_investorfirm_parser begin...")
    for item in parser_db_util.find_process(SOURCE, TYPE):
        logger.info(item["key"])
        logger.info(item["url"])
        result = parser(item)
        if result is None:
            continue
        parser_db_util.save_investfirm(result, SOURCE, download_crawler)
        parser_db_util.update_processed(item["_id"])
    logger.info("itjuzi_investorfirm_parser end.")
Beispiel #6
0
def process():
    """Parse itjuzi company items in batches of 1000.

    For each item: save the company, rebuild its name records, store
    artifacts and footprints, save member relations, mark processed.
    """
    logger.info("itjuzi_company_parser begin...")

    offset = 0
    while True:
        batch = parser_db_util.find_process_limit(SOURCE, TYPE, offset, 1000)
        for item in batch:
            logger.info(item["url"])

            base = parse_base(item)
            if base is None:
                continue
            company_id = parser_db_util.save_company(base, SOURCE, download_crawler)

            # Rebuild the name records from scratch for this company.
            parser_db_util.delete_source_company_name(company_id)
            parser_db_util.delete_source_mainbeianhao(company_id)
            parser_db_util.save_source_company_name(company_id, base["shortName"], 12020)
            parser_db_util.save_source_company_name(company_id, base["productName"], 12020)
            if base["fullName"] is not None:
                parser_db_util.save_source_company_name(company_id, base["fullName"], 12010)
                main_name = name_helper.get_main_company_name(base["fullName"])
                if main_name != base["fullName"]:
                    parser_db_util.save_source_company_name(company_id, main_name, 12010)

            logger.info("source_company_id=%s", company_id)

            artifacts = parse_artifact(item)
            # Kept for parity with sibling parsers; never read afterwards.
            flag = len(artifacts) > 0
            artifacts.extend(base["artifacts"])
            logger.info(artifacts)
            parser_db_util.save_artifacts(company_id, artifacts)

            parser_db_util.save_footprints(company_id, parse_footprint(item))

            parseMember_save(company_id, item, download_crawler)

            parser_db_util.update_processed(item["_id"])

        offset += 1000
        if not batch:
            break

    logger.info("itjuzi_company_parser end.")
Beispiel #7
0
def process():
    """Parse Chuangyepu investor-firm items in batches of 1000; every
    item is marked processed whether or not it yields an investor."""
    logger.info("Chuangyepu_investfirm_parser begin...")

    offset = 0
    while True:
        batch = parser_db_util.find_process_limit(SOURCE, TYPE, offset, 1000)
        for item in batch:
            investor = parse_investor(item)
            if investor is not None:
                parser_db_util.save_investor_standard(investor, download_crawler)
            parser_db_util.update_processed(item["_id"])

        if not batch:
            break

    logger.info("Chuangyepu_investfirm_parser end.")
Beispiel #8
0
def process():
    """Parse Demo8 'next' items: save each company with its score and
    artifacts, then mark the item processed."""
    logger.info("Demo8_next_parser begin...")

    for item in parser_db_util.find_process(SOURCE, TYPE):
        logger.info(item["url"])
        parsed = parse_base(item)
        if parsed is None:
            continue
        company_id = parser_db_util.save_company(parsed, SOURCE)
        logger.info("source_company_id=%s", company_id)

        parser_db_util.save_company_score(company_id, parsed["score"])
        parser_db_util.save_artifacts(company_id, parsed["artifacts"])

        parser_db_util.update_processed(item["_id"])

    logger.info("Demo8_next_parser end.")
Beispiel #9
0
def process():
    """Parse itjuzi 'next' items: save each company with its score and
    the subset of artifacts that map to a known app market."""
    logger.info("itjuzi_next_parser begin...")

    for item in parser_db_util.find_process(SOURCE, TYPE):
        logger.info(item["url"])

        parsed = parse_base(item)
        if parsed is None:
            continue
        company_id = parser_db_util.save_company(
            parsed, SOURCE, download_crawler)
        logger.info("source_company_id=%s", company_id)

        parser_db_util.save_company_score(company_id, parsed["score"])

        # Keep only artifacts whose link resolves to a known market type;
        # app-store entries (4040/4050) additionally require an app id.
        kept = []
        for artifact in parsed["artifacts"]:
            market_type, app_market, app_id = url_helper.get_market(artifact["link"])
            if market_type is None:
                continue
            if market_type in (4040, 4050) and app_id is None:
                continue
            artifact["type"] = market_type
            artifact["domain"] = app_id
            kept.append(artifact)

        parser_db_util.save_artifacts(company_id, kept)

        parser_db_util.update_processed(item["_id"])

    logger.info("itjuzi_next_parser end.")
Beispiel #10
0
def process():
    """Parse 36kr 'next' items: save each company with its score and
    artifacts, marking the item processed on success.

    A failure while saving one item is logged and skipped so a single bad
    record does not abort the whole batch.
    """
    logger.info("36kr_next_parser begin...")

    items = parser_db_util.find_process(SOURCE, TYPE)

    for item in items:
        logger.info(item["url"])

        r = parse_base(item)
        if r is None:
            continue
        try:
            source_company_id = parser_db_util.save_company(r, SOURCE, download_crawler)
            logger.info("source_company_id=%s", source_company_id)

            parser_db_util.save_company_score(source_company_id, r["score"])
            parser_db_util.save_artifacts(source_company_id, r["artifacts"])

            parser_db_util.update_processed(item["_id"])
        # Fix: `except Exception as ex` replaces the Python-2-only
        # `except Exception, ex` syntax; behavior is identical and this
        # form parses on both Python 2.6+ and Python 3.
        except Exception as ex:
            logger.info(ex)
            continue
Beispiel #11
0
def process():
    """Parse itjuzi funding items (pass 2); items are marked processed
    when the funding saves, or when the parser reports nothing to save."""
    logger.info("itjuzi_funding_parser2 begin...")

    for item in parser_db_util.find_process(SOURCE, TYPE):
        logger.info(item["url"])

        funding = parse(item)
        if funding is None:
            continue
        if funding == -1:
            # Parser says there is nothing to save; just mark it done.
            parser_db_util.update_processed(item["_id"])
            continue

        saved, source_funding_id = parser_db_util.save_funding(funding, 13030)
        if saved:
            parser_db_util.update_processed(item["_id"])

    logger.info("itjuzi_funding_parser2 end.")
    logger.info(nokeys)
Beispiel #12
0
def process():
    """Parse crawled company-registration payloads into gongshang records.

    Pages through unprocessed crawl items 1000 at a time.  For each item
    with usable content, builds a `gongshang` dict from the baseInfo
    section plus investor / staff / change-record / outbound-investment
    lists, saves it to the current and history collections, and marks the
    item processed.
    """
    skip = 0
    limit = 1000

    num = 0  # running count of inspected items; used only for logging
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, skip, 1000)
        #items = [parser_db_util.find_process_one(SOURCE, TYPE, 2310299181)]
        #items = [parser_db_util.find_process_one(SOURCE, TYPE, 1257527760)]

        skip += limit
        finish = True  # NOTE(review): never read afterwards; looks like a leftover flag
        for c in items:
            #finish = False
            num += 1
            # Crawler flagged this key as non-existent upstream.
            # NOTE(review): there is no `continue` here, so such items still
            # fall through to the content checks below — confirm intended.
            if c.has_key("exist") and c["exist"] is False:
                logger.info(c["key"])
                parser_db_util.update_processed(c["_id"])

            # No crawled payload at all: mark done and move on.
            if c["content"] is None:
                logger.info(c["key"])
                parser_db_util.update_processed(c["_id"])
                continue
            if c["content"]["data"] is None:
                logger.info(c["key"])
                parser_db_util.update_processed(c["_id"])
                continue

            base = c["content"]["data"]["baseInfo"]
            # Without a registration status the record is treated as unusable.
            if base.get("regStatus") is None:
                logger.info(c["key"])
                parser_db_util.update_processed(c["_id"])
                continue

            logger.info("%s: %s" % (num, c["key"]))

            # Core registration fields.  from1970todate presumably converts
            # epoch-based timestamps to dates — TODO confirm its contract.
            gongshang = {
                "name": base["name"],
                "regCapital": base.get("regCapital"),
                "industry": base.get("industry"),
                "regInstitute": base.get("regInstitute"),
                "establishTime": from1970todate(base.get("estiblishTime")),
                "base": base.get("base"),
                "regNumber": base.get("regNumber"),
                "regStatus": base.get("regStatus"),
                "fromTime": from1970todate(base.get("fromTime")),
                "toTime": from1970todate(base.get("toTime")),
                "businessScope": base.get("businessScope"),
                "regLocation": base.get("regLocation"),
                "companyOrgType": base.get("companyOrgType"),
                "legalPersonId": base.get("legalPersonId"),
                "legalPersonName": base.get("legalPersonName")
            }
            # Shareholders: keep name plus a mapped investor type
            # (empty string when the raw type is unknown to the map).
            investors = []
            if c["content"]["data"].has_key("investorList"):
                investorlist = c["content"]["data"]["investorList"]
                #logger.info(len(investorlist))
                for i in investorlist:
                    investor_info = {}
                    investor_info["type"] = investor_type_map.get(
                        i.get("type"), "")
                    investor_info["name"] = i.get("name")

                    investors.append(investor_info)

            # Key staff: name plus a comma-joined, de-duplicated position list.
            members = []
            if c["content"]["data"].has_key("staffList"):
                memberlist = c["content"]["data"]["staffList"]
                for m in memberlist:
                    member_info = {}
                    member_info["name"] = m.get("name")
                    member_info["position"] = ",".join(
                        list(set(m.get("typeJoin"))))

                    members.append(member_info)

            # Registration change history, taken verbatim when present.
            changinfo = []
            if c["content"]["data"].has_key("comChanInfoList"):
                changinfo = c["content"]["data"]["comChanInfoList"]

            # Outbound investments: only entries that carry a name.
            invests = []
            if c["content"]["data"].has_key("investList"):
                investlist = c["content"]["data"]["investList"]
                for v in investlist:
                    if not v.has_key("name"):
                        continue
                    data = {"name": v["name"]}
                    invests.append(data)

            gongshang["members"] = members
            gongshang["investors"] = investors
            gongshang["changeInfo"] = changinfo
            gongshang["invests"] = invests

            logger.info(
                json.dumps(gongshang,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))

            # Persist to both the current and the history collections.
            save_collection_goshang(collection_goshang, gongshang)
            save_collection_goshang_his(collection_goshang_history, gongshang)
            parser_db_util.update_processed(c["_id"])
            logger.info("processed %s", c["key"])

        if len(items) == 0:
            break
Beispiel #13
0
def process():
    """Parse crawled IC (industry & commerce) registration pages into
    gongshang records.

    For each unprocessed crawl item: merge the IC_info base fields with any
    previously stored record (never letting a None overwrite stored data),
    turn partner capital figures into per-partner ownership rates, collect
    managers, change records, outbound investments and contact info, then
    save the merged record to both collections and mark the item processed.
    """
    skip = 0
    limit = 1000

    num = 0  # running count of inspected items; used only for logging
    while True:
        # NOTE(review): always fetches from offset 0 (not `skip`) — this
        # relies on update_processed() shrinking the unprocessed set each
        # pass; confirm against find_process_limit's semantics.
        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, limit)
        # items = [parser_db_util.find_process_one_key(SOURCE, TYPE, u'哎哎信息科技(上海)有限公司')]
        # items = [parser_db_util.find_process_one_key(SOURCE, TYPE, u'行吟信息科技(上海)有限公司')]
        # items = [parser_db_util.find_process_one_key(SOURCE, TYPE, u'深圳市加推科技有限公司')]

        # skip += limit
        for c in items:
            num += 1

            logger.info("%s: %s" % (num, c["key"]))
            gongshang = {'name': c["key"]}

            # --- base registration info ---------------------------------
            if c['content'].has_key('IC_info') and len(
                    c['content']['IC_info']) > 0:
                base = c['content']['IC_info']

                # Parse a 'YYYY-MM-DD' field from `base`; returns None for
                # missing, placeholder ('-') or malformed values.
                def get_date(key):
                    if base.get(key) is not None and base.get(key) != "-":
                        try:
                            result = datetime.datetime.strptime(
                                base.get(key), '%Y-%m-%d')
                        except:
                            result = None
                    else:
                        result = None

                    return result

                toTime = get_date('term_end')
                fromTime = get_date('term_start')

                # First listed address, or None when absent/malformed.
                try:
                    address = base.get("addresses")[0]['address']
                except:
                    address = None

                baseInfo = {
                    "name": base["name"],
                    "regCapital": base.get("regist_capital"),
                    # "industry": base.get("industry"),
                    # "regInstitute": base.get("regInstitute"),
                    "establishTime": get_date('start_date'),
                    # "base": base.get("base"),
                    "regNumber": base.get("reg_no"),
                    "regStatus": base.get("status"),
                    "fromTime": fromTime,
                    "toTime": toTime,
                    "businessScope": base.get("scope"),
                    "regLocation": address,
                    "companyOrgType": base.get("kind"),
                    # "legalPersonId": base.get("legalPersonId"),
                    "legalPersonName": base.get("legal_person")
                }
                # gongshang.update(baseInfo)
                # Merge with any previously stored record: a None value must
                # not clobber a field we already hold for this company.
                record = collection_goshang.find_one({"name": c["key"]})
                for key in baseInfo:
                    if record is None:
                        gongshang[key] = baseInfo[key]
                    else:
                        if baseInfo[key] is None and record.has_key(key):
                            logger.info("%s is None, don't update" % key)
                        else:
                            gongshang[key] = baseInfo[key]
            else:
                # No fresh registration info and nothing stored before:
                # nothing useful to do for this company.
                record = collection_goshang.find_one({"name": c["key"]})
                if record is None:
                    logger.info(
                        "No gongshang data before for this missing registinfo company: %s",
                        c["key"])
                    parser_db_util.update_processed(c["_id"])
                    continue

            # --- partners / shareholders --------------------------------
            if c['content'].has_key('partners') and len(
                    c['content']['partners']) > 0:
                investors = []
                investorlist = c["content"]["partners"]
                # logger.info(len(investorlist))
                # First pass: total paid-in (real) and subscribed (should)
                # capital across all partners.  -666 is a "missing amount"
                # sentinel; a total of -999999999 poisons the sum so no
                # ownership rate is computed in the second pass.
                real_capitals_total, should_capitals_total = 0, 0
                for i in investorlist:
                    real_capitals = i.get('real_capitals')
                    should_capitals = i.get('should_capitals')
                    if real_capitals:
                        amount = real_capitals[0].get('amount', -666)
                        if amount != -666 and amount != '-' and amount.strip(
                        ) not in [u'万人民币', u'万美元']:
                            if amount.find(u'万') < 0: continue
                            amount = amount.split(u'万')[0]
                            real_capitals_total += float(amount)
                        else:
                            logger.info(
                                '%s has no capital amount, stop calculating rate.',
                                c['key'])
                            # NOTE(review): unlike the should-capital branch
                            # below, this one does not `break` — confirm the
                            # asymmetry is intentional.
                            real_capitals_total = -999999999

                    if should_capitals:
                        amount = should_capitals[0].get('amount', -666)
                        if amount != -666 and amount != '-' and amount.strip(
                        ) not in [u'万人民币', u'万美元']:
                            if amount.find(u'万') < 0: continue
                            amount = amount.split(u'万')[0]

                            should_capitals_total += float(amount)
                        else:
                            logger.info(
                                '%s has no capital amount, stop calculating rate.',
                                c['key'])
                            should_capitals_total = -999999999
                            break

                # Second pass: build investor records and attach each
                # partner's ownership rate (amount / total, whole percent;
                # '-' when it cannot be computed).
                for i in investorlist:
                    investor_info = {}
                    investor_info["type"] = i.get("kind")
                    # Normalize ASCII parens to full-width ones so names
                    # match the convention used elsewhere in the store.
                    investor_info["name"] = i.get("name").replace("(",
                                                                  "(").replace(
                                                                      ")", ")")
                    real_capitals = i.get("real_capitals")
                    for capital in real_capitals:
                        if capital.has_key('amount') and capital[
                                'amount'].strip() not in [u'万人民币', u'万美元']:
                            if capital['amount'].find(u'万') < 0: continue
                            amount = capital['amount'].split(u'万')[0]
                            if amount != '-':
                                amount = float(amount)
                                rate = '%s%%' % (int(
                                    round(amount / real_capitals_total, 2) *
                                    100)) if real_capitals_total > 0 else '-'
                            else:
                                rate = '-'
                            capital['rate'] = rate
                    investor_info["real_capitals"] = real_capitals

                    should_capitals = i.get("should_capitals")
                    for capital in should_capitals:
                        if capital.has_key('amount') and capital[
                                'amount'].strip() not in [u'万人民币', u'万美元']:
                            if capital['amount'].find(u'万') < 0: continue
                            amount = capital['amount'].split(u'万')[0]
                            if amount != '-':
                                amount = float(amount)
                                rate = '%s%%' % (int(
                                    round(amount / should_capitals_total, 2) *
                                    100)) if should_capitals_total > 0 else '-'
                            else:
                                rate = '-'
                            capital['rate'] = rate
                    investor_info["should_capitals"] = should_capitals

                    investors.append(investor_info)
                    # Queue corporate shareholders (type contains 企业/公司)
                    # for their own gongshang crawl.
                    if investor_info["name"] is not None and investor_info[
                            "name"] != '' and (
                                investor_info["type"].find('企业') >= 0
                                or investor_info["type"].find('公司') >= 0):
                        add_gongshang_name(investor_info["name"])

                gongshang["investors"] = investors

            # --- managers ------------------------------------------------
            members = []
            if c["content"].has_key("managers") and len(
                    c['content']['managers']) > 0:
                memberlist = c["content"]["managers"]
                for m in memberlist:
                    member_info = {}
                    member_info["name"] = m.get("name")
                    member_info["position"] = m.get("position")
                    # member_info["position"] = ",".join(list(set(m.get("POSITION"))))

                    members.append(member_info)
                gongshang["members"] = members

            # --- registration change records ----------------------------
            changinfo = []
            if c["content"].has_key("change_records") and len(
                    c['content']['change_records']) > 0:
                changinfoList = c["content"]["change_records"]
                for change in changinfoList:
                    change_info = {}
                    change_info["changeTime"] = change.get("date")
                    change_info["contentBefore"] = change.get("before")
                    change_info["contentAfter"] = change.get("after")
                    change_info["changeItem"] = change.get("item")

                    changinfo.append(change_info)
                gongshang["changeInfo"] = changinfo
            else:
                gongshang["changeInfo"] = []

            # --- outbound investments -----------------------------------
            invests_new = []
            if c["content"].has_key("invests") and len(
                    c['content']['invests']) > 0:
                investlist = c["content"]["invests"]
                for invest in investlist:
                    if invest.has_key("name") and invest["name"] is not None:
                        # Same full-width paren normalization as above.
                        invest['name'] = invest['name'].replace("(",
                                                                "(").replace(
                                                                    ")", ")")
                        invests_new.append(invest)
                        add_gongshang_name(invest["name"])

                gongshang["invests_new"] = invests_new

            if c["content"].has_key("contact") and len(
                    c['content']['contact']) > 0:
                gongshang['contact'] = c['content']['contact']

            # Only the seed {'name': ...} key present => nothing extracted.
            if len(gongshang) == 1:
                logger.info('no content:%s', c["key"])
            else:
                try:
                    logger.info(
                        json.dumps(gongshang,
                                   ensure_ascii=False,
                                   cls=util.CJsonEncoder))
                except:
                    pass

                save_collection_goshang(collection_goshang, gongshang)
                save_collection_goshang_his(collection_goshang_history,
                                            gongshang)
            parser_db_util.update_processed(c["_id"])
            logger.info("processed %s", c["key"])

        if len(items) == 0:
            break
Beispiel #14
0
def process():
    """Parse SZSE listed-company detail pages into stock-listing records.

    Pages through unprocessed crawl items, extracts the company name,
    listing date, websites and basic info from the page HTML, saves the
    result into `collection`, and marks each item processed.

    NOTE(review): `skip` is never advanced, so termination relies on
    update_processed() removing items from the query result — confirm.
    """
    skip = 0
    limit = 1000

    num = 0  # count of inspected items, used only for logging
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, skip, limit)

        for c in items:
            num += 1

            logger.info("%s: %s" % (num, c["key"]))

            # Fix: guard against missing content BEFORE handing it to the
            # HTML parser.  The original parsed first, which made this
            # None-check unreachable (html.fromstring would already raise).
            content = c['content']
            if content is None:
                logger.info('%s content is None', c["key"])
                parser_db_util.update_processed(c["_id"])
                continue

            d = pq(html.fromstring(content))
            name = d(':contains("公司名称")+ td').text()

            if name is None or name == '':
                logger.info('%s missing fullName', c["key"])
                parser_db_util.update_processed(c["_id"])
                continue

            stockwebsite = 'http://www.szse.cn/main/marketdata/hqcx/hqlb/index.shtml?code=%s' % c['key']
            listingDate = d(':contains("A股上市日期")+ td').text()
            # Empty string on the page means no listing date was published.
            listingDate = datetime.datetime.strptime(listingDate, '%Y-%m-%d') if len(listingDate) > 0 else ''
            website = d(':contains("公司网址")+ td').text()

            parserContent = {
                "source": SOURCE,
                "sourceId": int(c['key']),
                "stockwebsite": stockwebsite,
                "website": website,
                'listingDate': listingDate

            }

            # Scrape each labelled cell of the detail table.
            parserContent['baseinfo'] = {
                'shortname': d(':contains("A股简称")+ td').text(),
                'englishName': d(':contains("英文名称")+ td').text(),
                'regLocation': d(':contains("注册地址")+ td').text(),
                'totalStockEquity': d(':contains("A股总股本")+ td').text(),
                'region': d(':contains("地      区")+ td').text(),
                'province': d(':contains("省    份")+ td').text(),
                'city': d(':contains("城     市")+ td').text(),
                'industry': d(':contains("所属行业")+ td').text(),

            }
            # Executives are not extracted from this page type.
            parserContent['executives'] = []

            parserContent['name'] = name

            logger.info(json.dumps(parserContent, ensure_ascii=False, cls=util.CJsonEncoder))

            save_collection(collection, parserContent)
            parser_db_util.update_processed(c["_id"])
            logger.info("processed %s", c["key"])

        if len(items) == 0:
            logger.info("no more items")
            break
Beispiel #15
0
def process():
    """Parse raw gongshang (business-registry) XML snapshots.

    Drains unprocessed items in batches of ``limit``; for each company it
    extracts base registration info (A1), shareholders (B1), key personnel
    (B3), registration changes (A2) and outbound investments (B7) from the
    stored XML payloads, then upserts the merged document into the
    gongshang collections and marks the raw item processed.
    """
    skip = 0
    limit = 1000

    num = 0  # running count of handled companies, for logging only
    while True:
        # Always fetch from offset 0: processed items drop out of the
        # unprocessed set, so each fetch yields the next fresh batch.
        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, limit)
        # items = [parser_db_util.find_process_one_key(SOURCE, TYPE, u'哎哎信息科技(上海)有限公司')]
        # items = [parser_db_util.find_process_one_key(SOURCE, TYPE, u'北京真格天投股权投资中心(有限合伙)')]

        # skip += limit
        for c in items:
            num += 1

            logger.info("%s: %s" % (num, c["key"]))
            # c["key"] is the company name; it seeds the output document.
            gongshang = {'name': c["key"]}

            # --- A1 payload: registration base info -------------------
            if c['content'].has_key('A1'):
                base = pq(html.fromstring(c['content']['A1'].decode("utf-8")))
                item = base('BASIC ITEM')

                if len(item) > 0:
                    base = base(item)

                    # Return the tag's text, or None when empty/absent.
                    def getItem(key):
                        if base(key).text() != '':
                            return base(key).text()
                        else:
                            return None

                    baseInfo = {
                        "name": getItem('ENTNAME'),
                        "regCapital": getItem('REGCAP'),
                        "industry": getItem('INDUSTRYPHY'),
                        "regInstitute": getItem('REGORG'),
                        "establishTime": datetime.datetime.strptime(getItem('ESDATE'), '%Y-%m-%d') if getItem(
                            'ESDATE') is not None else None,
                        "base": getItem('REGORGPROVINCE'),
                        "regNumber": getItem('REGNO'),
                        "regStatus": getItem('ENTSTATUS'),
                        "fromTime": datetime.datetime.strptime(getItem('OPFROM'), '%Y-%m-%d') if getItem(
                            'OPFROM') is not None else None,
                        "toTime": datetime.datetime.strptime(getItem('OPTO'), '%Y-%m-%d') if getItem(
                            'OPTO') is not None else None,
                        "businessScope": getItem('CBUITEM'),
                        "regLocation": getItem('OPLOC'),
                        "companyOrgType": getItem('ENTTYPE'),
                        # "legalPersonId": getItem('REGORGPROVINCE'),
                        "legalPersonName": getItem('FRNAME')
                    }

                    # gongshang.update(baseInfo)
                    # Merge with any previously saved record: a None value
                    # must never overwrite an existing stored value.
                    record = collection_goshang.find_one({"name": c["key"]})
                    for key in baseInfo:
                        if record is None:
                            gongshang[key] = baseInfo[key]
                        else:
                            if baseInfo[key] is None and record.has_key(key):
                                logger.info("%s is None, don't update" % key)
                            else:
                                gongshang[key] = baseInfo[key]

            # --- B1 payload: shareholders -----------------------------
            if c['content'].has_key('B1'):
                investors = []
                htmlRaw = pq(html.fromstring(c['content']['B1'].decode("utf-8")))
                item = htmlRaw('SHAREHOLDER ITEM')

                if len(item) > 0:
                    for investor in item:
                        i = htmlRaw(investor)
                        investor_info = {}
                        # investor_info["type"] = i('')
                        investor_info["name"] = i("SHANAME").text()

                        investors.append(investor_info)

                    gongshang["investors"] = investors

            # --- B3 payload: key personnel ----------------------------
            members = []
            # NOTE(review): the `> 0` compares has_key()'s boolean result
            # to 0 — harmless, equivalent to a plain truth test.
            if c["content"].has_key("B3") > 0:
                htmlRaw = pq(html.fromstring(c['content']['B3'].decode("utf-8")))
                item = htmlRaw('PERSON ITEM')

                if len(item) > 0:
                    for member in item:
                        # Relies on late-binding closures: `m` is bound
                        # below, before getItem is first called.
                        def getItem(key):
                            if m(key).text() != '':
                                return m(key).text()
                            else:
                                return None

                        m = htmlRaw(member)
                        member_info = {}
                        member_info["name"] = getItem("PERNAME")
                        member_info["position"] = getItem("POSITION")
                        # member_info["position"] = ",".join(list(set(m.get("POSITION"))))

                        members.append(member_info)
                    gongshang["members"] = members

            # --- A2 payload: registration change records --------------
            changinfo = []
            if c["content"].has_key("A2") > 0:
                htmlRaw = pq(html.fromstring(c['content']['A2'].decode("utf-8")))
                item = htmlRaw('ALTER ITEM')

                if len(item) > 0:
                    for change in item:
                        # Late-binding closure again: by the time getItem
                        # runs, `change` has been rebound to the pq
                        # wrapper on the line below.
                        def getItem(key):
                            if change(key).text() != '':
                                return change(key).text()
                            else:
                                return None

                        change = htmlRaw(change)
                        change_info = {}
                        change_info["changeTime"] = getItem("ALTDATE")
                        change_info["contentBefore"] = getItem("ALTBE")
                        change_info["contentAfter"] = getItem("ALTAF")
                        change_info["changeItem"] = getItem("ALTITEM")

                        changinfo.append(change_info)
                    gongshang["changeInfo"] = changinfo

            # --- B7 payload: outbound investments ---------------------
            invests_new = []
            if c["content"].has_key("B7"):
                htmlRaw = pq(html.fromstring(c['content']['B7'].decode("utf-8")))
                item = htmlRaw('ENTINV ITEM')

                if len(item) > 0:
                    for i in item:
                        invest = htmlRaw(i)
                        if invest("ENTNAME") and invest("ENTNAME").text() != '':
                            invests_new.append(invest("ENTNAME").text())

                    gongshang["invests_new"] = invests_new

            parser_db_util.update_processed(c["_id"])
            # len == 1 means only the seeded 'name' key: nothing extracted.
            if len(gongshang) == 1:
                logger.info('no content:%s', c["key"])
            else:
                logger.info(json.dumps(gongshang, ensure_ascii=False, cls=util.CJsonEncoder))

                save_collection_goshang(collection_goshang, gongshang)
                save_collection_goshang_his(collection_goshang_history, gongshang)
            logger.info("processed %s", c["key"])

        if len(items) == 0:
            break
Beispiel #16
0
def process():
    """Parse crawled xtecher company pages: persist the standardized
    company, its names and artifacts, then mark each raw item processed."""
    logger.info("xtecher_company_parser begin...")

    offset = 0
    while True:
        batch = parser_db_util.find_process_limit(SOURCE, TYPE, offset, 1000)

        for record in batch:
            parsed = parse_company(record)
            # parse_company signals "nothing usable" by returning 0.
            if parsed == 0:
                parser_db_util.update_processed(record["_id"])
                logger.info("missing website and companyName, processed %s",
                            record["url"])
                continue

            logger.info(json.dumps(parsed, ensure_ascii=False,
                                   cls=util.CJsonEncoder))

            company_id = parser_db_util.save_company_standard(
                parsed, download_crawler)
            parser_db_util.delete_source_company_name(company_id)
            parser_db_util.delete_source_mainbeianhao(company_id)
            # 12020 = short name, 12010 = full/registered name.
            parser_db_util.save_source_company_name(company_id,
                                                    parsed["name"], 12020)
            if "fakeName" in parsed:
                parser_db_util.save_source_company_name(company_id,
                                                        parsed["fakeName"],
                                                        12020)
            else:
                parser_db_util.save_source_company_name(company_id,
                                                        parsed["fullName"],
                                                        12010)
                main_name = name_helper.get_main_company_name(
                    parsed["fullName"])
                if main_name != parsed["fullName"]:
                    parser_db_util.save_source_company_name(company_id,
                                                            main_name,
                                                            12010)
            logger.info("source_company_id=%s", company_id)

            artifact_list = parse_artifact(company_id, parsed)
            logger.info(json.dumps(artifact_list, ensure_ascii=False,
                                   cls=util.CJsonEncoder))

            parser_db_util.save_artifacts_standard(company_id, artifact_list)

            # Funding parsing is currently disabled; treat as success.
            ok = True
            if ok:
                parser_db_util.update_processed(record["_id"])
                logger.info("processed %s", record["url"])
            else:
                logger.info("lack something:  %s", record["url"])

        if not batch:
            break

    logger.info("xtecher_company_parser end.")
Beispiel #17
0
def process(sourceId=0):
    """Parse crawled evervc company pages and persist standardized data.

    Args:
        sourceId: when > 0, process only that one source item; otherwise
            drain all unprocessed items in batches of 1000.
    """
    logger.info("evervc_company_parser begin...")

    start = 0
    while True:
        if sourceId > 0:
            items = [parser_db_util.find_process_one(SOURCE, TYPE, sourceId)]
        else:
            items = parser_db_util.find_process_limit(SOURCE, TYPE, start,
                                                      1000)

        for item in items:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))

            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            # 12010 = full/registered name, 12020 = short name.
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["fullName"], 12010)
            # BUG FIX: test fullName for None/empty BEFORE calling len() on
            # it — the original order raised TypeError when fullName was
            # None because len(None) was evaluated first.
            if (r["fullName"] is None or r["fullName"] == ''
                    or len(r["name"]) < len(r["fullName"])):
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
            main_company_name = name_helper.get_main_company_name(
                r["fullName"])
            if main_company_name != r["fullName"]:
                parser_db_util.save_source_company_name(
                    source_company_id, main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)

            artifacts = parse_artifact(source_company_id, r)
            logger.info(
                json.dumps(artifacts,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))

            parser_db_util.save_artifacts_standard(source_company_id,
                                                   artifacts)

            parseMember_save(source_company_id, item, download_crawler)

            parser_db_util.delete_funding(source_company_id)  ##??
            flag = parseFinance_save(source_company_id, item, r['sourceId'],
                                     download_crawler)
            # Funding completeness is deliberately not enforced for now.
            flag = True

            if flag:
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                logger.info("lack something:  %s", item["url"])

        # Single-item mode runs exactly once; batch mode runs until drained.
        if len(items) == 0 or sourceId > 0:
            break

    logger.info("evervc_company_parser end.")
Beispiel #18
0
def process():
    """Parse crawled 36kr company pages in batches of 1000.

    A failure on a single item is logged with its traceback and the item
    is still marked processed, so one bad record cannot stall the loop.
    """
    logger.info("36kr_company_parser begin...")

    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        for item in items:
            try:
                r = parse_company(item)
                logger.info(
                    json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))

                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                # 12020 = short name, 12010 = full/registered name.
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
                if r["fullName"] is not None:
                    parser_db_util.save_source_company_name(
                        source_company_id, r["fullName"], 12010)
                    main_company_name = name_helper.get_main_company_name(
                        r["fullName"])
                    if main_company_name != r["fullName"]:
                        parser_db_util.save_source_company_name(
                            source_company_id, main_company_name, 12010)
                logger.info("source_company_id=%s", source_company_id)

                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder))

                # Deactivate empty shells: no full name, no description
                # and no artifacts.
                if (r["fullName"] is None or r["fullName"].strip() == "") and (r['description'] is None or r['description'].strip() == "") \
                    and len(artifacts) == 0:
                    parser_db_util.update_active(SOURCE, item["key"], 'N')
                    parser_db_util.update_processed(item["_id"])
                    logger.info("missing all stuff, processed %s", item["url"])
                    continue

                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)

                parseMember_save(source_company_id, item, download_crawler)
                #
                parser_db_util.delete_funding(source_company_id)
                flag = parseFinance_save(source_company_id, item,
                                         download_crawler)
                flag = True
            except Exception:
                # FIX: replaced Py2-only `except Exception, E` (a syntax
                # error on Python 3) and the silent message-only log with
                # a full traceback via logger.exception.
                logger.exception("error processing %s", item["_id"])
            # Marked processed even on failure, matching the original
            # best-effort behavior.
            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s", item["url"])
        if len(items) == 0:
            break
Beispiel #19
0
def process():
    """Parse crawled lagou company pages into standardized records,
    skipping companies whose name matches the blacklist."""
    logger.info("lagou_company_parser begin...")
    blacklist = get_blacklist()
    while True:

        batch = parser_db_util.find_process_limit(SOURCE, TYPE, 0, 1000)
        # items = [parser_db_util.find_process_one(SOURCE, TYPE, 109625)]

        for raw in batch:

            parsed = parse_company(raw)
            # Flag blacklisted names so the guard below skips them.
            if "name" in parsed and parsed["name"].strip() != "":
                if any(parsed["name"].find(banned) >= 0
                       for banned in blacklist):
                    logger.info("黑名单")
                    parsed["status"] = "No_Name"

            if parsed["status"] == "No_Name":
                parser_db_util.update_active(SOURCE, raw["key"], 'N')
                parser_db_util.update_processed(raw["_id"])
                logger.info("processed %s with no data", raw["url"])
                continue

            logger.info(json.dumps(parsed, ensure_ascii=False,
                                   cls=util.CJsonEncoder))
            company_id = parser_db_util.save_company_standard(
                parsed, download_crawler)
            logger.info("sourceCompanyId : %s", company_id)
            parser_db_util.delete_source_company_name(company_id)
            parser_db_util.delete_source_mainbeianhao(company_id)
            # 12020 = short name, only saved when it differs usefully from
            # the full name; 12010 = full/registered name.
            if len(parsed["name"]) < len(parsed["fullName"]):
                parser_db_util.save_source_company_name(company_id,
                                                        parsed["name"],
                                                        12020)
            parser_db_util.save_source_company_name(company_id,
                                                    parsed["fullName"],
                                                    12010)

            artifact_list = list(parsed["artifacts"])
            logger.info(json.dumps(artifact_list, ensure_ascii=False,
                                   cls=util.CJsonEncoder))
            # Artifacts provided by lagou carry no links, so only the ones
            # already attached to the parsed record are stored.
            parser_db_util.save_artifacts_standard(company_id, artifact_list)
            parseMember_save(company_id, raw)

            parserDevelop_save(company_id, raw)

            parser_db_util.update_processed(raw["_id"])

        if not batch:
            break

    logger.info("lagou_company_parser end.")
Beispiel #20
0
def process():
    """Parse gongshang (business-registry) JSON API snapshots.

    Drains unprocessed items in batches of ``limit``; for each company it
    extracts base registration info (getRegistInfo), shareholders
    (getShareHolderInfo), managers (getMainManagerInfo) and change records
    (getRegisterChangeInfo), then upserts the merged document into the
    gongshang collections and marks the raw item processed.
    """
    skip = 0
    limit = 1000

    num = 0  # running count of handled companies, for logging only
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, limit)
        # items = [parser_db_util.find_process_one(SOURCE, TYPE, 2310299181)]

        # skip += limit
        for c in items:
            num += 1

            logger.info("%s: %s" % (num, c["key"]))
            # c["key"] is the company name; it seeds the output document.
            gongshang = {'name': c["key"]}

            # --- getRegistInfo: registration base info ----------------
            if c['content'].has_key('getRegistInfo') and len(
                    c['content']['getRegistInfo']) > 0 and isinstance(
                        c['content']['getRegistInfo'], list):
                base = c['content']['getRegistInfo'][0]
                # OPTO/OPFROM may not be '%Y-%m-%d'; keep the raw string
                # when parsing fails.
                if base.get("OPTO") is not None:
                    try:
                        toTime = datetime.datetime.strptime(
                            base.get("OPTO"), '%Y-%m-%d')
                    except:
                        toTime = base.get("OPTO")
                else:
                    toTime = None

                if base.get("OPFROM") is not None:
                    try:
                        fromTime = datetime.datetime.strptime(
                            base.get("OPFROM"), '%Y-%m-%d')
                    except:
                        fromTime = base.get("OPFROM")
                else:
                    fromTime = None

                baseInfo = {
                    "name":
                    base["ENTNAME"],
                    "regCapital":
                    base.get("REGCAP"),
                    # "industry": base.get("industry"),
                    # "regInstitute": base.get("regInstitute"),
                    "establishTime":
                    datetime.datetime.strptime(base.get("ESDATE"), '%Y-%m-%d')
                    if base.get("ESDATE") else None,
                    # "base": base.get("base"),
                    "regNumber":
                    base.get("REGNO"),
                    "regStatus":
                    base.get("ENTSTATUS"),
                    "fromTime":
                    fromTime,
                    "toTime":
                    toTime,
                    "businessScope":
                    base.get("OPSCOPE"),
                    "regLocation":
                    base.get("DOM"),
                    "companyOrgType":
                    base.get("ENTTYPE"),
                    # "legalPersonId": base.get("legalPersonId"),
                    "legalPersonName":
                    base.get("FRNAME")
                }
                gongshang.update(baseInfo)
            else:
                # No registinfo in this crawl: only continue when an older
                # stored record exists to merge the remaining sections into.
                record = collection_goshang.find_one({"name": c["key"]})
                if record is None:
                    logger.info(
                        "No gongshang data before for this missing registinfo company: %s",
                        c["key"])
                    parser_db_util.update_processed(c["_id"])
                    continue

            # --- getShareHolderInfo: shareholders ---------------------
            if c['content'].has_key('getShareHolderInfo') and len(
                    c['content']['getShareHolderInfo']) > 0 and isinstance(
                        c['content']['getShareHolderInfo'], list):
                investors = []
                investorlist = c["content"]["getShareHolderInfo"]
                # logger.info(len(investorlist))
                for i in investorlist:
                    investor_info = {}
                    investor_info["type"] = i.get("INVTYPE")
                    investor_info["name"] = i.get("SHANAME")

                    investors.append(investor_info)
                gongshang["investors"] = investors

            # --- getMainManagerInfo: key personnel --------------------
            members = []
            if c["content"].has_key("getMainManagerInfo") and len(
                    c['content']['getMainManagerInfo']) > 0 and isinstance(
                        c['content']['getMainManagerInfo'], list):
                memberlist = c["content"]["getMainManagerInfo"]
                for m in memberlist:
                    member_info = {}
                    member_info["name"] = m.get("NAME")
                    member_info["position"] = m.get("POSITION")
                    # member_info["position"] = ",".join(list(set(m.get("POSITION"))))

                    members.append(member_info)
                gongshang["members"] = members

            # --- getRegisterChangeInfo: registration changes ----------
            changinfo = []
            if c["content"].has_key("getRegisterChangeInfo") and len(
                    c['content']['getRegisterChangeInfo']) > 0 and isinstance(
                        c['content']['getRegisterChangeInfo'], list):
                changinfoList = c["content"]["getRegisterChangeInfo"]
                for change in changinfoList:
                    change_info = {}
                    change_info["changeTime"] = change.get("ALTDATE")
                    change_info["contentBefore"] = change.get("ALTBE")
                    change_info["contentAfter"] = change.get("ALTAF")
                    change_info["changeItem"] = change.get("ALTITEM")

                    changinfo.append(change_info)
                gongshang["changeInfo"] = changinfo

            # invests_new = []
            # if c["content"].has_key("getInvestmentAbroadInfo") and len(
            #         c['content']['getInvestmentAbroadInfo']) > 0 and isinstance(c['content']['getInvestmentAbroadInfo'],
            #                                                                     list):
            #     investlist = c["content"]["getInvestmentAbroadInfo"]
            #     for invest in investlist:
            #         if invest.has_key("ENTNAME") and invest["ENTNAME"] is not None:
            #             invests_new.append(invest["ENTNAME"])
            #
            #     gongshang["invests_new"] = invests_new

            # len == 1 means only the seeded 'name' key: nothing extracted.
            if len(gongshang) == 1:
                logger.info('no content:%s', c["key"])
            else:
                logger.info(
                    json.dumps(gongshang,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder))

                save_collection_goshang(collection_goshang, gongshang)
                save_collection_goshang_his(collection_goshang_history,
                                            gongshang)
            parser_db_util.update_processed(c["_id"])
            logger.info("processed %s", c["key"])

        if len(items) == 0:
            break
def process():
    """Parse crawled crunchbase company pages in batches of 500.

    Fixes vs original: the end-of-run log and ``return`` were indented
    inside the ``while True`` loop, so only the first batch was ever
    processed — they now run after the loop drains. The Py2-only
    ``except Exception, E`` syntax is replaced with a form valid on
    Python 2.6+ and 3, and the traceback is logged.
    """
    logger.info('crunchbase_company_parser begin ...')
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, 500)
        # mongo = db.connect_mongo()
        # collection = mongo.raw.projectdata
        # items = list(collection.find({"_id" : ObjectId("5b02a14fdeb4717184810e22")}))
        for item in items:
            if item is None:
                continue
            try:
                r = parse_company(item)
                logger.info(
                    json.dumps(r,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder,
                               indent=2))
                # source_company (2010 running)
                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)
                logger.info('%s:%s' % (item['name'], source_company_id))
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                # source_company_name (12020 shortname)
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)

                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder,
                               indent=2))

                # Deactivate empty shells: no full name, no description
                # and no artifacts.
                if (r["fullName"] is None or r["fullName"].strip() == "") and (r['description'] is None or r['description'].strip() == "") \
                    and len(artifacts) == 0:
                    parser_db_util.update_active(SOURCE, item["key"], 'N')
                    parser_db_util.update_processed(item["_id"])
                    logger.info("missing all stuff, processed %s", item["url"])
                    continue

                # source_artifact (4010 website)
                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)

                # source_member and source_company_member_rel (5010 ceo)
                parseMember_save(source_company_id, item, download_crawler)

                parser_db_util.delete_funding(source_company_id)
                # source_funding and source_funding_investor_rel (10020 vc)
                parseFinance_save(source_company_id, item, download_crawler)

            except Exception:
                logger.exception("error processing %s", item["_id"])

            # Marked processed even on failure, matching the original
            # best-effort behavior.
            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s", item["url"])
        if len(items) == 0:
            break
    logger.info('parser end.')
Beispiel #22
0
def process():
    """Parse crawled NEEQ company snapshots into the parsed collection.

    Fix vs original: the None/-999 content guard now runs BEFORE the
    'baseinfo'/'bbseinfo' fallback — the original dereferenced
    ``c['content']`` first and crashed with AttributeError whenever
    content was None.
    """
    skip = 0
    limit = 1000

    num = 0  # running count of handled companies, for logging only
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, skip, limit)
        # items = [parser_db_util.find_process_one(SOURCE, TYPE, 837630)]

        # skip += limit
        for c in items:
            num += 1
            logger.info("%s: %s" % (num, c["key"]))

            # Guard first: content may be missing entirely, or carry the
            # crawler's error marker ret == -999.
            if c["content"] is None or ('ret' in c['content']
                                        and c['content']['ret'] == -999):
                logger.info('%s content is None', c["key"])
                parser_db_util.update_processed(c["_id"])
                continue

            # Some snapshots carry the base info under a misspelled key.
            if 'baseinfo' not in c['content']:
                c['content']["baseinfo"] = c['content']["bbseinfo"]

            if 'name' not in c['content']["baseinfo"]:
                logger.info('%s missing fullName', c["key"])
                parser_db_util.update_processed(c["_id"])
                continue

            content = c['content']

            stockwebsite = 'http://www.neeq.com.cn/nq/detailcompany.html?companyCode=%s&typeId=1&typename=G' % c[
                'key']

            # Strip fractional seconds before parsing the crawl timestamp.
            content['stamp'] = datetime.datetime.strptime(
                re.sub(r'\..+', '', content['stamp']), '%Y-%m-%d %H:%M:%S')
            content['baseinfo']['listingDate'] = datetime.datetime.strptime(
                content['baseinfo']['listingDate'], '%Y%m%d')

            # Convert each holder's date string to a datetime in place.
            if 'topTenHolders' in content:
                for holder in content['topTenHolders']:
                    dateTransed = datetime.datetime.strptime(
                        holder['date'], '%Y-%m-%d')
                    content['topTenHolders'][content['topTenHolders'].index(
                        holder)]['date'] = dateTransed

            parserContent = {
                "source": SOURCE,
                "sourceId": int(c['key']),
                "stockwebsite": stockwebsite,
                # "website":content['baseinfo']['website'],
                "name": content['baseinfo']['name'],
                "listingDate": content['baseinfo']['listingDate'],
            }

            content = parseContent(content)
            if 'website' in content['baseinfo']:
                parserContent["website"] = content['baseinfo']['website']

            # parserContent.update(content.pop('baseinfo'))
            parserContent.update(content)

            logger.info(
                json.dumps(parserContent,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))

            save_collection(collection, parserContent)
            parser_db_util.update_processed(c["_id"])
            logger.info("processed %s", c["key"])

        # time.sleep(1)
        if len(items) == 0:
            logger.info('no more new item')
            break
Beispiel #23
0
def process():
    """Parse crawled SSE (Shanghai Stock Exchange) company snapshots.

    Drains unprocessed items in batches of ``limit``; for each listed
    company it normalizes the listing date, executive start dates and
    base info, then saves the merged document and marks the raw item
    processed.
    """
    skip = 0
    limit = 1000

    num = 0  # running count of handled companies, for logging only
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, skip, limit)
        # items = [parser_db_util.find_process_one(SOURCE, TYPE, 837630)]

        # skip += limit
        for c in items:
            num += 1
            # Skip snapshots with no content or no FULLNAME in base info.
            if c["content"] is None or c["content"]['baseinfo'][0].has_key(
                    "FULLNAME") is False:
                logger.info('%s content is None', c["key"])
                parser_db_util.update_processed(c["_id"])
                continue

            logger.info("%s: %s" % (num, c["key"]))

            content = c['content']

            stockwebsite = 'http://www.sse.com.cn/assortment/stock/list/info/company/index.shtml?COMPANY_CODE=%s' % c[
                'key']
            # The ternary condition is evaluated BEFORE the pop() in the
            # true branch, so the key is still present when len() runs.
            listingDate = datetime.datetime.strptime(
                content.pop('listingDate')[0]['LISTINGDATEA'],
                '%Y-%m-%d') if len(content['listingDate']) > 0 else None

            parserContent = {
                "source": SOURCE,
                "sourceId": int(c['key']),
                "stockwebsite": stockwebsite,
                "website": content['baseinfo'][0].get('WWW_ADDRESS', None),
                'listingDate': listingDate
            }

            # content['stamp'] = datetime.datetime.strptime(re.sub('\..+', '', content['stamp']), '%Y-%m-%d %H:%M:%S')
            # content['baseinfo']['listingDate']=datetime.datetime.strptime(content['baseinfo']['listingDate'], '%Y%m%d')

            # Convert each executive's start time string to a datetime
            # in place.
            if content.has_key('executives'):
                for executive in content['executives']:
                    dateTransed = datetime.datetime.strptime(
                        executive['START_TIME'], '%Y-%m-%d')
                    content['executives'][content['executives'].index(
                        executive)]['START_TIME'] = dateTransed

            content = parseContent(content)

            # parserContent.update(content.pop('baseinfo'))
            # Flatten: base info arrives as a one-element list.
            content['baseinfo'] = content['baseinfo'][0]
            content['name'] = content['baseinfo']['FULLNAME']
            content['baseinfo']['shortname'] = content['baseinfo'].pop(
                'COMPANY_ABBR')

            parserContent.update(content)

            logger.info(
                json.dumps(parserContent,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))

            save_collection(collection, parserContent)
            parser_db_util.update_processed(c["_id"])
            logger.info("processed %s", c["key"])

        # time.sleep(1)
        if len(items) == 0:
            logger.info("no more items")
            break