Example #1
def process():
    logger.info("sse_company_parser begin...")

    start = 0
    while True:
        mongo = db.connect_mongo()
        collection = mongo.stock.sse
        items = list(collection.find({"processStatus": 1}).limit(100))

        for item in items:
            # try:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))

            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["name"], 12020)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["fullName"], 12010)
            main_company_name = name_helper.get_main_company_name(
                r["fullName"])
            if main_company_name != r["fullName"]:
                parser_db_util.save_source_company_name(
                    source_company_id, main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)

            if r["englishName"] is not None and r["englishName"].strip() != "" and r["englishName"].strip() != "-" \
                and r["englishName"].strip() != "null" and r["englishName"].strip() != "无":
                parser_db_util.save_source_company_name(
                    source_company_id, r["englishName"], 12010)

            # source_company_id = None
            artifacts = parse_artifact(source_company_id, item)
            logger.info(
                json.dumps(artifacts,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))

            parser_db_util.save_artifacts_standard(source_company_id,
                                                   artifacts)

            parseMember_save(source_company_id, item, download_crawler)

            collection.update({"_id": item["_id"]},
                              {"$set": {
                                  "processStatus": 2
                              }})
            logger.info("processed %s", item["sourceId"])

        # break
        mongo.close()
        if len(items) == 0:
            break

    logger.info("sse_company_parser end.")
Example #2
def process():
    logger.info("Chuangyepu_company_parser begin...")

    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        #items = [parser_db_util.find_process_one(SOURCE, TYPE, 1)]

        for item in items:
            #if item['key_int'] != 1:
            #    continue
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))

            if r["status"] == "No_Data" or r["status"] == "No_Name":
                parser_db_util.update_active(SOURCE, item["key"], 'N')
                parser_db_util.update_processed(item["_id"])
                logger.info("No infos for %s", item["url"])
                exit()
                continue

            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["name"], 12020)

            logger.info("source_company_id=%s", source_company_id)

            artifacts = []
            artifacts.extend(r["artifacts"])
            logger.info(
                json.dumps(artifacts,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id,
                                                   artifacts)

            parser_db_util.delete_funding(source_company_id)
            flag = parseFinance_save(source_company_id, r['fundings'],
                                     download_crawler)
            if flag:
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                logger.info("lack something:  %s", item["url"])
                exit()

        # Only the first batch is processed: the loop exits unconditionally.
        break

    logger.info("Chuangyepu_company_parser end.")
Example #3
def process():
    logger.info("itjuzi_company_parser begin...")

    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        # items = [parser_db_util.find_process_one(SOURCE, TYPE, 33045986)]
        for item in items:
            logger.info(item["url"])

            r = parse_base(item)
            if r is None:
                continue
            source_company_id = parser_db_util.save_company(r, SOURCE, download_crawler)
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id, r["shortName"],12020)
            parser_db_util.save_source_company_name(source_company_id, r["productName"],12020)
            if r["fullName"] is not None:
                parser_db_util.save_source_company_name(source_company_id, r["fullName"],12010)
                main_company_name = name_helper.get_main_company_name(r["fullName"])
                if main_company_name != r["fullName"]:
                    parser_db_util.save_source_company_name(source_company_id, main_company_name,12010)

            logger.info("source_company_id=%s", source_company_id)

            artifacts = parse_artifact(item)
            flag = len(artifacts) > 0

            artifacts.extend(r["artifacts"])
            logger.info(artifacts)
            parser_db_util.save_artifacts(source_company_id, artifacts)

            footprints = parse_footprint(item)
            parser_db_util.save_footprints(source_company_id, footprints)

            # members = parse_member(item)
            # parser_db_util.save_member_rels(source_company_id, members, SOURCE)
            parseMember_save(source_company_id, item, download_crawler)

            parser_db_util.update_processed(item["_id"])

            #if flag:
        # break
        start += 1000
        if len(items) == 0:
            break

    logger.info("itjuzi_company_parser end.")
Example #4
            # items = list(collection.find({"_id" : ObjectId("5ad7ef121045403178ed4135")}).limit(100))
            items = list(collection.find({"url":"http://vip.api.qimingpian.com/d/c3", "processed":None},
                                         {"data.basic":1,"postdata":1,"productinfos":1,"url":1}))
            logger.info("items : %s", len(items))
            for item in items:
                # if item.has_key("processed") and item["processed"] is True:
                #     continue
                try:
                    logger.info(item)
                    r = parse_company(item)
                    # logger.info(json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
                    for i in r:
                        logger.info("%s - %s", i, r[i])
                    source_company_id = parser_db_util.save_company_standard(r, download_crawler)
                    parser_db_util.delete_source_company_name(source_company_id)
                    parser_db_util.delete_source_mainbeianhao(source_company_id)
                    parser_db_util.save_source_company_name(source_company_id, r["name"], 12020)
                    parser_db_util.save_source_company_name(source_company_id, r["fullName"], 12010)
                    main_company_name = name_helper.get_main_company_name(r["fullName"])
                    if main_company_name != r["fullName"]:
                        parser_db_util.save_source_company_name(source_company_id, main_company_name, 12010)
                    logger.info("source_company_id=%s", source_company_id)

                    artifacts = []
                    artifacts.extend(r["artifacts"])
                    logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))


                    parser_db_util.save_artifacts_standard(source_company_id, artifacts)
Example #5
def process():
    logger.info("xtecher_company_parser begin...")

    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)

        for item in items:
            r = parse_company(item)
            if r == 0:
                parser_db_util.update_processed(item["_id"])
                logger.info("missing website and companyName, processed %s",
                            item["url"])
                continue

            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))

            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["name"], 12020)
            if r.has_key('fakeName'):
                parser_db_util.save_source_company_name(
                    source_company_id, r["fakeName"], 12020)
            else:
                parser_db_util.save_source_company_name(
                    source_company_id, r["fullName"], 12010)
                main_company_name = name_helper.get_main_company_name(
                    r["fullName"])
                if main_company_name != r["fullName"]:
                    parser_db_util.save_source_company_name(
                        source_company_id, main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)

            artifacts = parse_artifact(source_company_id, r)
            logger.info(
                json.dumps(artifacts,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))

            parser_db_util.save_artifacts_standard(source_company_id,
                                                   artifacts)

            # parser_db_util.delete_funding(source_company_id)
            # flag=parseFinance_save(source_company_id,item, download_crawler)
            flag = True

            if flag:
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                logger.info("lack something:  %s", item["url"])

                # break
        # start += 1000  # todo
        if len(items) == 0:
            break

    logger.info("xtecher_company_parser end.")
Example #6
def process():
    logger.info('crunchbase_company_parser begin ...')
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, 500)
        # mongo = db.connect_mongo()
        # collection = mongo.raw.projectdata
        # items = list(collection.find({"_id" : ObjectId("5b02a14fdeb4717184810e22")}))
        for item in items:
            if item is None:
                continue
            try:
                r = parse_company(item)
                logger.info(
                    json.dumps(r,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder,
                               indent=2))
                # source_company (2010 running)
                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)
                logger.info('%s:%s' % (item['name'], source_company_id))
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                # source_company_name (12020 shortname)
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)

                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder,
                               indent=2))

                if (r["fullName"] is None or r["fullName"].strip() == "") and (r['description'] is None or r['description'].strip() == "") \
                    and len(artifacts) == 0:
                    parser_db_util.update_active(SOURCE, item["key"], 'N')
                    parser_db_util.update_processed(item["_id"])
                    logger.info("missing all stuff, processed %s", item["url"])
                    continue

                # source_artifact (4010 website)
                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)

                # source_member and source_company_member_rel(5010 ceo)
                parseMember_save(source_company_id, item, download_crawler)

                parser_db_util.delete_funding(source_company_id)
                # source_funding and source_funding_investor_rel (10020 vc)
                parseFinance_save(source_company_id, item, download_crawler)

            except Exception, E:
                logger.info(E)
                pass

            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s" % item["url"])
        # break
        if len(items) == 0:
            break

    logger.info('crunchbase_company_parser end.')
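
This parser and the 36kr one in the next example apply the same "nothing useful parsed" guard before saving artifacts. A small sketch that names the intent; is_empty_company is hypothetical and assumes r behaves like a dict:

def is_empty_company(r, artifacts):
    # True when neither a full name, a description, nor any artifact was parsed.
    full_name = (r.get("fullName") or "").strip()
    description = (r.get("description") or "").strip()
    return full_name == "" and description == "" and len(artifacts) == 0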
Example #7
def process():
    logger.info("36kr_company_parser begin...")

    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        # items = [parser_db_util.find_process_one(SOURCE,TYPE,83700389)]
        for item in items:
            try:
                r = parse_company(item)
                logger.info(
                    json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))

                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
                if r["fullName"] is not None:
                    parser_db_util.save_source_company_name(
                        source_company_id, r["fullName"], 12010)
                    main_company_name = name_helper.get_main_company_name(
                        r["fullName"])
                    if main_company_name != r["fullName"]:
                        parser_db_util.save_source_company_name(
                            source_company_id, main_company_name, 12010)
                logger.info("source_company_id=%s", source_company_id)

                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder))

                if (r["fullName"] is None or r["fullName"].strip() == "") and (r['description'] is None or r['description'].strip() == "") \
                    and len(artifacts) == 0:
                    parser_db_util.update_active(SOURCE, item["key"], 'N')
                    parser_db_util.update_processed(item["_id"])
                    logger.info("missing all stuff, processed %s", item["url"])
                    continue

                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)

                parseMember_save(source_company_id, item, download_crawler)
                #
                parser_db_util.delete_funding(source_company_id)
                flag = parseFinance_save(source_company_id, item,
                                         download_crawler)
                flag = True
            except Exception, E:
                logger.info(E)
                pass
            # if flag:
            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s", item["url"])
            # else:
            #     logger.info("lack something:  %s", item["url"])

            #break
        #break
        if len(items) == 0:
            break

    logger.info("36kr_company_parser end.")
Example #8
def process():
    logger.info("36kr_company_parser begin...")

    start = 0
    while True:
        items = parser_db_util.find_all_limit(SOURCE, TYPE, start, 1000)

        for item in items:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            if r["status"] == "INIT":
                parser_db_util.update_active(SOURCE, item["key"], 'N')
                #parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
                continue

            parser_db_util.update_active(SOURCE, item["key"], None)

            sc = parser_db_util.get_source_company_by_source_and_sourceid(
                SOURCE, item["key"])
            if sc is None:
                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
                parser_db_util.save_source_company_name(
                    source_company_id, r["fullName"], 12010)
                main_company_name = name_helper.get_main_company_name(
                    r["fullName"])
                if main_company_name != r["fullName"]:
                    parser_db_util.save_source_company_name(
                        source_company_id, main_company_name, 12010)
                logger.info("source_company_id=%s", source_company_id)

                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder))
                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)

                parser_db_util.delete_funding(source_company_id)
                flag = parseFinance_save(source_company_id, item,
                                         download_crawler)

                if item["content"].has_key("founders") and item["content"][
                        "founders"]["data"].has_key("data"):
                    parseMember_save(
                        source_company_id, 5010,
                        item["content"]["founders"]["data"]["data"],
                        download_crawler)
                if item["content"].has_key("employees") and item["content"][
                        "employees"]["data"].has_key("data"):
                    parseMember_save(
                        source_company_id, 5030,
                        item["content"]["employees"]["data"]["data"],
                        download_crawler)
                if item["content"].has_key("former_members") and item[
                        "content"]["former_members"]["data"].has_key("data"):
                    parseMember_save(
                        source_company_id, 5040,
                        item["content"]["former_members"]["data"]["data"],
                        download_crawler)

                # if flag:
                #     parser_db_util.update_processed(item["_id"])
                #     logger.info("processed %s" ,item["url"])
                # else:
                #     logger.info("lack somethin:  %s", item["url"])

            #break
        start += 1000
        if len(items) == 0:
            break

    logger.info("36kr_company_parser end.")
Example #9
def process():
    logger.info("lagou_company_parser begin...")
    bnames = get_blacklist()
    while True:

        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, 1000)
        # items = [parser_db_util.find_process_one(SOURCE, TYPE, 109625)]

        for item in items:

            r = parse_company(item)
            #if r is None:
            #    continue
            if r.has_key("name") and r["name"].strip() != "":
                for bname in bnames:
                    if r["name"].find(bname) >= 0:
                        logger.info("黑名单")
                        r["status"] = "No_Name"
                        break

            if r["status"] == "No_Name":
                parser_db_util.update_active(SOURCE, item["key"], 'N')
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s with no data", item["url"])
                continue

            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            logger.info("sourceCompanyId : %s", source_company_id)
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            if len(r["name"]) < len(r["fullName"]):
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["fullName"], 12010)

            artifacts = []
            artifacts.extend(r["artifacts"])
            logger.info(
                json.dumps(artifacts,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))
            #artifact provided in lagou do not have any links, ignore that
            #artifacts = parse_artifact(source_company_id, item)
            parser_db_util.save_artifacts_standard(source_company_id,
                                                   artifacts)
            parseMember_save(source_company_id, item)

            parserDevelop_save(source_company_id, item)

            # job = parser_db_util.find_process_one(SOURCE,36010, item["key_int"])
            # if job:
            #     source_jobs = lagou_job_parser.parse_companyjobs_save(source_company_id, job)
            #     if len(source_jobs) > 0:
            #         parser_db_util.save_jobs_standard(source_jobs)
            #     parser_db_util.update_processed(job["_id"])
            parser_db_util.update_processed(item["_id"])

            #exit()

        if len(items) == 0:
            break

        #break

    logger.info("lagou_company_parser end.")
Example #10
def process(sourceId=0):
    logger.info("evervc_company_parser begin...")

    start = 0
    while True:
        if sourceId > 0:
            items = [parser_db_util.find_process_one(SOURCE, TYPE, sourceId)]
        else:
            items = parser_db_util.find_process_limit(SOURCE, TYPE, start,
                                                      1000)

        for item in items:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))

            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            # fullName may be missing; check it before calling len() on it.
            if r["fullName"]:
                parser_db_util.save_source_company_name(source_company_id,
                                                        r["fullName"], 12010)
                main_company_name = name_helper.get_main_company_name(
                    r["fullName"])
                if main_company_name != r["fullName"]:
                    parser_db_util.save_source_company_name(
                        source_company_id, main_company_name, 12010)
            if not r["fullName"] or len(r["name"]) < len(r["fullName"]):
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
            logger.info("source_company_id=%s", source_company_id)

            artifacts = parse_artifact(source_company_id, r)
            logger.info(
                json.dumps(artifacts,
                           ensure_ascii=False,
                           cls=util.CJsonEncoder))

            parser_db_util.save_artifacts_standard(source_company_id,
                                                   artifacts)

            parseMember_save(source_company_id, item, download_crawler)

            parser_db_util.delete_funding(source_company_id)  ##??
            flag = parseFinance_save(source_company_id, item, r['sourceId'],
                                     download_crawler)
            flag = True

            if flag:
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                logger.info("lack something:  %s", item["url"])

                # break

        # start += 1000  # todo
        if len(items) == 0 or sourceId > 0:
            break

    logger.info("evervc_company_parser end.")