def process(): logger.info("sse_company_parser begin...") start = 0 while True: mongo = db.connect_mongo() collection = mongo.stock.sse items = list(collection.find({"processStatus": 1}).limit(100)) for item in items: # try: r = parse_company(item) logger.info( json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder)) source_company_id = parser_db_util.save_company_standard( r, download_crawler) parser_db_util.delete_source_company_name(source_company_id) parser_db_util.delete_source_mainbeianhao(source_company_id) parser_db_util.save_source_company_name(source_company_id, r["name"], 12020) parser_db_util.save_source_company_name(source_company_id, r["fullName"], 12010) main_company_name = name_helper.get_main_company_name( r["fullName"]) if main_company_name != r["fullName"]: parser_db_util.save_source_company_name( source_company_id, main_company_name, 12010) logger.info("source_company_id=%s", source_company_id) if r["englishName"] is not None and r["englishName"].strip() != "" and r["englishName"].strip() != "-" \ and r["englishName"].strip() != "null" and r["englishName"].strip() != "无": parser_db_util.save_source_company_name( source_company_id, r["englishName"], 12010) # source_company_id = None artifacts = parse_artifact(source_company_id, item) logger.info( json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder)) parser_db_util.save_artifacts_standard(source_company_id, artifacts) parseMember_save(source_company_id, item, download_crawler) collection.update({"_id": item["_id"]}, {"$set": { "processStatus": 2 }}) logger.info("processed %s", item["sourceId"]) # break mongo.close() if len(items) == 0: break logger.info("sse_company_parser end.")
def process(): logger.info("Chuangyepu_company_parser begin...") start = 0 while True: items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000) #items = [parser_db_util.find_process_one(SOURCE, TYPE, 1)] for item in items: #if item['key_int'] != 1: # continue r = parse_company(item) logger.info( json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder)) if r["status"] == "No_Data" or r["status"] == "No_Name": parser_db_util.update_active(SOURCE, item["key"], 'N') parser_db_util.update_processed(item["_id"]) logger.info("No infos for %s", item["url"]) exit() continue source_company_id = parser_db_util.save_company_standard( r, download_crawler) parser_db_util.delete_source_company_name(source_company_id) parser_db_util.delete_source_mainbeianhao(source_company_id) parser_db_util.save_source_company_name(source_company_id, r["name"], 12020) logger.info("source_company_id=%s", source_company_id) artifacts = [] artifacts.extend(r["artifacts"]) logger.info( json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder)) parser_db_util.save_artifacts_standard(source_company_id, artifacts) parser_db_util.delete_funding(source_company_id) flag = parseFinance_save(source_company_id, r['fundings'], download_crawler) if flag: parser_db_util.update_processed(item["_id"]) logger.info("processed %s", item["url"]) else: logger.info("lack something: %s", item["url"]) exit() break logger.info("Chuangyepu_company_parser end.")
while True:
    # items = list(collection.find({"_id": ObjectId("5ad7ef121045403178ed4135")}).limit(100))
    items = list(collection.find({"url": "http://vip.api.qimingpian.com/d/c3", "processed": None},
                                 {"data.basic": 1, "postdata": 1, "productinfos": 1, "url": 1}))
    logger.info("items : %s", len(items))
    for item in items:
        # if "processed" in item and item["processed"] is True:
        #     continue
        try:
            logger.info(item)
            r = parse_company(item)
            # logger.info(json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            for i in r:
                logger.info("%s - %s", i, r[i])
            source_company_id = parser_db_util.save_company_standard(r, download_crawler)
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id, r["name"], 12020)
            parser_db_util.save_source_company_name(source_company_id, r["fullName"], 12010)
            main_company_name = name_helper.get_main_company_name(r["fullName"])
            if main_company_name != r["fullName"]:
                parser_db_util.save_source_company_name(source_company_id, main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)
            artifacts = []
            artifacts.extend(r["artifacts"])
            logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id, artifacts)
            # assumed completion: flip the "processed" flag queried above so the
            # fixed-URL query stops matching and the loop can terminate
            collection.update({"_id": item["_id"]}, {"$set": {"processed": True}})
        except Exception as e:
            # assumed handling: log the failure and move on to the next item
            logger.exception(e)
    if len(items) == 0:
        break
def process(): logger.info("xtecher_company_parser begin...") start = 0 while True: items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000) for item in items: r = parse_company(item) if r == 0: parser_db_util.update_processed(item["_id"]) logger.info("missing website and companyName, processed %s", item["url"]) continue logger.info( json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder)) source_company_id = parser_db_util.save_company_standard( r, download_crawler) parser_db_util.delete_source_company_name(source_company_id) parser_db_util.delete_source_mainbeianhao(source_company_id) parser_db_util.save_source_company_name(source_company_id, r["name"], 12020) if r.has_key('fakeName'): parser_db_util.save_source_company_name( source_company_id, r["fakeName"], 12020) else: parser_db_util.save_source_company_name( source_company_id, r["fullName"], 12010) main_company_name = name_helper.get_main_company_name( r["fullName"]) if main_company_name != r["fullName"]: parser_db_util.save_source_company_name( source_company_id, main_company_name, 12010) logger.info("source_company_id=%s", source_company_id) artifacts = parse_artifact(source_company_id, r) logger.info( json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder)) parser_db_util.save_artifacts_standard(source_company_id, artifacts) # parser_db_util.delete_funding(source_company_id) # flag=parseFinance_save(source_company_id,item, download_crawler) flag = True if flag: parser_db_util.update_processed(item["_id"]) logger.info("processed %s", item["url"]) else: logger.info("lack something: %s", item["url"]) # break # start += 1000 # todo if len(items) == 0: break logger.info("xtecher_company_parser end.")
def process():
    logger.info('crunchbase_company_parser begin ...')
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, 500)
        # mongo = db.connect_mongo()
        # collection = mongo.raw.projectdata
        # items = list(collection.find({"_id": ObjectId("5b02a14fdeb4717184810e22")}))
        for item in items:
            if item is None:
                continue
            try:
                r = parse_company(item)
                logger.info(json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder, indent=2))
                # source_company (2010 running)
                source_company_id = parser_db_util.save_company_standard(r, download_crawler)
                logger.info('%s:%s' % (item['name'], source_company_id))
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                # source_company_name (12020 short name)
                parser_db_util.save_source_company_name(source_company_id, r["name"], 12020)
                artifacts = parse_artifact(source_company_id, item)
                logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder, indent=2))
                if (r["fullName"] is None or r["fullName"].strip() == "") and \
                        (r['description'] is None or r['description'].strip() == "") and \
                        len(artifacts) == 0:
                    parser_db_util.update_active(SOURCE, item["key"], 'N')
                    parser_db_util.update_processed(item["_id"])
                    logger.info("missing all stuff, processed %s", item["url"])
                    continue
                # source_artifact (4010 website)
                parser_db_util.save_artifacts_standard(source_company_id, artifacts)
                # source_member and source_company_member_rel (5010 ceo)
                parseMember_save(source_company_id, item, download_crawler)
                parser_db_util.delete_funding(source_company_id)
                # source_funding and source_funding_investor_rel (10020 vc)
                parseFinance_save(source_company_id, item, download_crawler)
            except Exception as e:
                logger.info(e)
            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s" % item["url"])
            # break
        if len(items) == 0:
            break
    logger.info('parser end.')
    return
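# Hedged refactor sketch: the "missing all stuff" guard above is repeated
# verbatim in the 36kr parser below; a shared helper would keep the two in
# sync. The name record_is_empty is illustrative, not from the original code.
def record_is_empty(r, artifacts):
    # True when fullName, description and artifacts are all absent or blank.
    full_name_blank = r["fullName"] is None or r["fullName"].strip() == ""
    description_blank = r["description"] is None or r["description"].strip() == ""
    return full_name_blank and description_blank and len(artifacts) == 0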
def process(): logger.info("36kr_company_parser begin...") start = 0 while True: items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000) # items = [parser_db_util.find_process_one(SOURCE,TYPE,83700389)] for item in items: try: r = parse_company(item) logger.info( json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder)) source_company_id = parser_db_util.save_company_standard( r, download_crawler) parser_db_util.delete_source_company_name(source_company_id) parser_db_util.delete_source_mainbeianhao(source_company_id) parser_db_util.save_source_company_name( source_company_id, r["name"], 12020) if r["fullName"] is not None: parser_db_util.save_source_company_name( source_company_id, r["fullName"], 12010) main_company_name = name_helper.get_main_company_name( r["fullName"]) if main_company_name != r["fullName"]: parser_db_util.save_source_company_name( source_company_id, main_company_name, 12010) logger.info("source_company_id=%s", source_company_id) artifacts = parse_artifact(source_company_id, item) logger.info( json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder)) if (r["fullName"] is None or r["fullName"].strip() == "") and (r['description'] is None or r['description'].strip() == "") \ and len(artifacts) == 0: parser_db_util.update_active(SOURCE, item["key"], 'N') parser_db_util.update_processed(item["_id"]) logger.info("missing all stuff, processed %s", item["url"]) continue parser_db_util.save_artifacts_standard(source_company_id, artifacts) parseMember_save(source_company_id, item, download_crawler) # parser_db_util.delete_funding(source_company_id) flag = parseFinance_save(source_company_id, item, download_crawler) flag = True except Exception, E: logger.info(E) pass # if flag: parser_db_util.update_processed(item["_id"]) logger.info("processed %s", item["url"]) # else: # logger.info("lack something: %s", item["url"]) #break #break if len(items) == 0: break
def process(): logger.info("36kr_company_parser begin...") start = 0 while True: items = parser_db_util.find_all_limit(SOURCE, TYPE, start, 1000) for item in items: r = parse_company(item) logger.info( json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder)) if r["status"] == "INIT": parser_db_util.update_active(SOURCE, item["key"], 'N') #parser_db_util.update_processed(item["_id"]) logger.info("processed %s", item["url"]) continue parser_db_util.update_active(SOURCE, item["key"], None) sc = parser_db_util.get_source_company_by_source_and_sourceid( SOURCE, item["key"]) if sc is None: source_company_id = parser_db_util.save_company_standard( r, download_crawler) parser_db_util.delete_source_company_name(source_company_id) parser_db_util.delete_source_mainbeianhao(source_company_id) parser_db_util.save_source_company_name( source_company_id, r["name"], 12020) parser_db_util.save_source_company_name( source_company_id, r["fullName"], 12010) main_company_name = name_helper.get_main_company_name( r["fullName"]) if main_company_name != r["fullName"]: parser_db_util.save_source_company_name( source_company_id, main_company_name, 12010) logger.info("source_company_id=%s", source_company_id) artifacts = parse_artifact(source_company_id, item) logger.info( json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder)) parser_db_util.save_artifacts_standard(source_company_id, artifacts) parser_db_util.delete_funding(source_company_id) flag = parseFinance_save(source_company_id, item, download_crawler) if item["content"].has_key("founders") and item["content"][ "founders"]["data"].has_key("data"): parseMember_save( source_company_id, 5010, item["content"]["founders"]["data"]["data"], download_crawler) if item["content"].has_key("employees") and item["content"][ "employees"]["data"].has_key("data"): parseMember_save( source_company_id, 5030, item["content"]["employees"]["data"]["data"], download_crawler) if item["content"].has_key("former_members") and item[ "content"]["former_members"]["data"].has_key("data"): parseMember_save( source_company_id, 5040, item["content"]["former_members"]["data"]["data"], download_crawler) # if flag: # parser_db_util.update_processed(item["_id"]) # logger.info("processed %s" ,item["url"]) # else: # logger.info("lack somethin: %s", item["url"]) #break start += 1000 if len(items) == 0: break logger.info("36kr_company_parser end.")
def process(): logger.info("lagou_company_parser begin...") bnames = get_blacklist() while True: items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, 1000) # items = [parser_db_util.find_process_one(SOURCE, TYPE, 109625)] for item in items: r = parse_company(item) #if r is None: # continue if r.has_key("name") and r["name"].strip() != "": for bname in bnames: if r["name"].find(bname) >= 0: logger.info("黑名单") r["status"] = "No_Name" break if r["status"] == "No_Name": parser_db_util.update_active(SOURCE, item["key"], 'N') parser_db_util.update_processed(item["_id"]) logger.info("processed %s with no data", item["url"]) continue logger.info( json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder)) source_company_id = parser_db_util.save_company_standard( r, download_crawler) logger.info("sourceCompanyId : %s", source_company_id) parser_db_util.delete_source_company_name(source_company_id) parser_db_util.delete_source_mainbeianhao(source_company_id) if len(r["name"]) < len(r["fullName"]): parser_db_util.save_source_company_name( source_company_id, r["name"], 12020) parser_db_util.save_source_company_name(source_company_id, r["fullName"], 12010) artifacts = [] artifacts.extend(r["artifacts"]) logger.info( json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder)) #artifact provided in lagou do not have any links, ignore that #artifacts = parse_artifact(source_company_id, item) parser_db_util.save_artifacts_standard(source_company_id, artifacts) parseMember_save(source_company_id, item) parserDevelop_save(source_company_id, item) # job = parser_db_util.find_process_one(SOURCE,36010, item["key_int"]) # if job: # source_jobs = lagou_job_parser.parse_companyjobs_save(source_company_id, job) # if len(source_jobs) > 0: # parser_db_util.save_jobs_standard(source_jobs) # parser_db_util.update_processed(job["_id"]) parser_db_util.update_processed(item["_id"]) #exit() if len(items) == 0: break #break logger.info("lagou_company_parser end.")
def process(sourceId=0):
    logger.info("evervc_company_parser begin...")
    start = 0
    while True:
        if sourceId > 0:
            items = [parser_db_util.find_process_one(SOURCE, TYPE, sourceId)]
        else:
            items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        for item in items:
            r = parse_company(item)
            logger.info(json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            source_company_id = parser_db_util.save_company_standard(r, download_crawler)
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id, r["fullName"], 12010)
            # check fullName for None/empty first so len() is never called on None
            if r["fullName"] is None or r["fullName"] == '' or len(r["name"]) < len(r["fullName"]):
                parser_db_util.save_source_company_name(source_company_id, r["name"], 12020)
            main_company_name = name_helper.get_main_company_name(r["fullName"])
            if main_company_name != r["fullName"]:
                parser_db_util.save_source_company_name(source_company_id, main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)
            artifacts = parse_artifact(source_company_id, r)
            logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id, artifacts)
            parseMember_save(source_company_id, item, download_crawler)
            parser_db_util.delete_funding(source_company_id)  # ??
            flag = parseFinance_save(source_company_id, item, r['sourceId'], download_crawler)
            flag = True
            if flag:
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                logger.info("lack something: %s", item["url"])
            # break
        # start += 1000  # todo
        if len(items) == 0 or sourceId > 0:
            break
    logger.info("evervc_company_parser end.")
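# Hedged usage sketch: like the other parsers here, evervc_company_parser is
# presumably run as a standalone script; sourceId > 0 restricts the run to a
# single source record (see find_process_one above). The argv handling is an
# assumption, not from the original module.
if __name__ == "__main__":
    import sys
    source_id = int(sys.argv[1]) if len(sys.argv) > 1 else 0
    process(source_id)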