def insert(shortname, name, brief, website):
    """Save a yitai company (source type 13100) plus its names and, when a
    usable website is supplied, a website artifact.

    :param shortname: short/display name, registered under name type 12020.
    :param name: full name; the "(开业)" suffix is stripped, the md5 of the
        cleaned name becomes the sourceId, and it is registered under 12010.
    :param brief: description passed through to save_company_yitai.
    :param website: optional URL; normalized, scheme-prefixed if needed, and
        stored as an artifact unless it points at sse.com.
    """
    name = name.replace("(开业)", "")
    sourceId = util.md5str(name)
    sid = parser_db_util.save_company_yitai(shortname, name, 13100, sourceId, brief)
    logger.info("sid:%s->sourceId:%s", sid, sourceId)
    parser_db_util.save_source_company_name(sid, name, 12010)
    parser_db_util.save_source_company_name(sid, shortname, 12020)
    if website is not None and website.strip() != "":
        website = url_helper.url_normalize(website)
        # url_normalize may return None/empty for unusable input — re-check.
        if website is not None and website != "":
            # BUG FIX: the original tested the bare truthiness of
            # website.find("https://"); find() returns -1 (truthy) when absent
            # but also any position > 0 (truthy), so the intent — "neither
            # scheme present" — requires an explicit == -1 comparison.
            if website.find("http://") == -1 and website.find("https://") == -1:
                website = "http://" + website
            # Renamed from `type` to avoid shadowing the builtin; the dict
            # key below stays "type" so downstream consumers are unaffected.
            artifact_type, market, app_id = url_helper.get_market(website)
            if artifact_type == 4010:
                # sse.com links are deliberately skipped — presumably
                # exchange pages rather than company homepages; TODO confirm.
                if website.find('sse.com') > 0:
                    pass
                else:
                    artifact = {
                        "sourceCompanyId": sid,
                        "name": shortname,
                        "description": None,
                        "link": website,
                        "domain": app_id,
                        "type": artifact_type
                    }
                    parser_db_util.save_artifacts_standard(sid, [artifact])
def insert(shortname, name, brief, website):
    """Persist a yitai company (source type 13100) with its full and short names.

    The full name has any "(开业)" suffix removed before hashing; its md5
    serves as the sourceId. NOTE(review): ``website`` is accepted but never
    used by this variant — confirm whether that is intentional.
    """
    cleaned_name = name.replace("(开业)", "")
    source_id = util.md5str(cleaned_name)
    company_id = parser_db_util.save_company_yitai(
        shortname, cleaned_name, 13100, source_id, brief)
    logger.info("sid:%s->sourceId:%s", company_id, source_id)
    # Register both name variants: 12010 = full name, 12020 = short name.
    parser_db_util.save_source_company_name(company_id, cleaned_name, 12010)
    parser_db_util.save_source_company_name(company_id, shortname, 12020)
def insert(shortname, brief):
    """Save a short-name-only yitai company (source type 13120) and return its id.

    The md5 of the (unicode-coerced) short name serves as the sourceId; no
    full name is stored.
    """
    # Python 2 unicode coercion before hashing — shortname may be a byte str.
    source_id = util.md5str(unicode(shortname))
    company_id = parser_db_util.save_company_yitai(
        shortname, None, 13120, source_id, brief)
    logger.info("sid:%s->sourceId:%s", company_id, source_id)
    parser_db_util.save_source_company_name(company_id, shortname, 12020)
    return company_id
def insert(shortname, name, brief, fullNames):
    """Save a yitai company (source type 13100), register its short name and
    every full-name alias, and return the new source-company id.

    :param fullNames: extra full-name aliases stored alongside ``name``
        under name type 12010.
    """
    cleaned = name.replace("(开业)", "")
    source_id = util.md5str(cleaned)
    company_id = parser_db_util.save_company_yitai(
        shortname, cleaned, 13100, source_id, brief)
    parser_db_util.save_source_company_name(company_id, shortname, 12020)
    # The cleaned primary name is stored first, then each alias, all as 12010.
    for full_name in [cleaned] + fullNames:
        parser_db_util.save_source_company_name(company_id, full_name, 12010)
    return company_id
def process():
    """Drain one batch of unprocessed Chuangyepu company records: parse each,
    save the standard company plus its artifacts and fundings, and mark it
    processed.

    NOTE(review): this function calls exit() on failure paths and breaks out
    of the while-loop after the first batch — it looks like a supervised
    single-pass/debug run rather than a full drain; confirm before reuse.
    """
    logger.info("Chuangyepu_company_parser begin...")
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        #items = [parser_db_util.find_process_one(SOURCE, TYPE, 1)]
        for item in items:
            #if item['key_int'] != 1:
            #    continue
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            if r["status"] == "No_Data" or r["status"] == "No_Name":
                # Deactivate and mark processed so the record is not retried.
                parser_db_util.update_active(SOURCE, item["key"], 'N')
                parser_db_util.update_processed(item["_id"])
                logger.info("No infos for %s", item["url"])
                # NOTE(review): exit() terminates the whole process here; the
                # continue below is unreachable — likely a debug leftover.
                exit()
                continue
            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            # Clear previously-saved names/beianhao before re-inserting.
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["name"], 12020)
            logger.info("source_company_id=%s", source_company_id)
            artifacts = []
            artifacts.extend(r["artifacts"])
            logger.info(
                json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id, artifacts)
            # Fundings are rebuilt from scratch each run.
            parser_db_util.delete_funding(source_company_id)
            flag = parseFinance_save(source_company_id, r['fundings'],
                                     download_crawler)
            if flag:
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                logger.info("lack something: %s", item["url"])
                # NOTE(review): hard exit on incomplete funding data.
                exit()
        # Only the first batch is ever processed because of this break.
        break
    logger.info("Chuangyepu_company_parser end.")
def process():
    """Process SSE (Shanghai Stock Exchange) company records from mongo.

    Repeatedly pulls up to 100 documents with processStatus == 1 from
    mongo.stock.sse, saves each as a standard company with all of its name
    variants, artifacts and members, then flips processStatus to 2. Loops
    until a query returns no documents.
    """
    logger.info("sse_company_parser begin...")
    start = 0
    while True:
        # A fresh mongo connection per batch; closed after the batch below.
        mongo = db.connect_mongo()
        collection = mongo.stock.sse
        items = list(collection.find({"processStatus": 1}).limit(100))
        for item in items:
            # try:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            # Rebuild the name set from scratch for this company.
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["name"], 12020)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["fullName"], 12010)
            main_company_name = name_helper.get_main_company_name(
                r["fullName"])
            # Store the normalized "main" name only when it differs.
            if main_company_name != r["fullName"]:
                parser_db_util.save_source_company_name(
                    source_company_id, main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)
            # English name is saved only when it is a real value, not one of
            # the known placeholder strings.
            if r["englishName"] is not None and r["englishName"].strip() != "" and r["englishName"].strip() != "-" \
                    and r["englishName"].strip() != "null" and r["englishName"].strip() != "无":
                parser_db_util.save_source_company_name(
                    source_company_id, r["englishName"], 12010)
            # source_company_id = None
            artifacts = parse_artifact(source_company_id, item)
            logger.info(
                json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id, artifacts)
            parseMember_save(source_company_id, item, download_crawler)
            # Mark the mongo document processed (status 1 -> 2).
            collection.update({"_id": item["_id"]}, {"$set": {
                "processStatus": 2
            }})
            logger.info("processed %s", item["sourceId"])
            # break
        mongo.close()
        if len(items) == 0:
            break
    logger.info("sse_company_parser end.")
def process():
    """Process unparsed itjuzi company records in batches of 1000.

    For each record: parse the base data, save the company, rebuild its name
    set (short/product/full/main), save artifacts, footprints and members,
    then mark the record processed. Pages forward by 1000 until a batch is
    empty.
    """
    logger.info("itjuzi_company_parser begin...")
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        # items = [parser_db_util.find_process_one(SOURCE, TYPE, 33045986)]
        for item in items:
            logger.info(item["url"])
            r = parse_base(item)
            # Unparseable records are skipped but NOT marked processed, so
            # they will reappear in later batches — presumably intentional.
            if r is None:
                continue
            source_company_id = parser_db_util.save_company(r, SOURCE,
                                                            download_crawler)
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["shortName"], 12020)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["productName"], 12020)
            if r["fullName"] is not None:
                parser_db_util.save_source_company_name(source_company_id,
                                                        r["fullName"], 12010)
                main_company_name = name_helper.get_main_company_name(r["fullName"])
                if main_company_name != r["fullName"]:
                    parser_db_util.save_source_company_name(source_company_id,
                                                            main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)
            artifacts = parse_artifact(item)
            # flag records whether the item itself carried artifacts; it is
            # only consumed by the commented-out debug break below.
            flag = False
            if len(artifacts) > 0:
                flag = True
            artifacts.extend(r["artifacts"])
            logger.info(artifacts)
            parser_db_util.save_artifacts(source_company_id, artifacts)
            footprints = parse_footprint(item)
            parser_db_util.save_footprints(source_company_id, footprints)
            # members = parse_member(item)
            # parser_db_util.save_member_rels(source_company_id, members, SOURCE)
            parseMember_save(source_company_id, item, download_crawler)
            parser_db_util.update_processed(item["_id"])
            #if flag:
            #    break
        start += 1000
        if len(items) == 0:
            break
    logger.info("itjuzi_company_parser end.")
# --- Fragment: interior of a qimingpian processing loop. The enclosing
# function definition and the `except` clause matching the `try:` below are
# outside the visible span, so this chunk is not runnable on its own.
items = list(collection.find({"url": "http://vip.api.qimingpian.com/d/c3",
                              "processed": None},
                             {"data.basic": 1, "postdata": 1,
                              "productinfos": 1, "url": 1}))
logger.info("items : %s", len(items))
for item in items:
    # if item.has_key("processed") and item["processed"] is True:
    #     continue
    try:
        logger.info(item)
        r = parse_company(item)
        # logger.info(json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
        # Dump every parsed field for inspection.
        for i in r:
            logger.info("%s - %s", i, r[i])
        source_company_id = parser_db_util.save_company_standard(r, download_crawler)
        # Rebuild the name set from scratch for this company.
        parser_db_util.delete_source_company_name(source_company_id)
        parser_db_util.delete_source_mainbeianhao(source_company_id)
        parser_db_util.save_source_company_name(source_company_id, r["name"], 12020)
        parser_db_util.save_source_company_name(source_company_id, r["fullName"], 12010)
        main_company_name = name_helper.get_main_company_name(r["fullName"])
        if main_company_name != r["fullName"]:
            parser_db_util.save_source_company_name(source_company_id, main_company_name, 12010)
        logger.info("source_company_id=%s", source_company_id)
        artifacts = []
        artifacts.extend(r["artifacts"])
        logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
        parser_db_util.save_artifacts_standard(source_company_id, artifacts)
        # # # parser_db_util.delete_funding(source_company_id)
def process():
    """Process unparsed xtecher company records in batches of 1000.

    Records whose parse returns 0 (no website and no company name) are marked
    processed and skipped. Otherwise the company, its names and artifacts are
    saved and the record is marked processed.

    NOTE(review): paging (`start += 1000`) and the funding parse are both
    commented out, and `flag` is hard-coded to True — the "lack something"
    branch is currently unreachable.
    """
    logger.info("xtecher_company_parser begin...")
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        for item in items:
            r = parse_company(item)
            if r == 0:
                parser_db_util.update_processed(item["_id"])
                logger.info("missing website and companyName, processed %s",
                            item["url"])
                continue
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["name"], 12020)
            # A 'fakeName' takes the place of the full-name records entirely.
            if r.has_key('fakeName'):
                parser_db_util.save_source_company_name(
                    source_company_id, r["fakeName"], 12020)
            else:
                parser_db_util.save_source_company_name(
                    source_company_id, r["fullName"], 12010)
                main_company_name = name_helper.get_main_company_name(
                    r["fullName"])
                if main_company_name != r["fullName"]:
                    parser_db_util.save_source_company_name(
                        source_company_id, main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)
            artifacts = parse_artifact(source_company_id, r)
            logger.info(
                json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id, artifacts)
            # parser_db_util.delete_funding(source_company_id)
            # flag=parseFinance_save(source_company_id,item, download_crawler)
            flag = True
            if flag:
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                logger.info("lack something: %s", item["url"])
            # break
        # start += 1000
        # todo
        if len(items) == 0:
            break
    logger.info("xtecher_company_parser end.")
def insert(name):
    """Save a company by full name only (source type 13097).

    Strips any "(开业)" suffix, derives the sourceId from the md5 of the
    cleaned name, and registers the name under type 12010.
    """
    cleaned_name = name.replace("(开业)", "")
    source_id = util.md5str(cleaned_name)
    company_id = parser_db_util.save_company_fullName(cleaned_name, 13097, source_id)
    logger.info("sid:%s->sourceId:%s", company_id, source_id)
    parser_db_util.save_source_company_name(company_id, cleaned_name, 12010)
def process():
    """Process unparsed crunchbase company records, 500 at a time.

    Each item is parsed and saved as a standard company with its short name,
    artifacts, members and fundings. Items lacking full name, description AND
    artifacts are deactivated instead. Every item is marked processed even
    when parsing raised, because the broad except below swallows the error.

    NOTE(review): the query always passes start=0 (the `start` variable is
    never used), relying on find_process_limit to exclude already-processed
    rows each pass — confirm that assumption.
    """
    logger.info('crunchbase_company_parser begin ...')
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, 500)
        # mongo = db.connect_mongo()
        # collection = mongo.raw.projectdata
        # items = list(collection.find({"_id" : ObjectId("5b02a14fdeb4717184810e22")}))
        for item in items:
            if item is None:
                continue
            try:
                r = parse_company(item)
                logger.info(
                    json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder,
                               indent=2))
                # source_company (2010 running)
                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)
                logger.info('%s:%s' % (item['name'], source_company_id))
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                # source_company_name (12020 shortname)
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts, ensure_ascii=False,
                               cls=util.CJsonEncoder, indent=2))
                # Nothing usable at all -> deactivate and skip saving further.
                if (r["fullName"] is None or r["fullName"].strip() == "") and (r['description'] is None or r['description'].strip() == "") \
                        and len(artifacts) == 0:
                    parser_db_util.update_active(SOURCE, item["key"], 'N')
                    parser_db_util.update_processed(item["_id"])
                    logger.info("missing all stuff, processed %s", item["url"])
                    continue
                # source_artifact (4010 website)
                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)
                # source_member and source_company_member_rel(5010 ceo)
                parseMember_save(source_company_id, item, download_crawler)
                parser_db_util.delete_funding(source_company_id)
                # source_funding and source_funding_investor_rel (10020 vc)
                parseFinance_save(source_company_id, item, download_crawler)
            # Python 2 syntax; catches everything and only logs — the item is
            # still marked processed below, so failures are not retried.
            except Exception, E:
                logger.info(E)
                pass
            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s" % item["url"])
            # break
        if len(items) == 0:
            break
    logger.info('parser end.')
    return
def process():
    """Process unparsed 36kr company records in batches of 1000.

    Saves each parsed company with its names, artifacts, members and fundings;
    items missing full name, description AND artifacts are deactivated.
    Every item is marked processed even when an exception occurred, because
    the broad except below only logs.

    NOTE(review): `flag` from parseFinance_save is immediately overwritten
    with True and the `if flag:` gate is commented out — funding failures no
    longer block the processed mark.
    """
    logger.info("36kr_company_parser begin...")
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        # items = [parser_db_util.find_process_one(SOURCE,TYPE,83700389)]
        for item in items:
            try:
                r = parse_company(item)
                logger.info(
                    json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
                if r["fullName"] is not None:
                    parser_db_util.save_source_company_name(
                        source_company_id, r["fullName"], 12010)
                    main_company_name = name_helper.get_main_company_name(
                        r["fullName"])
                    if main_company_name != r["fullName"]:
                        parser_db_util.save_source_company_name(
                            source_company_id, main_company_name, 12010)
                logger.info("source_company_id=%s", source_company_id)
                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts, ensure_ascii=False,
                               cls=util.CJsonEncoder))
                # Nothing usable at all -> deactivate and skip further saves.
                if (r["fullName"] is None or r["fullName"].strip() == "") and (r['description'] is None or r['description'].strip() == "") \
                        and len(artifacts) == 0:
                    parser_db_util.update_active(SOURCE, item["key"], 'N')
                    parser_db_util.update_processed(item["_id"])
                    logger.info("missing all stuff, processed %s", item["url"])
                    continue
                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)
                parseMember_save(source_company_id, item, download_crawler)
                # parser_db_util.delete_funding(source_company_id)
                flag = parseFinance_save(source_company_id, item,
                                         download_crawler)
                flag = True
            # Python 2 syntax; swallow-and-log, then fall through to the
            # unconditional processed mark below.
            except Exception, E:
                logger.info(E)
                pass
            # if flag:
            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s", item["url"])
            # else:
            #     logger.info("lack something: %s", item["url"])
            #break
        #break
        if len(items) == 0:
            break
def process():
    """Full re-scan variant of the 36kr parser: walks ALL records (not just
    unprocessed ones) in batches of 1000, refreshing their active flag.

    Records with status INIT are deactivated; others are reactivated. A new
    standard company (with names, artifacts, fundings and founder/employee/
    former-member relations) is saved ONLY when no source company exists yet
    for this SOURCE/key — existing companies are left untouched.
    """
    logger.info("36kr_company_parser begin...")
    start = 0
    while True:
        items = parser_db_util.find_all_limit(SOURCE, TYPE, start, 1000)
        for item in items:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            if r["status"] == "INIT":
                parser_db_util.update_active(SOURCE, item["key"], 'N')
                #parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
                continue
            # Non-INIT records get their active flag cleared (reactivated).
            parser_db_util.update_active(SOURCE, item["key"], None)
            sc = parser_db_util.get_source_company_by_source_and_sourceid(
                SOURCE, item["key"])
            if sc is None:
                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
                parser_db_util.save_source_company_name(
                    source_company_id, r["fullName"], 12010)
                main_company_name = name_helper.get_main_company_name(
                    r["fullName"])
                if main_company_name != r["fullName"]:
                    parser_db_util.save_source_company_name(
                        source_company_id, main_company_name, 12010)
                logger.info("source_company_id=%s", source_company_id)
                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts, ensure_ascii=False,
                               cls=util.CJsonEncoder))
                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)
                parser_db_util.delete_funding(source_company_id)
                flag = parseFinance_save(source_company_id, item,
                                         download_crawler)
                # Member relations per role code: 5010 founders,
                # 5030 employees, 5040 former members.
                if item["content"].has_key("founders") and item["content"][
                        "founders"]["data"].has_key("data"):
                    parseMember_save(
                        source_company_id, 5010,
                        item["content"]["founders"]["data"]["data"],
                        download_crawler)
                if item["content"].has_key("employees") and item["content"][
                        "employees"]["data"].has_key("data"):
                    parseMember_save(
                        source_company_id, 5030,
                        item["content"]["employees"]["data"]["data"],
                        download_crawler)
                if item["content"].has_key("former_members") and item[
                        "content"]["former_members"]["data"].has_key("data"):
                    parseMember_save(
                        source_company_id, 5040,
                        item["content"]["former_members"]["data"]["data"],
                        download_crawler)
                # if flag:
                #     parser_db_util.update_processed(item["_id"])
                #     logger.info("processed %s" ,item["url"])
                # else:
                #     logger.info("lack somethin: %s", item["url"])
                #break
        start += 1000
        if len(items) == 0:
            break
    logger.info("36kr_company_parser end.")
def process():
    """Process unprocessed lagou company records in batches of 1000.

    Company names matching the blacklist are deactivated. Valid companies are
    saved with their names (short name only when it is strictly shorter than
    the full name), the artifacts provided by the parse, members and develop
    info, then marked processed.

    NOTE(review): `start` is always 0; the loop relies on find_process_limit
    excluding already-processed rows each pass — confirm that assumption.
    """
    logger.info("lagou_company_parser begin...")
    bnames = get_blacklist()
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, 1000)
        # items = [parser_db_util.find_process_one(SOURCE, TYPE, 109625)]
        for item in items:
            r = parse_company(item)
            #if r is None:
            #    continue
            # Blacklist check: a substring match anywhere in the name marks
            # the record No_Name so the branch below deactivates it.
            if r.has_key("name") and r["name"].strip() != "":
                for bname in bnames:
                    if r["name"].find(bname) >= 0:
                        logger.info("黑名单")
                        r["status"] = "No_Name"
                        break
            if r["status"] == "No_Name":
                parser_db_util.update_active(SOURCE, item["key"], 'N')
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s with no data", item["url"])
                continue
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            logger.info("sourceCompanyId : %s", source_company_id)
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            # Short name is only worth storing when shorter than the full
            # name. NOTE(review): assumes r["fullName"] is never None here —
            # len() would raise otherwise; confirm against parse_company.
            if len(r["name"]) < len(r["fullName"]):
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["fullName"], 12010)
            artifacts = []
            artifacts.extend(r["artifacts"])
            logger.info(
                json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
            #artifact provided in lagou do not have any links, ignore that
            #artifacts = parse_artifact(source_company_id, item)
            parser_db_util.save_artifacts_standard(source_company_id, artifacts)
            parseMember_save(source_company_id, item)
            parserDevelop_save(source_company_id, item)
            # job = parser_db_util.find_process_one(SOURCE,36010, item["key_int"])
            # if job:
            #     source_jobs = lagou_job_parser.parse_companyjobs_save(source_company_id, job)
            #     if len(source_jobs) > 0:
            #         parser_db_util.save_jobs_standard(source_jobs)
            #         parser_db_util.update_processed(job["_id"])
            parser_db_util.update_processed(item["_id"])
            #exit()
        if len(items) == 0:
            break
        #break
    logger.info("lagou_company_parser end.")
def process(sourceId=0):
    """Process evervc company records.

    :param sourceId: when > 0, process exactly that one source record and
        stop after a single pass; when 0 (default), page through batches of
        1000 until none remain.

    For each record the parsed company is saved with its full name (12010),
    its short name (12020, only when shorter than the full name or when the
    full name is missing), the normalized main name, artifacts, members and
    fundings, then marked processed.
    """
    logger.info("evervc_company_parser begin...")
    start = 0
    while True:
        if sourceId > 0:
            items = [parser_db_util.find_process_one(SOURCE, TYPE, sourceId)]
        else:
            items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        for item in items:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["fullName"], 12010)
            # BUG FIX: the None/empty guards must be evaluated BEFORE
            # len(r["fullName"]); the original ordering
            #   len(name) < len(fullName) or fullName is None or fullName == ''
            # raised TypeError on a None fullName before its guard could run.
            if r["fullName"] is None or r["fullName"] == '' \
                    or len(r["name"]) < len(r["fullName"]):
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
            main_company_name = name_helper.get_main_company_name(
                r["fullName"])
            if main_company_name != r["fullName"]:
                parser_db_util.save_source_company_name(
                    source_company_id, main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)
            artifacts = parse_artifact(source_company_id, r)
            logger.info(
                json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id, artifacts)
            parseMember_save(source_company_id, item, download_crawler)
            parser_db_util.delete_funding(source_company_id)
            ##??
            flag = parseFinance_save(source_company_id, item, r['sourceId'],
                                     download_crawler)
            # NOTE(review): flag is unconditionally overwritten, so the
            # "lack something" branch below is unreachable; preserved as-is
            # to keep existing behavior.
            flag = True
            if flag:
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                logger.info("lack something: %s", item["url"])
            # break
        # start += 1000
        # todo
        if len(items) == 0 or sourceId > 0:
            break
    logger.info("evervc_company_parser end.")