def process():
    """Re-run member extraction for already-saved 36kr companies.

    Pages through unprocessed crawl items in batches of 1000, looks up the
    previously saved source company for each item, saves its members, marks
    the item processed and flags the company row as processed in MySQL.
    """
    logger.info("36kr_company_parser begin...")
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        for item in items:
            # The company was saved by an earlier pass; we only need its id.
            source_company_id = parser_db_util.get_company(SOURCE, item["key"])
            logger.info("sourcecid: %s", source_company_id)
            parseMember_save(source_company_id["id"], item, download_crawler)
            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s", item["url"])
            # Bug fix: guarantee the connection is closed even if the
            # UPDATE raises; the original leaked the connection on error.
            conn = db.connect_torndb()
            try:
                conn.update(
                    "update source_company set processStatus=1 where id=%s",
                    source_company_id["id"])
            finally:
                conn.close()
        # update_processed() removes items from the unprocessed set, so
        # querying at offset `start` (0) repeatedly still drains the queue.
        if len(items) == 0:
            break
    logger.info("36kr_company_parser end.")
def process():
    """Parse 36kr investor pages in batches of 1000 until none remain."""
    logger.info("36kr_investor_parser begin...")
    start = 0
    while True:
        batch = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        for item in batch:
            parsed = parse_investor(item)
            logger.info(json.dumps(parsed, ensure_ascii=False, cls=util.CJsonEncoder))
            investor_id = parser_db_util.save_investor_standard_new(parsed, download_crawler)
            # Contact rows are only written when the parser found addresses.
            if len(parsed["addresses"]) > 0:
                parser_db_util.save_investor_contact_standard(investor_id, parsed["addresses"])
            parseMember_save(investor_id, item, download_crawler)
            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s", item["url"])
        if len(batch) == 0:
            break
    logger.info("36kr_investor_parser end.")
def process():
    """Run the itjuzi news parser over every unprocessed crawl item."""
    logger.info("itjuzi_news_parser begin...")
    for item in parser_db_util.find_process(SOURCE, TYPE):
        logger.info(item["key_int"])
        logger.info(item["url"])
        # Only mark the item done when the parser reports success.
        if parser(item):
            parser_db_util.update_processed(item["_id"])
    logger.info("itjuzi_news_parser end.")
def process():
    """Parse Chuangyepu company items: save company, names, artifacts and fundings.

    NOTE(review): this function contains what look like debug leftovers —
    two `exit()` calls that kill the whole process, an unreachable `continue`
    after the first `exit()`, and an unconditional `break` that stops the
    while-loop after one batch. Confirm intent before productionizing.
    """
    logger.info("Chuangyepu_company_parser begin...")
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        #items = [parser_db_util.find_process_one(SOURCE, TYPE, 1)]
        for item in items:
            #if item['key_int'] != 1:
            #    continue
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            # Items with no usable data are deactivated and marked processed.
            if r["status"] == "No_Data" or r["status"] == "No_Name":
                parser_db_util.update_active(SOURCE, item["key"], 'N')
                parser_db_util.update_processed(item["_id"])
                logger.info("No infos for %s", item["url"])
                exit()  # NOTE(review): terminates the whole run — debug leftover?
                continue  # unreachable after exit()
            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            # Names are rewritten from scratch on every parse.
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["name"], 12020)
            logger.info("source_company_id=%s", source_company_id)
            artifacts = []
            artifacts.extend(r["artifacts"])
            logger.info(
                json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id, artifacts)
            # Fundings are replaced wholesale for this company.
            parser_db_util.delete_funding(source_company_id)
            flag = parseFinance_save(source_company_id, r['fundings'],
                                     download_crawler)
            if flag:
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                logger.info("lack something: %s", item["url"])
                exit()  # NOTE(review): terminates on first incomplete item
        break  # NOTE(review): stops after the first batch — presumably debug
    logger.info("Chuangyepu_company_parser end.")
def process():
    """Parse every unprocessed itjuzi investor-firm item and persist it."""
    logger.info("itjuzi_investorfirm_parser begin...")
    for item in parser_db_util.find_process(SOURCE, TYPE):
        logger.info(item["key"])
        logger.info(item["url"])
        parsed = parser(item)
        # A failed parse leaves the item unprocessed for a later retry.
        if parsed is None:
            continue
        parser_db_util.save_investfirm(parsed, SOURCE, download_crawler)
        parser_db_util.update_processed(item["_id"])
    logger.info("itjuzi_investorfirm_parser end.")
def process():
    """Parse itjuzi company items: save company, names, artifacts, footprints, members.

    NOTE(review): `start += 1000` advances the window while
    update_processed() presumably shrinks the unprocessed set, which can
    skip items — confirm find_process_limit() semantics.
    """
    logger.info("itjuzi_company_parser begin...")
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        # items = [parser_db_util.find_process_one(SOURCE, TYPE, 33045986)]
        for item in items:
            logger.info(item["url"])
            r = parse_base(item)
            if r is None:
                continue
            source_company_id = parser_db_util.save_company(r, SOURCE,
                                                            download_crawler)
            # Names are rebuilt from scratch: short/product names as alias
            # type 12020, the registered full name (and its main-company
            # variant) as type 12010.
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["shortName"], 12020)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["productName"], 12020)
            if r["fullName"] is not None:
                parser_db_util.save_source_company_name(source_company_id,
                                                        r["fullName"], 12010)
                main_company_name = name_helper.get_main_company_name(r["fullName"])
                if main_company_name != r["fullName"]:
                    parser_db_util.save_source_company_name(source_company_id,
                                                            main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)
            artifacts = parse_artifact(item)
            # `flag` records whether the page itself carried artifacts;
            # currently only used by the commented-out debug break below.
            flag = False
            if len(artifacts) > 0:
                flag = True
            artifacts.extend(r["artifacts"])
            logger.info(artifacts)
            parser_db_util.save_artifacts(source_company_id, artifacts)
            footprints = parse_footprint(item)
            parser_db_util.save_footprints(source_company_id, footprints)
            # members = parse_member(item)
            # parser_db_util.save_member_rels(source_company_id, members, SOURCE)
            parseMember_save(source_company_id, item, download_crawler)
            parser_db_util.update_processed(item["_id"])
            #if flag:
            #    break
        start += 1000
        if len(items) == 0:
            break
    logger.info("itjuzi_company_parser end.")
def process():
    """Parse Chuangyepu investor firms batch-by-batch until the queue drains."""
    logger.info("Chuangyepu_investfirm_parser begin...")
    start = 0
    while True:
        batch = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        for item in batch:
            investor = parse_investor(item)
            if investor is not None:
                parser_db_util.save_investor_standard(investor, download_crawler)
            # Mark done regardless of parse outcome, so the queue drains.
            parser_db_util.update_processed(item["_id"])
        if len(batch) == 0:
            break
    logger.info("Chuangyepu_investfirm_parser end.")
def process():
    """Parse Demo8 'next' items: save company, its score and its artifacts."""
    logger.info("Demo8_next_parser begin...")
    for item in parser_db_util.find_process(SOURCE, TYPE):
        logger.info(item["url"])
        base = parse_base(item)
        if base is None:
            continue
        company_id = parser_db_util.save_company(base, SOURCE)
        logger.info("source_company_id=%s", company_id)
        parser_db_util.save_company_score(company_id, base["score"])
        parser_db_util.save_artifacts(company_id, base["artifacts"])
        parser_db_util.update_processed(item["_id"])
    logger.info("Demo8_next_parser end.")
def process():
    """Parse itjuzi 'next' items: save company, score, and market-typed artifacts.

    Artifact links are classified via url_helper.get_market(); links with an
    unknown market type are dropped, and app-store links (4040/4050) are
    dropped when no app id could be extracted.
    """
    logger.info("itjuzi_next_parser begin...")
    items = parser_db_util.find_process(SOURCE, TYPE)
    for item in items:
        logger.info(item["url"])
        r = parse_base(item)
        if r is None:
            continue
        source_company_id = parser_db_util.save_company(
            r, SOURCE, download_crawler)
        logger.info("source_company_id=%s", source_company_id)
        parser_db_util.save_company_score(source_company_id, r["score"])
        artifacts = []
        for artifact in r["artifacts"]:
            link = artifact["link"]
            # Renamed from `type` to avoid shadowing the builtin.
            market_type, app_market, app_id = url_helper.get_market(link)
            if market_type is None:
                continue
            # 4040/4050 appear to be app-market types that require an app id.
            if market_type == 4040 or market_type == 4050:
                if app_id is None:
                    continue
            artifact["type"] = market_type
            artifact["domain"] = app_id
            artifacts.append(artifact)
        parser_db_util.save_artifacts(source_company_id, artifacts)
        parser_db_util.update_processed(item["_id"])
    logger.info("itjuzi_next_parser end.")
def process():
    """Parse 36kr 'next' items; per-item failures are logged and skipped.

    Fixes: `except Exception, ex` is Python-2-only syntax — `except ... as`
    works on 2.6+ and 3.x. Also adds the missing end-of-run log line for
    consistency with the sibling parsers.
    """
    logger.info("36kr_next_parser begin...")
    items = parser_db_util.find_process(SOURCE, TYPE)
    for item in items:
        logger.info(item["url"])
        r = parse_base(item)
        if r is None:
            continue
        try:
            source_company_id = parser_db_util.save_company(
                r, SOURCE, download_crawler)
            logger.info("source_company_id=%s", source_company_id)
            parser_db_util.save_company_score(source_company_id, r["score"])
            parser_db_util.save_artifacts(source_company_id, r["artifacts"])
            parser_db_util.update_processed(item["_id"])
        except Exception as ex:
            # Best-effort: leave the item unprocessed so it is retried later.
            logger.info(ex)
            continue
    logger.info("36kr_next_parser end.")
def process():
    """Parse itjuzi funding rounds and persist each one that parses cleanly."""
    logger.info("itjuzi_funding_parser2 begin...")
    items = parser_db_util.find_process(SOURCE, TYPE)
    # items = [parser_db_util.find_process_one(SOURCE, TYPE, 9551657)]
    for item in items:
        logger.info(item["url"])
        funding = parse(item)
        if funding is None:
            continue
        # -1 is the parser's sentinel for "nothing to save, but done".
        if funding == -1:
            parser_db_util.update_processed(item["_id"])
            continue
        saved, source_funding_id = parser_db_util.save_funding(funding, 13030)
        if saved:
            parser_db_util.update_processed(item["_id"])
    logger.info("itjuzi_funding_parser2 end.")
    logger.info(nokeys)
def process():
    """Normalize crawled registration ("gongshang") records and store them.

    Pulls batches of raw crawl documents, extracts baseInfo / investors /
    staff / change / investment lists into a flat `gongshang` dict, writes it
    to the current and history collections, then marks the item processed.
    """
    skip = 0
    limit = 1000
    num = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, skip, 1000)
        #items = [parser_db_util.find_process_one(SOURCE, TYPE, 2310299181)]
        #items = [parser_db_util.find_process_one(SOURCE, TYPE, 1257527760)]
        skip += limit
        finish = True
        for c in items:
            #finish = False
            num += 1
            # NOTE(review): this branch marks the item processed but does NOT
            # `continue`, so the item is still parsed below — looks like a
            # missing `continue`; confirm intent.
            if c.has_key("exist") and c["exist"] is False:
                logger.info(c["key"])
                parser_db_util.update_processed(c["_id"])
            if c["content"] is None:
                logger.info(c["key"])
                parser_db_util.update_processed(c["_id"])
                continue
            if c["content"]["data"] is None:
                logger.info(c["key"])
                parser_db_util.update_processed(c["_id"])
                continue
            base = c["content"]["data"]["baseInfo"]
            # A record without regStatus is treated as unusable.
            if base.get("regStatus") is None:
                logger.info(c["key"])
                parser_db_util.update_processed(c["_id"])
                continue
            logger.info("%s: %s" % (num, c["key"]))
            # Epoch-millisecond timestamps are converted via from1970todate().
            gongshang = {
                "name": base["name"],
                "regCapital": base.get("regCapital"),
                "industry": base.get("industry"),
                "regInstitute": base.get("regInstitute"),
                "establishTime": from1970todate(base.get("estiblishTime")),
                "base": base.get("base"),
                "regNumber": base.get("regNumber"),
                "regStatus": base.get("regStatus"),
                "fromTime": from1970todate(base.get("fromTime")),
                "toTime": from1970todate(base.get("toTime")),
                "businessScope": base.get("businessScope"),
                "regLocation": base.get("regLocation"),
                "companyOrgType": base.get("companyOrgType"),
                "legalPersonId": base.get("legalPersonId"),
                "legalPersonName": base.get("legalPersonName")
            }
            investors = []
            if c["content"]["data"].has_key("investorList"):
                investorlist = c["content"]["data"]["investorList"]
                #logger.info(len(investorlist))
                for i in investorlist:
                    investor_info = {}
                    # Raw type codes are mapped to labels; unknown codes -> "".
                    investor_info["type"] = investor_type_map.get(
                        i.get("type"), "")
                    investor_info["name"] = i.get("name")
                    investors.append(investor_info)
            members = []
            if c["content"]["data"].has_key("staffList"):
                memberlist = c["content"]["data"]["staffList"]
                for m in memberlist:
                    member_info = {}
                    member_info["name"] = m.get("name")
                    # De-duplicate a person's roles, joined as "a,b".
                    member_info["position"] = ",".join(
                        list(set(m.get("typeJoin"))))
                    members.append(member_info)
            changinfo = []
            if c["content"]["data"].has_key("comChanInfoList"):
                changinfo = c["content"]["data"]["comChanInfoList"]
            invests = []
            if c["content"]["data"].has_key("investList"):
                investlist = c["content"]["data"]["investList"]
                for v in investlist:
                    if not v.has_key("name"):
                        continue
                    data = {"name": v["name"]}
                    invests.append(data)
            gongshang["members"] = members
            gongshang["investors"] = investors
            gongshang["changeInfo"] = changinfo
            gongshang["invests"] = invests
            logger.info(
                json.dumps(gongshang, ensure_ascii=False, cls=util.CJsonEncoder))
            # Write to both the live collection and the history collection.
            save_collection_goshang(collection_goshang, gongshang)
            save_collection_goshang_his(collection_goshang_history, gongshang)
            parser_db_util.update_processed(c["_id"])
            logger.info("processed %s", c["key"])
        if len(items) == 0:
            break
def process():
    """Merge crawled IC_info registration data into the gongshang collections.

    For each crawl document: build baseInfo from `IC_info`, merge it with any
    existing record (never overwriting with None), compute shareholder
    capital ratios from `partners`, collect managers / change records /
    outbound investments / contact info, then persist to the live and history
    collections and mark the item processed.
    """
    skip = 0
    limit = 1000
    num = 0
    while True:
        # NOTE(review): always queries offset 0; relies on update_processed()
        # shrinking the unprocessed set (skip increment is commented out).
        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, limit)
        # items = [parser_db_util.find_process_one_key(SOURCE, TYPE, u'哎哎信息科技(上海)有限公司')]
        # items = [parser_db_util.find_process_one_key(SOURCE, TYPE, u'行吟信息科技(上海)有限公司')]
        # items = [parser_db_util.find_process_one_key(SOURCE, TYPE, u'深圳市加推科技有限公司')]
        # skip += limit
        for c in items:
            num += 1
            logger.info("%s: %s" % (num, c["key"]))
            gongshang = {'name': c["key"]}
            if c['content'].has_key('IC_info') and len(
                    c['content']['IC_info']) > 0:
                base = c['content']['IC_info']

                # Parse a '%Y-%m-%d' field; "-" or malformed values -> None.
                def get_date(key):
                    if base.get(key) is not None and base.get(key) != "-":
                        try:
                            result = datetime.datetime.strptime(
                                base.get(key), '%Y-%m-%d')
                        except:
                            result = None
                    else:
                        result = None
                    return result

                toTime = get_date('term_end')
                fromTime = get_date('term_start')
                try:
                    address = base.get("addresses")[0]['address']
                except:
                    address = None
                baseInfo = {
                    "name": base["name"],
                    "regCapital": base.get("regist_capital"),
                    # "industry": base.get("industry"),
                    # "regInstitute": base.get("regInstitute"),
                    "establishTime": get_date('start_date'),
                    # "base": base.get("base"),
                    "regNumber": base.get("reg_no"),
                    "regStatus": base.get("status"),
                    "fromTime": fromTime,
                    "toTime": toTime,
                    "businessScope": base.get("scope"),
                    "regLocation": address,
                    "companyOrgType": base.get("kind"),
                    # "legalPersonId": base.get("legalPersonId"),
                    "legalPersonName": base.get("legal_person")
                }
                # gongshang.update(baseInfo)
                # Merge with any prior record: never replace an existing
                # value with None.
                record = collection_goshang.find_one({"name": c["key"]})
                for key in baseInfo:
                    if record is None:
                        gongshang[key] = baseInfo[key]
                    else:
                        if baseInfo[key] is None and record.has_key(key):
                            logger.info("%s is None, don't update" % key)
                        else:
                            gongshang[key] = baseInfo[key]
            else:
                # No IC_info: only proceed if we already had a record to enrich.
                record = collection_goshang.find_one({"name": c["key"]})
                if record is None:
                    logger.info(
                        "No gongshang data before for this missing registinfo company: %s",
                        c["key"])
                    parser_db_util.update_processed(c["_id"])
                    continue
            if c['content'].has_key('partners') and len(
                    c['content']['partners']) > 0:
                investors = []
                investorlist = c["content"]["partners"]
                # logger.info(len(investorlist))
                # First pass: sum paid-in ("real") and subscribed ("should")
                # capital across all partners. Amounts look like "<num>万...";
                # -999999999 marks the total as unusable.
                real_capitals_total, should_capitals_total = 0, 0
                for i in investorlist:
                    real_capitals = i.get('real_capitals')
                    should_capitals = i.get('should_capitals')
                    if real_capitals:
                        amount = real_capitals[0].get('amount', -666)
                        if amount != -666 and amount != '-' and amount.strip(
                        ) not in [u'万人民币', u'万美元']:
                            if amount.find(u'万') < 0:
                                continue
                            amount = amount.split(u'万')[0]
                            real_capitals_total += float(amount)
                    else:
                        logger.info(
                            '%s has no capital amount, stop calculating rate.',
                            c['key'])
                        real_capitals_total = -999999999
                    if should_capitals:
                        amount = should_capitals[0].get('amount', -666)
                        if amount != -666 and amount != '-' and amount.strip(
                        ) not in [u'万人民币', u'万美元']:
                            if amount.find(u'万') < 0:
                                continue
                            amount = amount.split(u'万')[0]
                            should_capitals_total += float(amount)
                    else:
                        logger.info(
                            '%s has no capital amount, stop calculating rate.',
                            c['key'])
                        should_capitals_total = -999999999
                        break
                # Second pass: attach each partner's share ("rate") of the
                # totals; full-width parens are normalized to ASCII.
                for i in investorlist:
                    investor_info = {}
                    investor_info["type"] = i.get("kind")
                    investor_info["name"] = i.get("name").replace("(", "(").replace(
                        ")", ")")
                    real_capitals = i.get("real_capitals")
                    for capital in real_capitals:
                        if capital.has_key('amount') and capital[
                                'amount'].strip() not in [u'万人民币', u'万美元']:
                            if capital['amount'].find(u'万') < 0:
                                continue
                            amount = capital['amount'].split(u'万')[0]
                            if amount != '-':
                                amount = float(amount)
                                rate = '%s%%' % (int(
                                    round(amount / real_capitals_total, 2) *
                                    100)) if real_capitals_total > 0 else '-'
                            else:
                                rate = '-'
                            capital['rate'] = rate
                    investor_info["real_capitals"] = real_capitals
                    should_capitals = i.get("should_capitals")
                    for capital in should_capitals:
                        if capital.has_key('amount') and capital[
                                'amount'].strip() not in [u'万人民币', u'万美元']:
                            if capital['amount'].find(u'万') < 0:
                                continue
                            amount = capital['amount'].split(u'万')[0]
                            if amount != '-':
                                amount = float(amount)
                                rate = '%s%%' % (int(
                                    round(amount / should_capitals_total,
                                          2) * 100)) if should_capitals_total > 0 else '-'
                            else:
                                rate = '-'
                            capital['rate'] = rate
                    investor_info["should_capitals"] = should_capitals
                    investors.append(investor_info)
                    # Corporate shareholders are queued for their own
                    # gongshang crawl.
                    if investor_info["name"] is not None and investor_info[
                            "name"] != '' and (
                                investor_info["type"].find('企业') >= 0
                                or investor_info["type"].find('公司') >= 0):
                        add_gongshang_name(investor_info["name"])
                gongshang["investors"] = investors
            members = []
            if c["content"].has_key("managers") and len(
                    c['content']['managers']) > 0:
                memberlist = c["content"]["managers"]
                for m in memberlist:
                    member_info = {}
                    member_info["name"] = m.get("name")
                    member_info["position"] = m.get("position")
                    # member_info["position"] = ",".join(list(set(m.get("POSITION"))))
                    members.append(member_info)
                gongshang["members"] = members
            changinfo = []
            if c["content"].has_key("change_records") and len(
                    c['content']['change_records']) > 0:
                changinfoList = c["content"]["change_records"]
                for change in changinfoList:
                    change_info = {}
                    change_info["changeTime"] = change.get("date")
                    change_info["contentBefore"] = change.get("before")
                    change_info["contentAfter"] = change.get("after")
                    change_info["changeItem"] = change.get("item")
                    changinfo.append(change_info)
                gongshang["changeInfo"] = changinfo
            else:
                gongshang["changeInfo"] = []
            invests_new = []
            if c["content"].has_key("invests") and len(
                    c['content']['invests']) > 0:
                investlist = c["content"]["invests"]
                for invest in investlist:
                    if invest.has_key("name") and invest["name"] is not None:
                        invest['name'] = invest['name'].replace("(", "(").replace(
                            ")", ")")
                        invests_new.append(invest)
                        # Investees also get queued for their own crawl.
                        add_gongshang_name(invest["name"])
                gongshang["invests_new"] = invests_new
            if c["content"].has_key("contact") and len(
                    c['content']['contact']) > 0:
                gongshang['contact'] = c['content']['contact']
            # Only 'name' present means nothing was extracted.
            if len(gongshang) == 1:
                logger.info('no content:%s', c["key"])
            else:
                try:
                    logger.info(
                        json.dumps(gongshang, ensure_ascii=False,
                                   cls=util.CJsonEncoder))
                except:
                    pass
                save_collection_goshang(collection_goshang, gongshang)
                save_collection_goshang_his(collection_goshang_history, gongshang)
            parser_db_util.update_processed(c["_id"])
            logger.info("processed %s", c["key"])
        if len(items) == 0:
            break
def process():
    """Parse cached SZSE listed-company pages into normalized records.

    For each unprocessed crawl item: extract the company's full name, listing
    date, website and base info from the stored HTML (via pyquery), save the
    normalized record, and mark the item processed.

    Fix: the `content is None` guard previously ran AFTER the content had
    already been fed to html.fromstring(), which raises on None — the guard
    now runs first. Dead commented-out code was removed.
    """
    skip = 0
    limit = 1000
    num = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, skip, limit)
        for c in items:
            num += 1
            logger.info("%s: %s" % (num, c["key"]))
            content = c['content']
            # Guard BEFORE parsing: html.fromstring(None) would raise.
            if content is None:
                logger.info('%s content is None', c["key"])
                parser_db_util.update_processed(c["_id"])
                continue
            d = pq(html.fromstring(content))
            name = d(':contains("公司名称")+ td').text()
            if name is None or name == '':
                logger.info('%s missing fullName', c["key"])
                parser_db_util.update_processed(c["_id"])
                continue
            stockwebsite = 'http://www.szse.cn/main/marketdata/hqcx/hqlb/index.shtml?code=%s' % c['key']
            listingDate = d(':contains("A股上市日期")+ td').text()
            # Empty cell -> '' sentinel; otherwise a datetime.
            listingDate = datetime.datetime.strptime(
                listingDate, '%Y-%m-%d') if len(listingDate) > 0 else ''
            website = d(':contains("公司网址")+ td').text()
            parserContent = {
                "source": SOURCE,
                "sourceId": int(c['key']),
                "stockwebsite": stockwebsite,
                "website": website,
                'listingDate': listingDate
            }
            parserContent['baseinfo'] = {
                'shortname': d(':contains("A股简称")+ td').text(),
                'englishName': d(':contains("英文名称")+ td').text(),
                'regLocation': d(':contains("注册地址")+ td').text(),
                'totalStockEquity': d(':contains("A股总股本")+ td').text(),
                'region': d(':contains("地 区")+ td').text(),
                'province': d(':contains("省 份")+ td').text(),
                'city': d(':contains("城 市")+ td').text(),
                'industry': d(':contains("所属行业")+ td').text(),
            }
            parserContent['executives'] = []
            parserContent['name'] = name
            logger.info(json.dumps(parserContent, ensure_ascii=False,
                                   cls=util.CJsonEncoder))
            save_collection(collection, parserContent)
            parser_db_util.update_processed(c["_id"])
            logger.info("processed %s", c["key"])
            # time.sleep(1)
        if len(items) == 0:
            logger.info("no more items")
            break
def process():
    """Parse XML-ish registration documents (sections A1/B1/B3/A2/B7) into gongshang records.

    A1 = base registration info, B1 = shareholders, B3 = key personnel,
    A2 = change records, B7 = outbound investments. Results are merged with
    any existing record and written to the live and history collections.
    """
    skip = 0
    limit = 1000
    num = 0
    while True:
        # Always offset 0: relies on update_processed() draining the queue.
        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, limit)
        # items = [parser_db_util.find_process_one_key(SOURCE, TYPE, u'哎哎信息科技(上海)有限公司')]
        # items = [parser_db_util.find_process_one_key(SOURCE, TYPE, u'北京真格天投股权投资中心(有限合伙)')]
        # skip += limit
        for c in items:
            num += 1
            logger.info("%s: %s" % (num, c["key"]))
            gongshang = {'name': c["key"]}
            if c['content'].has_key('A1'):
                base = pq(html.fromstring(c['content']['A1'].decode("utf-8")))
                item = base('BASIC ITEM')
                if len(item) > 0:
                    base = base(item)

                    # Element text, with '' normalized to None.
                    def getItem(key):
                        if base(key).text() != '':
                            return base(key).text()
                        else:
                            return None

                    baseInfo = {
                        "name": getItem('ENTNAME'),
                        "regCapital": getItem('REGCAP'),
                        "industry": getItem('INDUSTRYPHY'),
                        "regInstitute": getItem('REGORG'),
                        "establishTime":
                        datetime.datetime.strptime(getItem('ESDATE'),
                                                   '%Y-%m-%d') if getItem(
                                                       'ESDATE') is not None else None,
                        "base": getItem('REGORGPROVINCE'),
                        "regNumber": getItem('REGNO'),
                        "regStatus": getItem('ENTSTATUS'),
                        "fromTime":
                        datetime.datetime.strptime(getItem('OPFROM'),
                                                   '%Y-%m-%d') if getItem(
                                                       'OPFROM') is not None else None,
                        "toTime":
                        datetime.datetime.strptime(getItem('OPTO'),
                                                   '%Y-%m-%d') if getItem(
                                                       'OPTO') is not None else None,
                        "businessScope": getItem('CBUITEM'),
                        "regLocation": getItem('OPLOC'),
                        "companyOrgType": getItem('ENTTYPE'),
                        # "legalPersonId": getItem('REGORGPROVINCE'),
                        "legalPersonName": getItem('FRNAME')
                    }
                    # gongshang.update(baseInfo)
                    # Merge: never overwrite an existing stored value with None.
                    record = collection_goshang.find_one({"name": c["key"]})
                    for key in baseInfo:
                        if record is None:
                            gongshang[key] = baseInfo[key]
                        else:
                            if baseInfo[key] is None and record.has_key(key):
                                logger.info("%s is None, don't update" % key)
                            else:
                                gongshang[key] = baseInfo[key]
            if c['content'].has_key('B1'):
                investors = []
                htmlRaw = pq(html.fromstring(c['content']['B1'].decode("utf-8")))
                item = htmlRaw('SHAREHOLDER ITEM')
                if len(item) > 0:
                    for investor in item:
                        i = htmlRaw(investor)
                        investor_info = {}
                        # investor_info["type"] = i('')
                        investor_info["name"] = i("SHANAME").text()
                        investors.append(investor_info)
                    gongshang["investors"] = investors
            members = []
            # NOTE(review): `has_key(...) > 0` compares a bool to 0 — works,
            # but the `> 0` is spurious (also on A2 below).
            if c["content"].has_key("B3") > 0:
                htmlRaw = pq(html.fromstring(c['content']['B3'].decode("utf-8")))
                item = htmlRaw('PERSON ITEM')
                if len(item) > 0:
                    for member in item:
                        # Closure over `m`, assigned just below (late binding).
                        def getItem(key):
                            if m(key).text() != '':
                                return m(key).text()
                            else:
                                return None
                        m = htmlRaw(member)
                        member_info = {}
                        member_info["name"] = getItem("PERNAME")
                        member_info["position"] = getItem("POSITION")
                        # member_info["position"] = ",".join(list(set(m.get("POSITION"))))
                        members.append(member_info)
                    gongshang["members"] = members
            changinfo = []
            if c["content"].has_key("A2") > 0:
                htmlRaw = pq(html.fromstring(c['content']['A2'].decode("utf-8")))
                item = htmlRaw('ALTER ITEM')
                if len(item) > 0:
                    for change in item:
                        # Closure over `change`, rebound just below.
                        def getItem(key):
                            if change(key).text() != '':
                                return change(key).text()
                            else:
                                return None
                        change = htmlRaw(change)
                        change_info = {}
                        change_info["changeTime"] = getItem("ALTDATE")
                        change_info["contentBefore"] = getItem("ALTBE")
                        change_info["contentAfter"] = getItem("ALTAF")
                        change_info["changeItem"] = getItem("ALTITEM")
                        changinfo.append(change_info)
                    gongshang["changeInfo"] = changinfo
            invests_new = []
            if c["content"].has_key("B7"):
                htmlRaw = pq(html.fromstring(c['content']['B7'].decode("utf-8")))
                item = htmlRaw('ENTINV ITEM')
                if len(item) > 0:
                    for i in item:
                        invest = htmlRaw(i)
                        if invest("ENTNAME") and invest("ENTNAME").text() != '':
                            invests_new.append(invest("ENTNAME").text())
                    gongshang["invests_new"] = invests_new
            parser_db_util.update_processed(c["_id"])
            # Only 'name' present means nothing was extracted.
            if len(gongshang) == 1:
                logger.info('no content:%s', c["key"])
            else:
                logger.info(json.dumps(gongshang, ensure_ascii=False,
                                       cls=util.CJsonEncoder))
                save_collection_goshang(collection_goshang, gongshang)
                save_collection_goshang_his(collection_goshang_history, gongshang)
            logger.info("processed %s", c["key"])
        if len(items) == 0:
            break
def process():
    """Parse xtecher company items: save company, names and artifacts."""
    logger.info("xtecher_company_parser begin...")
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        for item in items:
            r = parse_company(item)
            # parse_company() returns 0 when neither website nor company
            # name could be extracted.
            if r == 0:
                parser_db_util.update_processed(item["_id"])
                logger.info("missing website and companyName, processed %s",
                            item["url"])
                continue
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            # Names are rewritten from scratch on every parse.
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["name"], 12020)
            # 'fakeName' marks an unreliable full name; store it as an alias
            # (12020) instead of a registered name (12010).
            if r.has_key('fakeName'):
                parser_db_util.save_source_company_name(
                    source_company_id, r["fakeName"], 12020)
            else:
                parser_db_util.save_source_company_name(
                    source_company_id, r["fullName"], 12010)
                main_company_name = name_helper.get_main_company_name(
                    r["fullName"])
                if main_company_name != r["fullName"]:
                    parser_db_util.save_source_company_name(
                        source_company_id, main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)
            artifacts = parse_artifact(source_company_id, r)
            logger.info(
                json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id, artifacts)
            # parser_db_util.delete_funding(source_company_id)
            # flag=parseFinance_save(source_company_id,item, download_crawler)
            # Funding parsing is disabled; flag forced True.
            flag = True
            if flag:
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                logger.info("lack something: %s", item["url"])
            # break
        # start += 1000 # todo
        if len(items) == 0:
            break
    logger.info("xtecher_company_parser end.")
def process(sourceId=0):
    """Parse evervc company items: save company, names, artifacts, members.

    Args:
        sourceId: when > 0, process only that single source item and exit
            after one pass; otherwise drain the whole unprocessed queue.

    Fix: the alias-name condition evaluated `len(r["fullName"])` BEFORE the
    `fullName is None` check, raising TypeError on records with no full
    name — the None/empty guards now come first (short-circuit).
    """
    logger.info("evervc_company_parser begin...")
    start = 0
    while True:
        if sourceId > 0:
            items = [parser_db_util.find_process_one(SOURCE, TYPE, sourceId)]
        else:
            items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        for item in items:
            r = parse_company(item)
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            # Names are rewritten from scratch on every parse.
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["fullName"], 12010)
            # Store the short name as an alias when there is no usable full
            # name, or when the short name is genuinely shorter.
            if r['fullName'] is None or r["fullName"] == '' or len(
                    r["name"]) < len(r["fullName"]):
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
            main_company_name = name_helper.get_main_company_name(
                r["fullName"])
            if main_company_name != r["fullName"]:
                parser_db_util.save_source_company_name(
                    source_company_id, main_company_name, 12010)
            logger.info("source_company_id=%s", source_company_id)
            artifacts = parse_artifact(source_company_id, r)
            logger.info(
                json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
            parser_db_util.save_artifacts_standard(source_company_id, artifacts)
            parseMember_save(source_company_id, item, download_crawler)
            parser_db_util.delete_funding(source_company_id)  ##??
            flag = parseFinance_save(source_company_id, item, r['sourceId'],
                                     download_crawler)
            # parseFinance_save's verdict is deliberately overridden so the
            # item is always marked processed.
            flag = True
            if flag:
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s", item["url"])
            else:
                logger.info("lack something: %s", item["url"])
            # break
        # start += 1000 # todo
        if len(items) == 0 or sourceId > 0:
            break
    logger.info("evervc_company_parser end.")
def process():
    """Parse 36kr company items: save company, names, artifacts, members, fundings.

    Each item is wrapped in a broad try/except so one bad record cannot stop
    the run; the item is marked processed either way.
    """
    logger.info("36kr_company_parser begin...")
    start = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, start, 1000)
        # items = [parser_db_util.find_process_one(SOURCE,TYPE,83700389)]
        for item in items:
            try:
                r = parse_company(item)
                logger.info(
                    json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
                source_company_id = parser_db_util.save_company_standard(
                    r, download_crawler)
                # Names are rewritten from scratch on every parse.
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
                if r["fullName"] is not None:
                    parser_db_util.save_source_company_name(
                        source_company_id, r["fullName"], 12010)
                    main_company_name = name_helper.get_main_company_name(
                        r["fullName"])
                    if main_company_name != r["fullName"]:
                        parser_db_util.save_source_company_name(
                            source_company_id, main_company_name, 12010)
                logger.info("source_company_id=%s", source_company_id)
                artifacts = parse_artifact(source_company_id, item)
                logger.info(
                    json.dumps(artifacts, ensure_ascii=False,
                               cls=util.CJsonEncoder))
                # No full name, no description and no artifacts: deactivate
                # the source and skip saving anything further.
                if (r["fullName"] is None or r["fullName"].strip() == "") and (r['description'] is None or r['description'].strip() == "") \
                        and len(artifacts) == 0:
                    parser_db_util.update_active(SOURCE, item["key"], 'N')
                    parser_db_util.update_processed(item["_id"])
                    logger.info("missing all stuff, processed %s", item["url"])
                    continue
                parser_db_util.save_artifacts_standard(source_company_id,
                                                       artifacts)
                parseMember_save(source_company_id, item, download_crawler)
                # parser_db_util.delete_funding(source_company_id)
                flag = parseFinance_save(source_company_id, item,
                                         download_crawler)
                # NOTE(review): parseFinance_save's result is immediately
                # overwritten — its verdict is deliberately(?) ignored.
                flag = True
            except Exception, E:
                # Best-effort: log and fall through; the item is still
                # marked processed below.
                logger.info(E)
                pass
            # if flag:
            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s", item["url"])
            # else:
            #     logger.info("lack something: %s", item["url"])
            #break
        #break
        if len(items) == 0:
            break
def process():
    """Parse lagou company items: blacklist filter, then save company/names/artifacts/members/develop info."""
    logger.info("lagou_company_parser begin...")
    bnames = get_blacklist()
    while True:
        # Always offset 0: relies on update_processed() draining the queue.
        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, 1000)
        # items = [parser_db_util.find_process_one(SOURCE, TYPE, 109625)]
        for item in items:
            r = parse_company(item)
            #if r is None:
            #    continue
            # Any blacklist substring match downgrades the record to No_Name.
            if r.has_key("name") and r["name"].strip() != "":
                for bname in bnames:
                    if r["name"].find(bname) >= 0:
                        logger.info("黑名单")
                        r["status"] = "No_Name"
                        break
            if r["status"] == "No_Name":
                parser_db_util.update_active(SOURCE, item["key"], 'N')
                parser_db_util.update_processed(item["_id"])
                logger.info("processed %s with no data", item["url"])
                continue
            logger.info(
                json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder))
            source_company_id = parser_db_util.save_company_standard(
                r, download_crawler)
            logger.info("sourceCompanyId : %s", source_company_id)
            # Names are rewritten from scratch on every parse.
            parser_db_util.delete_source_company_name(source_company_id)
            parser_db_util.delete_source_mainbeianhao(source_company_id)
            if len(r["name"]) < len(r["fullName"]):
                parser_db_util.save_source_company_name(
                    source_company_id, r["name"], 12020)
            parser_db_util.save_source_company_name(source_company_id,
                                                    r["fullName"], 12010)
            artifacts = []
            artifacts.extend(r["artifacts"])
            logger.info(
                json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder))
            #artifact provided in lagou do not have any links, ignore that
            #artifacts = parse_artifact(source_company_id, item)
            parser_db_util.save_artifacts_standard(source_company_id, artifacts)
            parseMember_save(source_company_id, item)
            parserDevelop_save(source_company_id, item)
            # job = parser_db_util.find_process_one(SOURCE,36010, item["key_int"])
            # if job:
            #     source_jobs = lagou_job_parser.parse_companyjobs_save(source_company_id, job)
            #     if len(source_jobs) > 0:
            #         parser_db_util.save_jobs_standard(source_jobs)
            #     parser_db_util.update_processed(job["_id"])
            parser_db_util.update_processed(item["_id"])
            #exit()
        if len(items) == 0:
            break
        #break
    logger.info("lagou_company_parser end.")
def process():
    """Merge crawled gongshang (business-registry) payloads into Mongo.

    For each raw item, assemble a ``gongshang`` document from the
    registration info, shareholders, main managers and register-change
    history, upsert it into the gongshang collection (plus a history
    copy), and mark the raw item processed.
    """
    limit = 1000
    num = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, limit)
        for c in items:
            num += 1
            logger.info("%s: %s" % (num, c["key"]))
            content = c['content']
            gongshang = {'name': c["key"]}
            if 'getRegistInfo' in content and len(content['getRegistInfo']) > 0 \
                    and isinstance(content['getRegistInfo'], list):
                base = content['getRegistInfo'][0]
                # OPTO/OPFROM are kept as raw strings when they are not
                # plain %Y-%m-%d dates.
                if base.get("OPTO") is not None:
                    try:
                        toTime = datetime.datetime.strptime(base.get("OPTO"), '%Y-%m-%d')
                    except Exception:
                        toTime = base.get("OPTO")
                else:
                    toTime = None
                if base.get("OPFROM") is not None:
                    try:
                        fromTime = datetime.datetime.strptime(base.get("OPFROM"), '%Y-%m-%d')
                    except Exception:
                        fromTime = base.get("OPFROM")
                else:
                    fromTime = None
                gongshang.update({
                    "name": base["ENTNAME"],
                    "regCapital": base.get("REGCAP"),
                    "establishTime": datetime.datetime.strptime(base.get("ESDATE"), '%Y-%m-%d')
                                     if base.get("ESDATE") else None,
                    "regNumber": base.get("REGNO"),
                    "regStatus": base.get("ENTSTATUS"),
                    "fromTime": fromTime,
                    "toTime": toTime,
                    "businessScope": base.get("OPSCOPE"),
                    "regLocation": base.get("DOM"),
                    "companyOrgType": base.get("ENTTYPE"),
                    "legalPersonName": base.get("FRNAME"),
                })
            else:
                # No registration info in this crawl: only keep going if we
                # already stored gongshang data for this company earlier.
                record = collection_goshang.find_one({"name": c["key"]})
                if record is None:
                    logger.info(
                        "No gongshang data before for this missing registinfo company: %s",
                        c["key"])
                    parser_db_util.update_processed(c["_id"])
                    continue
            if 'getShareHolderInfo' in content and len(content['getShareHolderInfo']) > 0 \
                    and isinstance(content['getShareHolderInfo'], list):
                gongshang["investors"] = [
                    {"type": i.get("INVTYPE"), "name": i.get("SHANAME")}
                    for i in content["getShareHolderInfo"]
                ]
            if "getMainManagerInfo" in content and len(content['getMainManagerInfo']) > 0 \
                    and isinstance(content['getMainManagerInfo'], list):
                gongshang["members"] = [
                    {"name": m.get("NAME"), "position": m.get("POSITION")}
                    for m in content["getMainManagerInfo"]
                ]
            if "getRegisterChangeInfo" in content and len(content['getRegisterChangeInfo']) > 0 \
                    and isinstance(content['getRegisterChangeInfo'], list):
                gongshang["changeInfo"] = [
                    {
                        "changeTime": change.get("ALTDATE"),
                        "contentBefore": change.get("ALTBE"),
                        "contentAfter": change.get("ALTAF"),
                        "changeItem": change.get("ALTITEM"),
                    }
                    for change in content["getRegisterChangeInfo"]
                ]
            if len(gongshang) == 1:
                # Nothing beyond the company name was extracted.
                logger.info('no content:%s', c["key"])
            else:
                logger.info(json.dumps(gongshang, ensure_ascii=False, cls=util.CJsonEncoder))
                save_collection_goshang(collection_goshang, gongshang)
                save_collection_goshang_his(collection_goshang_history, gongshang)
            parser_db_util.update_processed(c["_id"])
            logger.info("processed %s", c["key"])
        if len(items) == 0:
            break
def process():
    """Parse crawled crunchbase company pages in batches of 500.

    Saves the standard company record, its short name, artifacts, members
    and fundings.  Failures are logged with traceback and the item is
    still marked processed (best-effort, original behavior).
    """
    logger.info('crunchbase_company_parser begin ...')
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, 0, 500)
        for item in items:
            if item is None:
                continue
            try:
                r = parse_company(item)
                logger.info(json.dumps(r, ensure_ascii=False, cls=util.CJsonEncoder, indent=2))
                # source_company (2010 running)
                source_company_id = parser_db_util.save_company_standard(r, download_crawler)
                logger.info('%s:%s' % (item['name'], source_company_id))
                parser_db_util.delete_source_company_name(source_company_id)
                parser_db_util.delete_source_mainbeianhao(source_company_id)
                # source_company_name (12020 shortname)
                parser_db_util.save_source_company_name(source_company_id, r["name"], 12020)
                artifacts = parse_artifact(source_company_id, item)
                logger.info(json.dumps(artifacts, ensure_ascii=False, cls=util.CJsonEncoder, indent=2))
                # Deactivate shell records that carry no full name, no
                # description and no artifacts at all.
                if (r["fullName"] is None or r["fullName"].strip() == "") and \
                   (r['description'] is None or r['description'].strip() == "") and \
                   len(artifacts) == 0:
                    parser_db_util.update_active(SOURCE, item["key"], 'N')
                    parser_db_util.update_processed(item["_id"])
                    logger.info("missing all stuff, processed %s", item["url"])
                    continue
                # source_artifact (4010 website)
                parser_db_util.save_artifacts_standard(source_company_id, artifacts)
                # source_member and source_company_member_rel (5010 ceo)
                parseMember_save(source_company_id, item, download_crawler)
                parser_db_util.delete_funding(source_company_id)
                # source_funding and source_funding_investor_rel (10020 vc)
                parseFinance_save(source_company_id, item, download_crawler)
            except Exception as E:
                # Best-effort: log with traceback, then still mark processed.
                logger.exception(E)
            parser_db_util.update_processed(item["_id"])
            logger.info("processed %s" % item["url"])
        if len(items) == 0:
            break
    logger.info('parser end.')
    return
def process():
    """Parse crawled NEEQ (新三板) company payloads into Mongo.

    Normalizes the stamp / listing / holder dates, builds the stock-detail
    URL, runs parseContent, upserts the resulting document and marks each
    raw item processed.
    """
    skip = 0
    limit = 1000
    num = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, skip, limit)
        for c in items:
            num += 1
            logger.info("%s: %s" % (num, c["key"]))
            # Guard against unusable content BEFORE touching its keys: the
            # original performed the baseinfo/bbseinfo fallback first and
            # crashed whenever content was None.
            if c["content"] is None or ('ret' in c['content'] and c['content']['ret'] == -999):
                logger.info('%s content is None', c["key"])
                parser_db_util.update_processed(c["_id"])
                continue
            if "baseinfo" not in c['content']:
                # Some crawls misspelled the key as "bbseinfo".
                c['content']["baseinfo"] = c['content']["bbseinfo"]
            if "name" not in c['content']["baseinfo"]:
                logger.info('%s missing fullName', c["key"])
                parser_db_util.update_processed(c["_id"])
                continue
            content = c['content']
            stockwebsite = 'http://www.neeq.com.cn/nq/detailcompany.html?companyCode=%s&typeId=1&typename=G' % c['key']
            # Drop fractional seconds before parsing the crawl stamp.
            content['stamp'] = datetime.datetime.strptime(
                re.sub(r'\..+', '', content['stamp']), '%Y-%m-%d %H:%M:%S')
            content['baseinfo']['listingDate'] = datetime.datetime.strptime(
                content['baseinfo']['listingDate'], '%Y%m%d')
            if 'topTenHolders' in content:
                # enumerate instead of list.index(): index() is O(n) per
                # holder and returns the FIRST equal dict, which mis-updates
                # entries when two holders compare equal.
                for idx, holder in enumerate(content['topTenHolders']):
                    content['topTenHolders'][idx]['date'] = \
                        datetime.datetime.strptime(holder['date'], '%Y-%m-%d')
            parserContent = {
                "source": SOURCE,
                "sourceId": int(c['key']),
                "stockwebsite": stockwebsite,
                "name": content['baseinfo']['name'],
                "listingDate": content['baseinfo']['listingDate'],
            }
            content = parseContent(content)
            if 'website' in content['baseinfo']:
                parserContent["website"] = content['baseinfo']['website']
            parserContent.update(content)
            logger.info(json.dumps(parserContent, ensure_ascii=False, cls=util.CJsonEncoder))
            save_collection(collection, parserContent)
            parser_db_util.update_processed(c["_id"])
            logger.info("processed %s", c["key"])
        if len(items) == 0:
            logger.info('no more new item')
            break
def process():
    """Parse crawled SSE (上交所) listed-company payloads into Mongo.

    Normalizes listing/executive dates, builds the stock-detail URL, runs
    parseContent, upserts the resulting document and marks each raw item
    processed.
    """
    skip = 0
    limit = 1000
    num = 0
    while True:
        items = parser_db_util.find_process_limit(SOURCE, TYPE, skip, limit)
        for c in items:
            num += 1
            # NOTE(review): assumes baseinfo is a non-empty list whenever
            # content is not None — an empty list would IndexError here.
            if c["content"] is None or "FULLNAME" not in c["content"]['baseinfo'][0]:
                logger.info('%s content is None', c["key"])
                parser_db_util.update_processed(c["_id"])
                continue
            logger.info("%s: %s" % (num, c["key"]))
            content = c['content']
            stockwebsite = 'http://www.sse.com.cn/assortment/stock/list/info/company/index.shtml?COMPANY_CODE=%s' % c['key']
            # Pop listingDate unconditionally: the original only popped it
            # when non-empty, so an empty raw list survived in content and
            # later clobbered the intended None via parserContent.update().
            raw_listing = content.pop('listingDate')
            listingDate = datetime.datetime.strptime(
                raw_listing[0]['LISTINGDATEA'], '%Y-%m-%d') if len(raw_listing) > 0 else None
            parserContent = {
                "source": SOURCE,
                "sourceId": int(c['key']),
                "stockwebsite": stockwebsite,
                "website": content['baseinfo'][0].get('WWW_ADDRESS', None),
                'listingDate': listingDate,
            }
            if 'executives' in content:
                # enumerate instead of list.index(): avoids O(n^2) scans and
                # mis-updates when two executive dicts compare equal.
                for idx, executive in enumerate(content['executives']):
                    content['executives'][idx]['START_TIME'] = \
                        datetime.datetime.strptime(executive['START_TIME'], '%Y-%m-%d')
            content = parseContent(content)
            content['baseinfo'] = content['baseinfo'][0]
            content['name'] = content['baseinfo']['FULLNAME']
            content['baseinfo']['shortname'] = content['baseinfo'].pop('COMPANY_ABBR')
            parserContent.update(content)
            logger.info(json.dumps(parserContent, ensure_ascii=False, cls=util.CJsonEncoder))
            save_collection(collection, parserContent)
            parser_db_util.update_processed(c["_id"])
            logger.info("processed %s", c["key"])
        if len(items) == 0:
            logger.info("no more items")
            break