def decompose(company_id, hard=True):
    conn = db.connect_torndb()
    company = conn.get("select * from company where id=%s", company_id)
    scs = list(conn.query(
        "select * from source_company "
        "where (active is null or active='Y') "
        "and (source is not null and source != 13002 and (source < 13100 or source >= 13110)) "
        "and companyStatus!=2020 and companyId=%s "
        "order by source",
        company_id))
    conn.close()

    if len(scs) < 2:
        logger.info("Company: %s has only one active source company, no need to decompose", company_id)
        return True

    fullName = company["fullName"]
    name = company["name"]
    description = company["description"]

    # init crawlers
    beian_links_crawler = beian_links.BeianLinksCrawler()
    icp_chinaz_crawler = icp_chinaz.IcpchinazCrawler()
    screenshot_crawler = screenshot_website.phantomjsScreenshot()

    # expand and re-aggregate every active source company
    for sc in scs:
        company_info_expand.expand_source_company(sc["id"], beian_links_crawler,
                                                  icp_chinaz_crawler, screenshot_crawler)
        company_aggregator_dev.aggregator(sc)
    return True
def decompose(company_id, hard=True):
    conn = db.connect_torndb()
    company = conn.get("select * from company where id=%s", company_id)
    scs = list(conn.query(
        "select * from source_company "
        "where (active is null or active='Y') "
        "and (source is not null and source != 13002 and (source < 13100 or source >= 13110)) "
        "and companyStatus!=2020 and companyId=%s "
        "order by source",
        company_id))
    conn.close()

    if len(scs) < 2:
        logger.info("Company: %s has only one active source company, no need to decompose", company_id)
        return True

    fullName = company["fullName"]
    name = company["name"]
    description = company["description"]

    # init crawlers
    beian_links_crawler = beian_links.BeianLinksCrawler()
    icp_chinaz_crawler = icp_chinaz.IcpchinazCrawler()
    screenshot_crawler = screenshot_website.phantomjsScreenshot()

    # pick the source company whose name matches the company name; fall back to the first one
    reserve_sc = None
    for sc in scs:
        logger.info("source company: %s, source: %s, sourceId: %s", sc["id"], sc["source"], sc["sourceId"])
        if sc["name"].strip() != "" and sc["name"] == name:
            # logger.info("Reserve source company: %s, %s for company: %s, %s",
            #             sc["id"], sc["name"], company["id"], company["name"])
            reserve_sc = sc
            break
            # update_column(company, sc)
            # delete_old_data(company_id)
            # company_info_expand.expand_source_company(sc["id"], beian_links_crawler, icp_chinaz_crawler, screenshot_crawler)
            # set_processStatus_zero(company_id, sc["id"])
            # company_aggregator.aggregator(sc)
            # return True

    # # Must find one sc for decompose
    # # if no source_company can match company
    # sc_ids = [str(sc["id"]) for sc in scs if sc.has_key("id")]
    # logger.info("Can not locate source companys (%s) for company: %s", sc_ids, company_id)
    # return False

    if reserve_sc is None:
        reserve_sc = scs[0]

    logger.info("Reserve source company: %s, %s for company: %s, %s",
                reserve_sc["id"], reserve_sc["name"], company["id"], company["name"])
    update_column(company, reserve_sc)
    delete_old_data(company_id)
    company_info_expand.expand_source_company(reserve_sc["id"], beian_links_crawler,
                                              icp_chinaz_crawler, screenshot_crawler)
    set_processStatus_zero(company_id, reserve_sc["id"], hard)
    for sc in scs:
        set_funding_processStatus(sc["id"])
    company_aggregator.aggregator(reserve_sc)
    return True
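# Hedged usage sketch (assumption, not part of the original pipeline): decompose()
# takes a single company id, so a batch run is just a loop over pending ids. The
# parameter name `company_ids` is hypothetical; how ids are selected is not shown here.
def decompose_batch(company_ids, hard=True):
    for company_id in company_ids:
        try:
            decompose(company_id, hard=hard)
        except Exception:
            # keep going on per-company failures; decompose() logs its own progress
            logger.exception("decompose failed for company %s", company_id)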
def expand():
    # init crawlers
    beian_links_crawler = beian_links.BeianLinksCrawler()
    icp_chinaz_crawler = icp_chinaz.IcpchinazCrawler()
    screenshot_crawler = screenshot_website.phantomjsScreenshot()
    download_crawler_itjuzi = download.DownloadCrawler(max_crawl=200, timeout=10)
    download_crawler_kr36 = download.DownloadCrawler(use_proxy=False)
    download_crawler_lagou = download.DownloadCrawler(use_proxy=True)
    download_crawler = download.DownloadCrawler()

    while True:
        # gevent -> list of source_companies
        if len(COMPANIES) == 0:
            return
        sc = COMPANIES.pop(0)
        source = sc["source"]
        sourceId = sc["sourceId"]

        # company_info_expand_mongo.expand_source_company(source, sourceId, beian_links_crawler, icp_chinaz_crawler, screenshot_crawler)

        # pick the source-specific download crawler
        if source == 13030:
            diff_sourceCompanyId = check_expand_diff.check_diff(source, sourceId, download_crawler_itjuzi)
        elif source == 13020:
            diff_sourceCompanyId = check_expand_diff.check_diff(source, sourceId, download_crawler_kr36)
        elif source == 13050:
            diff_sourceCompanyId = check_expand_diff.check_diff(source, sourceId, download_crawler_lagou)
        else:
            diff_sourceCompanyId = check_expand_diff.check_diff(source, sourceId, download_crawler)
        logger.info("Source: %s, sourceId: %s, Diff: %s", source, sourceId, diff_sourceCompanyId)

        # Set processStatus in mysql and mongo
        mongo = db.connect_mongo()
        collection_source_company = mongo.source.company
        collection_source_company.update_one(
            {"source": source, "sourceId": sourceId},
            {"$set": {"processStatus": 1}})
        mongo.close()

        if diff_sourceCompanyId is not None:
            # # Set recommendIds
            # # insert audit_source_company
            # parser_mysql_util.insert_audit_source_company(diff_sourceCompanyId)
            # parser_mysql_util.update_db_processStatus(source, sourceId, 1)
            pass
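# Hedged sketch of a driver for expand() (an assumption, not the original entry point):
# the "gevent" note inside expand() suggests COMPANIES is a shared module-level list
# consumed by several cooperative workers. `pending_source_companies` is a hypothetical
# argument; how COMPANIES is really populated is not shown in this file, and real
# concurrency would additionally need gevent.monkey patching of blocking I/O.
def run_expand_workers(pending_source_companies, worker_count=5):
    import gevent  # assumed available, per the gevent note inside expand()
    COMPANIES.extend(pending_source_companies)
    # each worker pops from COMPANIES until the list is empty, then returns
    gevent.joinall([gevent.spawn(expand) for _ in range(worker_count)])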
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../crawler/screenshot'))
import screenshot_website

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../corporate'))
import company_info_expand
# import company_aggregator
import company_aggregator_new
import company_aggregator_baseinfo
import corporate_aggregator
import company_replacement

beian_links_crawler = beian_links.BeianLinksCrawler()
icp_chinaz_crawler = icp_chinaz.IcpchinazCrawler()
screenshot_crawler = screenshot_website.phantomjsScreenshot()

# logger
loghelper.init_logger("corporate_util", stream=True)
logger = loghelper.get_logger("corporate_util")


def insert_company(name, fullName, aliases):
    conn = db.connect_torndb()
    sql = "insert company(code,name,fullName,createTime,modifyTime,active) " \
          "values(%s,%s,%s,now(),now(),'P')"
    code = company_aggregator_baseinfo.get_company_code(name)