def patch_company_location(company_id):
    """Fill in a corporate's missing ``locationId`` from its names.

    Loads the ``company`` row for ``company_id``. When that row references a
    corporate whose ``locationId`` is NULL or 0, the corporate's full name
    (if any) and then each alias that is active (or has no active flag) and
    verified are passed to ``name_helper.get_location_from_company_name``.
    The first extracted place name that has a row in ``location`` supplies
    the new ``locationId``, which is written back to ``corporate``.

    The connection is closed in a ``finally`` clause so that a failing query
    or helper call no longer leaks it.

    :param company_id: id of the ``company`` row to inspect.
    :return: None
    """
    conn = db.connect_torndb_proxy()
    try:
        # NOTE(review): a missing company row presumably makes conn.get
        # return None, in which case the subscript below raises TypeError
        # exactly as before -- confirm whether callers rely on that.
        company1 = conn.get("select * from company where id=%s", company_id)
        if company1["corporateId"] is None:
            return

        corporate = conn.get("select * from corporate where id=%s",
                             company1["corporateId"])
        if corporate is None:
            return
        if corporate["locationId"] is not None and corporate["locationId"] != 0:
            # Location already known; nothing to patch.
            return

        # The full name is tried before the aliases.
        if corporate["fullName"] is not None:
            alias0 = [{"name": corporate["fullName"]}]
        else:
            alias0 = []
        aliases = conn.query(
            "select * from corporate_alias where corporateId=%s and "
            "(active is null or active ='Y') and verify='Y'",
            company1["corporateId"])

        locationId = None
        for alias in alias0 + aliases:
            logger.info(alias["name"])
            loc1, loc2 = name_helper.get_location_from_company_name(
                alias["name"])
            logger.info("%s/%s", loc1, loc2)
            if loc1 is None:
                continue
            location = conn.get(
                "select *from location where locationName=%s", loc1)
            if location:
                locationId = location["locationId"]
                break

        if locationId is not None:
            conn.update("update corporate set locationId=%s where id=%s",
                        locationId, company1["corporateId"])
    finally:
        conn.close()
def patch_company_location(company_id):
    """Fill in a company's missing ``locationId``.

    Only acts when the ``company`` row's ``locationId`` is NULL or 0. Two
    sources are tried in order:

    1. the first ``source_company`` row of this company (active or with no
       active flag) that carries a positive ``locationId``;
    2. otherwise, the first ``company_alias`` row with ``type=12010`` whose
       name yields a place (via
       ``name_helper.get_location_from_company_name``) that has a row in
       ``location``.

    The connection is closed in a ``finally`` clause so that a failing query
    or helper call no longer leaks it.

    :param company_id: id of the ``company`` row to patch.
    :return: None
    """
    conn = db.connect_torndb()
    try:
        # NOTE(review): a missing company row presumably makes conn.get
        # return None, in which case the subscript below raises TypeError
        # exactly as before -- confirm whether callers rely on that.
        company1 = conn.get("select * from company where id=%s", company_id)
        if company1["locationId"] is not None and company1["locationId"] != 0:
            # Location already known; nothing to patch.
            return

        locationId = None

        # First choice: a location already recorded on a source company.
        scs = conn.query(
            "select * from source_company where companyId=%s and (active is null or active='Y')",
            company_id)
        for sc in scs:
            if sc["locationId"] is not None and sc["locationId"] > 0:
                locationId = sc["locationId"]
                break

        # Fallback: derive the location from the alias names.
        if locationId is None:
            aliases = conn.query(
                "select * from company_alias where companyId=%s and type=12010",
                company_id)
            for alias in aliases:
                loc1, _loc2 = name_helper.get_location_from_company_name(
                    alias["name"])
                if loc1 is None:
                    continue
                location = conn.get(
                    "select *from location where locationName=%s", loc1)
                if location:
                    locationId = location["locationId"]
                    break

        if locationId is not None:
            conn.update("update company set locationId=%s where id=%s",
                        locationId, company1["id"])
    finally:
        conn.close()
def parse_company(item):
    """Parse one crawled company page into a source-company dict.

    :param item: mapping with ``content`` (UTF-8 encoded HTML) and ``key``
        (stored as ``sourceId`` in the result).
    :return: dict of company fields, or ``0`` when the page's company name
        fails ``name_helper.name_check(...)[1]`` and no official-website link
        was found on the page.
    :raises ValueError: from ``strptime`` when the founding date is not in
        ``%Y-%m-%d`` form.
    """
    d = pq(html.fromstring(item['content'].decode("utf-8")))
    company_key = item["key"]

    # Tags: whitespace separated, de-duplicated, original order kept.
    tags = []
    for tag in d('.word_list').text().split():
        if tag not in tags:
            tags.append(tag)
    tags_str = ",".join(tags)

    logo = d('.peoimg img').attr('src')
    if logo:
        logo = logo.replace("https://", "http://")

    # The last timeline entry carries the founding date when its caption
    # contains the founding marker. `.find(...) >= 0` is a containment test;
    # the previous `> 0` missed a caption that *starts* with the marker.
    establish_date = None
    time_content = d('.time_content li:last-child')
    if d(time_content)('.upword').text().find('成立') >= 0:
        establish_date = datetime.datetime.strptime(
            d(time_content)('.time_up').text(), '%Y-%m-%d')

    companyName = d('.company_div h5').text()

    # Resolve the city embedded in the company name to a location id
    # (0 when no city is found or the city has no location row).
    location_id = 0
    city = name_helper.get_location_from_company_name(companyName)[0]
    if city is not None:
        location = parser_db_util.get_location(city)
        if location is not None:
            location_id = location["locationId"]

    fullName = name_helper.company_name_normalize(companyName.replace("_", ""))

    # Product paragraphs: one that mentions the official-website marker and
    # has a link supplies the website; every other paragraph is appended to
    # the product description.
    desc = d('#intro_srocll p').text()
    website = ''
    product_parts = []
    for p in d('.procont_lis p'):
        paragraph = d(p)
        href = paragraph('a').attr('href')
        # Same `> 0` -> `>= 0` containment fix as above.
        if paragraph.text().find('官网') >= 0 and href is not None:
            website = href
            continue
        product_parts.append(paragraph.text() + '\n')
    productDesc = ''.join(product_parts)
    if desc == '' or desc is None:
        desc = productDesc

    # Keep only the part of the headline before the first separator.
    shortName = d('.peo_center h4').text().split(':')[0].split(':')[0].split(
        '——')[0].split(',')[0].split('|')[0]

    companyResult = {}
    # Explicit ==True / ==False comparisons are kept on purpose: the exact
    # return type of name_check is not visible here.
    if name_helper.name_check(companyName)[1] == True:
        if name_helper.name_check(shortName)[0] == False:
            # Left unchanged (the original comment labels this case
            # "English name").
            pass
        else:
            # A short name sharing at most two characters with the company
            # name is replaced by the company name.
            shared = sum(1 for ch in shortName if ch in companyName)
            if not shared > 2:
                shortName = companyName
    else:
        if len(website) == 0:
            # Name check failed and no website: caller gets 0.
            return 0
        companyResult['fakeName'] = fullName
        fullName = None

    companyResult.update({
        "name": shortName,
        "fullName": fullName,
        "description": desc,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "brief": None,
        "website": website,
    })
    return companyResult
def parse_company(item):
    """Parse one crawled portfolio page into a source-company dict.

    :param item: mapping with ``content`` (UTF-8 encoded HTML) and ``key``
        (stored as ``sourceId`` in the result).
    :return: dict of company fields. ``logo`` is ``None`` when the photo
        element has no ``src``; ``locationId`` is 0 when no location could
        be resolved.
    """
    d = pq(html.fromstring(item['content'].decode("utf-8")))
    company_key = item["key"]

    # Tags: whitespace separated, de-duplicated, original order kept.
    tags = []
    for raw_tag in d('.portfolio-user-tag .label').text().split():
        tag = raw_tag.strip()
        if tag not in tags:
            tags.append(tag)
    tags_str = ",".join(tags)

    # Guard on the raw attribute: prefixing first ('http:' + None) raised
    # TypeError for a missing src and made the emptiness check meaningless.
    logo = None
    logo_src = d('.portfolio-user-photo img').attr('src')
    if logo_src:
        logo = 'http:' + logo_src
        logo = logo.replace("https://", "http://")
        logo = logo.replace("@!msgs", "")

    companyName = d('.corp-name').text()

    # Location: first token of the tag line, falling back to a city found
    # in the company name when that token has no location row.
    location_id = 0
    city = d('.portfolio-user-tag').text().split(' ')[0]
    location = parser_db_util.get_location(city)
    if location is None:
        city = name_helper.get_location_from_company_name(companyName)[0]
        if city is not None:
            location = parser_db_util.get_location(city)
    if location is not None:
        location_id = location["locationId"]

    fullName = name_helper.company_name_normalize(companyName.replace("_", ""))

    desc = d('.portfolio-user-bio .text').text()
    productDesc = d('.portfolio-text').text()
    website = d('.user-contact a').text()
    # Use the product text when there is no bio.
    if desc == '' or desc is None:
        desc = productDesc
    shortName = d('.portfolio-user-info h1').text()

    companyResult = {
        "name": shortName,
        "fullName": fullName,
        "description": desc,
        "productDesc": productDesc,
        "round": 0,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': 0,
        "locationId": location_id,
        "address": None,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": None,
        "subField": None,
        "tags": tags_str,
        "headCountMin": None,
        "headCountMax": None,
        "brief": None,
        "website": website,
    }
    return companyResult
def parse_base(item):
    """Parse the base information of one crawled company page.

    :param item: mapping with ``key`` (stored as ``sourceId``) and
        ``content`` (page HTML handed to ``pq``), or ``None``.
    :return: ``None`` when ``item`` is ``None`` or when neither a short name
        nor a full name could be extracted; otherwise a dict with the
        company fields plus an ``artifacts`` list built from the page's
        links.
    :raises ValueError: from ``strptime`` when the founding-date text yields
        a year that cannot be parsed.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    content = item["content"]
    d = pq(content)

    # --- names -----------------------------------------------------------
    # The title reads "<product>/<company short name>"; text of child
    # elements is dropped before reading it.
    company_short_name = ""
    product_name = d('div.line-title> span> h1').clone().children().remove().end().text().strip()
    if product_name.strip() == "":
        product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
    temps = product_name.split("/", 1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)

    company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:", "")
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""
    if company_name.strip() == "":
        # Best effort: fall back to the <h2>; keep the empty name on failure.
        try:
            company_name = d('div.des-more> h2').text().strip()
        except Exception:
            pass
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""
    company_name = name_helper.company_name_normalize(company_name)
    logger.info("company name: %s" % company_name)

    if company_short_name == "" and company_name == "":
        return None

    # --- founding date ("<year>.<month>") ---------------------------------
    establish_date = None
    text = d('div.des-more> div').eq(1).text().strip().replace("成立时间:", "")
    result = util.re_get_result(r'(\d*)\.(\d*)', text)
    if result is not None:
        (year, month) = result
        # An unparsable or out-of-range month falls back to January.
        try:
            if int(month) > 12:
                month = "1"
        except (TypeError, ValueError):
            month = "1"
        establish_date = datetime.datetime.strptime("%s-%s-1" % (year, month), '%Y-%m-%d')
    logger.info("establish date: %s" % establish_date)

    # --- location: city, then province, then the company name -------------
    locationId = 0
    text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))
        result = parser_db_util.get_location(city)
        if result is not None:
            locationId = result["locationId"]
        else:
            result = parser_db_util.get_location(province)
            if result is not None:
                locationId = result["locationId"]
    if locationId == 0:
        loc1, _loc2 = name_helper.get_location_from_company_name(company_name)
        if loc1 is not None:
            result = parser_db_util.get_location(loc1)
            if result is not None:
                locationId = result["locationId"]
    logger.info("locationId: %d" % locationId)

    # --- status and funding -----------------------------------------------
    company_status = 2010
    text = d('div.des-more> div').eq(2).text().strip()
    if text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    funding_type = 0
    text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % text)
    # NOTE(review): both labels map to 8020 in the original code; confirm
    # that the second one is not meant to use a different code.
    if text == "融资需求 · 需要融资":
        funding_type = 8020
    elif text == "融资需求 · 寻求收购":
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)

    # --- descriptive fields -----------------------------------------------
    try:
        brief = d("h2.seo-slogan").text().strip()
    except Exception:
        brief = ""
    logger.info("brief: %s" % brief)
    # Blank the brief only when it contains the placeholder. The previous
    # truthiness test on find() was inverted: -1 ("not found") is truthy,
    # so almost every real brief was discarded.
    if brief.find("暂未收录") >= 0:
        brief = ""

    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s" % field)
    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)
    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ", ",")
    logger.info("tags: %s" % tags)

    desc = (d("div.des,div.desc,div.introduction,div.abstract,div.summary").text()
            .replace("购买数据请联系", "")
            .replace('*****@*****.**', "")
            .replace("itjuzi是一家数据服务公司", "")
            .strip())
    logger.info("********desc: %s" % desc)

    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    # --- artifacts built from the page's links ----------------------------
    # NOTE(review): the original ran three passes and the 2nd and 3rd used
    # the same selector, so links matched by it are recorded twice. Kept
    # as-is to preserve output; confirm whether the duplicate is intended.
    link_selectors = (
        'div.link-line> a',
        'div.link-line> span.weblink,span.webTink> a',
        'div.link-line> span.weblink,span.webTink> a',
    )
    placeholder_urls = ("http://%e6%9a%82%e6%97%a0", "http://tt")

    artifacts = []
    for selector in link_selectors:
        for wa in d(selector):
            # Each anchor offers two candidate URLs: its href and its text.
            webs = []
            for source in ("href", "text"):
                try:
                    anchor = pq(wa)
                    if source == "href":
                        website = anchor.attr("href").strip()
                    else:
                        website = anchor.text().strip()
                    if website in placeholder_urls:
                        website = ""
                    website = url_helper.url_normalize(website)
                    logger.info("website: %s" % website)
                    webs.append(website)
                except Exception:
                    # Best effort: a missing attribute or a URL that cannot
                    # be normalised just skips this candidate.
                    pass

            for website in webs:
                link_type, app_market, app_id = url_helper.get_market(website)
                if link_type == 4010:
                    flag, domain = url_helper.get_domain(website)
                    if flag is not None:
                        if flag is False:
                            domain = None
                        artifacts.append({
                            "type": 4010,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain,
                        })
                elif link_type == 4020:
                    if app_id is not None:
                        # The link itself is stored as the domain here.
                        artifacts.append({
                            "type": 4020,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": website,
                        })
                elif link_type == 4030:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4030,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": None,
                        })
                elif link_type == 4040:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4040,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": app_id,
                        })
                elif link_type == 4050:
                    # Markets 16010/16020 are resolved to the stored
                    # apkname; an unknown app leaves the domain unset and
                    # the link is skipped.
                    domain = None
                    if app_market == 16010 or app_market == 16020:
                        android_app = parser_db_util.find_android_market(app_market, app_id)
                        if android_app:
                            domain = android_app["apkname"]
                    else:
                        domain = app_id
                    if domain is not None:
                        artifacts.append({
                            "type": 4050,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain,
                        })

    # --- funding round ----------------------------------------------------
    roundStr = d('span.t-small.c-green').text().replace("(", "").replace(")", "").replace("获投状态:", "").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)
    logger.info("")

    return {
        "shortName": company_short_name,
        "fullName": company_name if company_name is not None and company_name.strip() != "" else None,
        "productName": product_name,
        "description": desc,
        "brief": brief,
        "round": fundingRound,
        "roundDesc": roundStr,
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "type": 41010,
        "artifacts": artifacts,
    }