def get_android_domain(app_market, app_id):
    """Return the artifact domain for an Android app link.

    For markets 16010 and 16020 the domain is the ``apkname`` field of the
    record returned by ``parser_mongo_util.find_android_market``, or ``None``
    when no record is found. For any other market ``app_id`` is returned
    unchanged.
    """
    if app_market not in (16010, 16020):
        return app_id

    record = parser_mongo_util.find_android_market(app_market, app_id)
    if not record:
        return None
    return record["apkname"]
def parse_company(item):
    """Parse a crawled zhipin.com company page into a source-company record.

    Args:
        item: Crawl record, or ``None``. The keys read here are ``"key"``
            (used as ``sourceId`` and in ``sourceUrl``), ``"content"``
            (UTF-8 encoded page HTML) and ``"url"`` (compared against product
            links so the page does not list itself as an artifact).

    Returns:
        ``None`` when ``item`` is ``None``; ``{"status": "No_Name"}`` when the
        short or full company name is empty or the full name fails
        ``name_helper.name_check``; otherwise a dict describing the company,
        including its ``"artifacts"`` and ``"members"`` lists.
    """
    if item is None:
        logger.info("here")
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    logger.info(company_key)
    d = pq(html.fromstring(item["content"].decode("utf-8")))

    name = d('h1.name').text().strip()
    fullName = d('div.company-business> h4').text()
    if fullName.find("来源") >= 0:
        fullName = fullName.split(" ")[-1]
    fullName = name_helper.company_name_normalize(fullName)

    if not name or not fullName:
        logger.info("here1: %s", name)
        return {
            "status": "No_Name",
        }

    # Prefer the shorter of the two names as the display name.
    if len(name) > len(fullName):
        name = fullName
    if name.strip() == "":
        name = fullName

    _, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        logger.info("here")
        return {
            "status": "No_Name",
        }

    # A logo is kept only when it is an absolute http(s) URL or a "default"
    # image path; a missing <img src> simply yields no logo.
    logo = d('div.company-logo> img').attr('src')
    if (logo is not None and not logo.startswith("http")
            and logo.find("default") < 0):
        logo = None

    brief = None
    desc_text = d('div.job-sec> div.text').text()
    logger.info("desc: %s", desc_text)
    if u"该公司尚未添加公司介绍" in desc_text or desc_text == "" or len(desc_text) < 5:
        desc = None
    else:
        desc = desc_text.replace('公司简介:',"").replace("收起","").replace("展开","").replace(" ","").strip()

    # The summary line is expected to read "<stage> <head count> <field>".
    field = ''
    stage = ''
    headCount = ''
    address = ''
    try:
        parts = d('div.info-primary> p').text().strip().split(" ")
        if len(parts) == 3:
            stage, headCount, field = parts
    except Exception:
        # Best effort: keep the empty defaults when the line is missing.
        logger.info("company summary line could not be parsed", exc_info=True)

    headCount = headCount.replace("人", "")
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            # NOTE(review): range bounds are passed through as strings while
            # the single-value case below yields an int - confirm consumers
            # accept both.
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            max_staff = None
            try:
                min_staff = int(staffarr[0].strip())
            except ValueError:
                min_staff = None

    # NOTE(review): the stage label only influences fundingType; the emitted
    # "stage" field is the constant 0 - confirm that is intended.
    funding_type = 8010 if stage == '不需要融资' else 0

    artifacts = []
    for anchor in d('div.company-products> ul> li> div.text> div.name> a'):
        link = pq(anchor)('a').attr("href")
        website = url_helper.url_normalize(link)
        logger.info("website: %s" % website)
        artifact_type, app_market, app_id = url_helper.get_market(website)

        if artifact_type == 4010:
            # Skip links pointing back at the crawled page or at zhipin.
            if (website is None or item["url"] == website
                    or website.find("zhipin") != -1):
                continue
            flag, domain = url_helper.get_domain(website)
            if flag is None:
                continue
            if flag is False:
                domain = None
        elif artifact_type == 4040:
            domain = app_id
            if domain is None:
                continue
        elif artifact_type == 4050:
            domain = None
            if app_market == 16010 or app_market == 16020:
                android_app = parser_mongo_util.find_android_market(app_market, app_id)
                if android_app:
                    domain = android_app["apkname"]
            else:
                domain = app_id
            if domain is None:
                continue
        else:
            # Every other type (including 4020/4030) produces no artifact.
            continue

        artifacts.append({
            "type": artifact_type,
            "name": name,
            "description": None,
            "link": website,
            "domain": domain
        })

    # Parse team members.
    members = []
    for li in d('div.manager-list> div> ul >li> div'):
        mem = pq(li)
        try:
            photo_url = mem('div.info-user> img').attr('src')
            if photo_url is None:
                # A member without a photo is skipped.
                continue
            if not photo_url.startswith("http"):
                # Protocol-relative URL ("//host/path").
                photo_url = "http:" + photo_url
            members.append({
                'name': mem('p> span.name').text(),
                'photo_url': photo_url,
                'weibo': None,
                'location': None,
                'role': mem('p> span.job-title').text(),
                'description': mem('div.item_manager_content').text(),
                'education': None,
                'work': None
            })
        except Exception:
            logger.info("skipping member that could not be parsed", exc_info=True)

    # The first tab link, when it points at a "gongsi" page, carries a
    # second identifier: the file name without ".html".
    sourceId2link = d('div.company-tab> a').eq(0).attr("href")
    if sourceId2link is not None and sourceId2link.find("gongsi") >= 0:
        sourceId2 = sourceId2link.split("/")[-1].replace(".html","")
    else:
        sourceId2 = None

    source_company = {
        "name": name,
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": int(0),
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceId2": sourceId2,
        "sourceUrl": "https://www.zhipin.com/gongsi/%s.html?ka=company-intro" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1,
        "stage": 0,
    }
    return source_company
def parse_artifact(source_company_id, item):
    """Build the artifact list for a company from its ``website`` field.

    Args:
        source_company_id: Not used by this function; kept for callers.
        item: Record with ``item['name']`` and
            ``item['content']['company_base']`` holding a ``properties``
            mapping (optional ``short_description``) and an
            ``overview_fields2`` mapping (optional ``website`` entry whose
            ``'value'`` is the URL).

    Returns:
        A list with at most one artifact dict (keys ``type``, ``name``,
        ``description``, ``link``, ``domain``). It is empty when there is no
        website, when the URL contains "twitter", "linkedin" or "facebook",
        or when no domain can be derived for an app link.
    """
    name = item['name']
    logger.info('parse_artifact:%s' % name)
    artifacts = []

    company_base = item['content']['company_base']
    properties = company_base['properties']
    # `in` replaces dict.has_key(), which only exists on Python 2.
    desc = properties['short_description'] if 'short_description' in properties else ''

    overview = company_base['overview_fields2']
    if 'website' not in overview:
        return artifacts

    website = url_helper.url_normalize(overview['website']['value'])
    if website is None:
        return artifacts
    # Social-network profile links are not treated as the company website.
    if (website.find('twitter') != -1 or website.find('linkedin') != -1
            or website.find('facebook') != -1):
        return artifacts

    artifact_type, app_market, app_id = url_helper.get_market(website)
    if artifact_type == 4010:
        flag, domain = url_helper.get_domain(website)
        if flag is None:
            return artifacts
        if flag is False:
            domain = None
    elif artifact_type == 4040:
        domain = app_id
        if domain is None:
            return artifacts
    elif artifact_type == 4050:
        domain = None
        if app_market == 16010 or app_market == 16020:
            android_app = parser_mongo_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        if domain is None:
            return artifacts
    else:
        # Every other type (including 4020/4030) produces no artifact.
        return artifacts

    artifacts.append({
        "type": artifact_type,
        "name": name,
        "description": desc,
        "link": website,
        "domain": domain
    })
    return artifacts
def parse_company(item):
    """Parse a crawled lagou.com company page into a source-company record.

    Args:
        item: Crawl record, or ``None``. The keys read here are ``"key"``
            (used as ``sourceId`` and in ``sourceUrl``), ``"content"``
            (UTF-8 encoded page HTML) and ``"url"`` (compared against the
            company link so the page does not list itself as an artifact).

    Returns:
        ``None`` when ``item`` is ``None``; ``{"status": "No_Name"}`` when the
        page is a placeholder, a name is missing, the short name contains
        "拉勾" for any key other than "147", or the full name fails
        ``name_helper.name_check``; otherwise a dict describing the company,
        including its ``"artifacts"`` and ``"members"`` lists.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    content = item["content"]
    logger.info(company_key)
    d = pq(content)

    # Placeholder page shown for companies whose home page is not built yet.
    if content.decode("utf-8").find("这个公司的主页还在建设") >= 0:
        return {
            "status": "No_Name",
        }

    title_anchor = d('.company_main > h1 > a')
    name = title_anchor.text()
    link = title_anchor.attr('href')
    fullName = name_helper.company_name_normalize(title_anchor.attr('title'))

    if name is None or fullName is None or (name.find("拉勾") >= 0 and company_key != "147"):
        return {
            "status": "No_Name",
        }

    # Prefer the shorter of the two names as the display name.
    if len(name) > len(fullName):
        name = fullName
    if name is None or name.strip() == "":
        name = fullName

    _, companycheck = name_helper.name_check(fullName)
    if companycheck is not True:
        return {
            "status": "No_Name",
        }

    # A missing <img src> yields no logo instead of raising.
    logo = d('.top_info_wrap > img').attr('src')
    if logo is not None:
        if not logo.startswith("http"):
            # Protocol-relative URL ("//host/path").
            logo = "http:" + logo
        if logo.find("logo_default") >= 0:
            logo = None

    brief = d('.company_word').text()
    desc_text = d('.company_intro_text').text()
    desc = None
    if not (u"该公司尚未添加公司介绍" in desc_text or len(desc_text) < 10):
        desc_html = d('.company_intro_text > .company_content').html()
        if desc_html is not None:
            desc_html = desc_html.replace('<span class="text_over">展开</span>', '')
            # Strip the remaining markup, keeping only the text.
            desc = BeautifulSoup(desc_html, "lxml").getText()

    field = ''
    stage = ''
    headCount = ''
    location = ''
    address = ''
    basic_item = '#basic_container > .item_content >ul > li:eq(%d) > span'
    try:
        # Statement order matters: a failure leaves later fields at ''.
        field = d(basic_item % 0).text()
        stage = d(basic_item % 1).text()
        headCount = d(basic_item % 2).text()
        headCount = headCount[0:headCount.index(u'人')]
        location = d(basic_item % 3).text()
        address = d('.con_mlist_ul > li:eq(0) > p:eq(1)').text()
    except Exception:
        # Best effort: keep whatever was read before the failure.
        logger.info("basic info block only partially parsed", exc_info=True)

    headCount = headCount.replace("people", "")
    if headCount == "少于15":
        min_staff = 1
        max_staff = 15
    else:
        staffarr = headCount.split('-')
        if len(staffarr) > 1:
            # NOTE(review): range bounds are passed through as strings while
            # the single-value case below yields an int - confirm consumers
            # accept both.
            min_staff = staffarr[0]
            max_staff = staffarr[1]
        else:
            max_staff = None
            try:
                min_staff = int(staffarr[0].strip())
            except ValueError:
                min_staff = None

    # NOTE(review): the stage label only influences fundingType; no round or
    # stage code derived from it is emitted - confirm that is intended.
    funding_type = 8010 if stage == '不需要融资' else 0

    location_id = 0
    location_new = parser_mongo_util.get_location(location)
    if location_new is not None:
        location_id = location_new["locationId"]

    website = url_helper.url_normalize(link)
    logger.info("website: %s" % website)

    artifacts = []
    artifact_type, app_market, app_id = url_helper.get_market(website)
    domain = None
    keep = False
    if artifact_type == 4010:
        # Skip links pointing back at the crawled page or at lagou itself.
        if (website is not None and item["url"] != website
                and website.find("lagou.com") == -1):
            flag, domain = url_helper.get_domain(website)
            if flag is not None:
                if flag is False:
                    domain = None
                keep = True
    elif artifact_type == 4040:
        domain = app_id
        keep = domain is not None
    elif artifact_type == 4050:
        if app_market == 16010 or app_market == 16020:
            android_app = parser_mongo_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        keep = domain is not None
    # Every other type (including 4020/4030) produces no artifact.
    if keep:
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "description": desc,
            "link": website,
            "domain": domain
        })

    # Parse team members.
    members = []
    for li in d('.manager_list > li'):
        mem = pq(li)
        try:
            photo_url = mem('img').attr('src')
            if photo_url is None:
                # A member without a photo is skipped.
                continue
            if not photo_url.startswith("http"):
                photo_url = "http:" + photo_url
            member_link = mem('p.item_manager_name > a').attr('href')
            weibo = None
            if member_link is not None and 'weibo.com' in member_link:
                weibo = member_link
            members.append({
                'name': mem('p.item_manager_name > span').text(),
                'photo_url': photo_url,
                'weibo': weibo,
                'location': None,
                'role': mem('p.item_manager_title').text(),
                'description': mem('div.item_manager_content').text(),
                'education': None,
                'work': None
            })
        except Exception:
            logger.info("skipping member that could not be parsed", exc_info=True)

    source_company = {
        "name": name,
        "fullName": fullName if fullName is not None and fullName.strip() != "" else None,
        "description": desc,
        "brief": brief,
        "round": None,
        "roundDesc": None,
        "companyStatus": 2010,
        'fundingType': funding_type,
        "locationId": int(location_id),
        "address": address,
        "phone": None,
        "establishDate": None,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "sourceUrl": "https://www.lagou.com/gongsi/%s.html" % company_key,
        "field": field,
        "headCountMin": min_staff,
        "headCountMax": max_staff,
        "artifacts": artifacts,
        "members": members,
        "status": 1
    }
    return source_company
def parse_artifact(item):
    """Extract website and app artifacts from a company page's product list.

    Args:
        item: Crawl record, or ``None``. ``item["content"]`` is the page HTML.

    Returns:
        A list of artifact dicts with keys ``type``, ``name``, ``desc``,
        ``link`` and ``domain``; empty when ``item`` is ``None``. Only
        products tagged "网站" (website) or "app" are considered, and entries
        with no link or no resolvable domain are dropped (types 4020 and 4030
        are kept even without a domain).
    """
    if item is None:
        return []

    artifacts = []
    d = pq(item["content"])
    logger.info("*** artifact ***")

    for li in d('ul.list-prod> li> div.on-edit-hide'):
        entry = pq(li)
        strtype = entry('h4> span.tag').text().strip()
        if strtype != u"网站" and strtype != "app":
            continue

        # An anchor without an href is skipped instead of raising.
        href = entry('h4> b> a').attr("href")
        if href is None:
            continue
        link = href.strip()
        if link == "":
            continue

        domain = None
        artifact_type = None
        if strtype == u"网站":
            artifact_type, app_market, app_id = url_helper.get_market(link)
            if artifact_type == 4010:
                link = url_helper.url_normalize(link)
                flag, domain = url_helper.get_domain(link)
                if flag is None:
                    continue
                if flag is False:
                    domain = None

        if artifact_type != 4010:
            # App entries, and website entries that are not plain websites,
            # are classified as market/app links.
            artifact_type, app_market, app_id = url_helper.get_market(link)
            if artifact_type == 4040:
                domain = app_id
            elif artifact_type == 4050:
                if app_market == 16010 or app_market == 16020:
                    android_app = parser_mongo_util.find_android_market(
                        app_market, app_id)
                    if android_app:
                        domain = android_app["apkname"]
                else:
                    domain = app_id
            if domain is None and artifact_type != 4030 and artifact_type != 4020:
                continue

        name = entry('h4> b').text().strip()
        desc = entry('p').text().strip()
        logger.info("type: %s, name: %s, link: %s, desc: %s" %
                    (artifact_type, name, link, desc))
        artifacts.append({
            "type": artifact_type,
            "name": name,
            "desc": desc,
            "link": link,
            "domain": domain
        })

    logger.info("")
    return artifacts
def parse_base(item):
    """Parse the base company information from a crawled company page.

    Args:
        item: Crawl record, or ``None``. ``item["key"]`` becomes ``sourceId``
            and ``item["content"]`` is the page HTML.

    Returns:
        ``None`` when ``item`` is ``None`` or when neither a short name nor a
        full company name can be found; otherwise a dict with the company's
        names, description, funding round, status, location id, establish
        date, logo, field/tags and the ``"artifacts"`` derived from its
        website link.
    """
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    d = pq(item["content"])

    # The title reads "<product> / <company short name>"; child tags are
    # removed from a clone so only the <b> element's own text is read.
    company_short_name = ""
    product_name = d('div.line-title> span> b').clone().children().remove(
    ).end().text().strip()
    temps = product_name.split("/", 1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)

    company_name = d('div.des-more> div').eq(0).text().strip().replace(
        "公司全称:", "")
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""
    company_name = name_helper.company_name_normalize(company_name)
    logger.info("company name: %s" % company_name)

    if company_short_name == "" and company_name == "":
        return None

    # Establish date is shown as "<year>.<month>"; the day is fixed to 1.
    establish_date = None
    founded_text = d('div.des-more> div').eq(1).text().strip().replace(
        "成立时间:", "")
    result = util.re_get_result(r'(\d*)\.(\d*)', founded_text)
    if result is not None:
        (year, month) = result
        try:
            if not 1 <= int(month) <= 12:
                month = "1"
        except (TypeError, ValueError):
            month = "1"
        try:
            establish_date = datetime.datetime.strptime(
                "%s-%s-1" % (year, month), '%Y-%m-%d')
        except ValueError:
            # An unusable year must not abort the whole parse.
            logger.info("unparsable establish date: %s", founded_text)
    logger.info("establish date: %s" % establish_date)

    # Location is shown as "<province>·<city>"; the city is tried first.
    locationId = 0
    location_text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', location_text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))
        found = parser_mongo_util.get_location(city)
        if found is None:
            found = parser_mongo_util.get_location(province)
        if found is not None:
            locationId = found["locationId"]

    if locationId == 0:
        # Fall back to a location derived from the company name.
        loc1, _ = name_helper.get_location_from_company_name(company_name)
        if loc1 is not None:
            found = parser_mongo_util.get_location(loc1)
            if found is not None:
                locationId = found["locationId"]
    logger.info("locationId: %d" % locationId)

    company_status = 2010
    status_text = d('div.des-more> div').eq(2).text().strip()
    if status_text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    funding_type = 0
    funding_text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % funding_text)
    # NOTE(review): both labels map to 8020 - confirm the acquisition case
    # is not meant to use a different code.
    if funding_text == "融资需求 · 需要融资" or funding_text == "融资需求 · 寻求收购":
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)

    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s" % field)
    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(
        " ", ",")
    logger.info("tags: %s" % tags)

    desc = d("div.des").text().strip()
    logger.info("desc: %s" % desc)

    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    # A missing website anchor is treated like an empty link.
    website = d('div.link-line> a.weblink').attr("href")
    website = website.strip() if website is not None else ""
    if website == "http://%e6%9a%82%e6%97%a0":
        website = ""
    website = url_helper.url_normalize(website)
    logger.info("website: %s" % website)

    artifacts = []
    artifact_type, app_market, app_id = url_helper.get_market(website)
    domain = None
    keep = False
    if artifact_type == 4010:
        flag, domain = url_helper.get_domain(website)
        if flag is not None:
            if flag is False:
                domain = None
            keep = True
    elif artifact_type == 4040:
        domain = app_id
        keep = domain is not None
    elif artifact_type == 4050:
        if app_market == 16010 or app_market == 16020:
            android_app = parser_mongo_util.find_android_market(
                app_market, app_id)
            if android_app:
                domain = android_app["apkname"]
        else:
            domain = app_id
        keep = domain is not None
    if keep:
        artifacts.append({
            "type": artifact_type,
            "name": product_name,
            "desc": desc,
            "link": website,
            "domain": domain
        })

    # Funding status.
    roundStr = d('span.t-small.c-green').text().replace("(", "").replace(
        ")", "").replace("获投状态:", "").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)

    logger.info("")

    return {
        "name": product_name,
        "shortName": company_short_name,
        "fullName": company_name,
        "productName": product_name,
        "description": desc,
        "brief": "",
        "round": fundingRound,
        "roundDesc": roundStr,
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "source": SOURCE,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "type": 41010,
        "artifacts": artifacts
    }