def insert_funding(sid, roundstr, inv, fundingDate, investor):
    """Normalise one funding record and save it with its single investor.

    The amount text has all whitespace removed and a handful of vague
    wordings are replaced by a concrete figure; round labels that arrive
    without their leading "P" are repaired.  Round and amount are then parsed
    through ``itjuzi_helper`` and the record is passed to
    ``parser_db_util.save_funding_standard`` together with the module-level
    ``download_crawler``.

    Args:
        sid: stored as ``sourceCompanyId`` of the funding record.
        roundstr: raw round label.
        inv: raw investment amount text.
        fundingDate: stored unchanged as ``fundingDate``.
        investor: investor name; ``util.md5str(investor)`` becomes its
            ``sourceId``.

    Returns:
        None.  Any failure is logged (with traceback) and swallowed so the
        caller can continue with the next record.
    """
    # Vague amount wordings and the concrete figure each group maps to.
    # Membership is tested with ``in`` on plain sequences, i.e. by equality,
    # exactly like the comparisons this table replaces.
    amount_aliases = (
        (("超千万人民币", "千万人民币", "近千万人民币", "过千万人民币",
          "上千万人民币", "1千万人民币"), "1000万人民币"),
        (("超亿人民币", "近亿人民币", "过亿人民币", "上亿人民币", "亿人民币",
          "一亿人民币", "亿人民币及以上人民币"), "1亿人民币"),
        (("超千万美元", "千万美元", "近千万美元", "过千万美元", "上千万美元",
          "1千万美元"), "1000万美元"),
        (("百万美元", "近百万美元", "过百万美元", "上百万美元", "1百万美元"),
         "100万美元"),
        (("百万人民币", "近百万人民币", "过百万人民币", "上百万人民币",
          "1百万人民币"), "100万人民币"),
    )
    # Round labels that arrive with the leading "P" missing.
    round_repairs = (("re-A轮", "Pre-A"), ("re-IPO", "Pre-IPO"))

    try:
        inv = "".join(inv.split())
        for aliases, canonical in amount_aliases:
            if inv in aliases:
                inv = canonical
                break
        for broken, repaired in round_repairs:
            if roundstr == broken:
                roundstr = repaired
                break

        fundingRound, roundStr = itjuzi_helper.getFundingRound(
            unicode(roundstr))
        currency, investment, precise = itjuzi_helper.getMoney(unicode(inv))

        source_funding = {
            "sourceCompanyId": sid,
            "preMoney": None,
            "postMoney": None,
            "investment": investment,
            "precise": precise,
            "round": fundingRound,
            "roundDesc": roundStr,
            "currency": currency,
            "fundingDate": fundingDate,
            "newsUrl": None
        }
        source_investors = [{
            "name": investor,
            "website": None,
            "description": None,
            "logo_url": None,
            "stage": None,
            "field": None,
            "type": 10020,
            "source": 13100,
            "sourceId": util.md5str(investor)
        }]
        parser_db_util.save_funding_standard(source_funding, download_crawler,
                                             source_investors)
    except Exception:
        # Best effort: report the offending record and carry on.  The date is
        # taken from the ``fundingDate`` argument; the handler used to read an
        # unrelated name that is not defined inside this function.
        logger.info("%s/%s/%s/%s", roundstr, inv, fundingDate, investor,
                    exc_info=True)
def parse_base(item):
    """Extract the basic company profile from a stored company page.

    Args:
        item: mapping with ``"key"`` (stored as ``sourceId``) and
            ``"content"`` (the page HTML handed to ``pq``), or None.

    Returns:
        None when ``item`` is None or when neither a short name nor a full
        name could be read from the page; otherwise a dict describing the
        company (names, description, status, funding type, location id,
        establish date, logo, field/sub-field, tags and one website artifact).
    """
    if item is None:
        return None
    logger.info("*** base ***")
    company_key = item["key"]
    d = pq(item["content"])

    # The title node holds "product / short name"; only its own text is used,
    # child elements are stripped first.
    company_short_name = ""
    product_name = d('div.line-title> span> b').clone().children().remove(
    ).end().text().strip()
    temps = product_name.split("/", 1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s", product_name)
    logger.info("company short name: %s", company_short_name)

    company_name = d('div.des-more> div').eq(0).text().strip().replace(
        "公司全称:", "")
    # Placeholder texts the page shows when no full name is recorded.
    if company_name in ("暂无", "暂未收录"):
        company_name = ""
    company_name = util.norm_company_name(company_name)
    logger.info("company name: %s", company_name)

    if company_short_name == "" and company_name == "":
        return None

    establish_date = None
    text = d('div.des-more> div').eq(1).text().strip().replace("成立时间:", "")
    result = util.re_get_result(r'(\d*?).(\d*?)$', text)
    if result is not None:
        (year, month) = result
        # An unparsable or out-of-range month falls back to January.
        try:
            if int(month) > 12:
                month = "1"
        except (TypeError, ValueError):
            month = "1"
        establish_date = datetime.datetime.strptime(
            "%s-%s-1" % (year, month), '%Y-%m-%d')
    logger.info("establish date: %s", establish_date)

    locationId = 0
    text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s", province, city)
        conn = db.connect_torndb()
        try:
            # Prefer the city; fall back to the province.
            row = conn.get("select * from location where locationName=%s",
                           city)
            if row is None:
                row = conn.get(
                    "select * from location where locationName=%s", province)
            if row is not None:
                locationId = row["locationId"]
        finally:
            # Release the connection even when a query raises.
            conn.close()
    logger.info("locationId: %d", locationId)

    company_status = 2010
    text = d('div.des-more> div').eq(2).text().strip()
    if text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d", company_status)

    funding_type = 0
    text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s", text)
    # Both wordings map to the same code.
    if text in ("融资需求 · 需要融资", "融资需求 · 寻求收购"):
        funding_type = 8020
    logger.info("funding_type=%d", funding_type)

    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s", field)

    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s", sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(
        " ", ",")
    logger.info("tags: %s", tags)

    desc = d("div.des").text().strip()
    logger.info("desc: %s", desc)

    # logo
    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    # ``attr`` yields None when the page has no link; treat that like the
    # "no website" placeholder instead of failing on ``None.strip()``.
    website = (d('div.link-line> a').attr("href") or "").strip()
    if website == "http://%e6%9a%82%e6%97%a0":
        website = ""
    website = util.norm_url(website)
    logger.info("website: %s", website)

    artifacts = [{
        "type": 4010,
        "name": product_name,
        "desc": desc,
        "link": website
    }]

    # Funding status shown on the page.
    # NOTE(review): the parsed round is only logged; the returned record
    # carries round 0 / empty description — confirm this is intended.
    roundStr = d('span.t-small.c-green').text().replace("(", "").replace(
        ")", "").replace("获投状态:", "").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)

    logger.info("")

    return {
        "shortName": company_short_name,
        "fullName": company_name,
        "productName": product_name,
        "description": desc,
        "brief": "",
        "round": 0,
        "roundDesc": "",
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "artifacts": artifacts
    }
def abnormal(x):
    """Label how one third-party funding row compares with our own records.

    Args:
        x: a row (pandas Series-like) exposing ``companyId``, ``country``,
            ``fundingDate``, ``xiniuName``, ``xiniufullName``, ``time``,
            ``orderbyrztime``, ``money``, ``jieduan`` and ``corporateId``.

    Returns:
        One of the labels
        '匹配不到公司' (no company matched),
        '烯牛独家' / '其它' (decided by a lookup in ``mongo.raw.qmp_rz_parser``),
        '融资时间不对' (no usable funding date),
        '烯牛无融资' (no active funding row for the corporate),
        '企名片轮次靠后' / '烯牛轮次靠后' (rounds differ) or
        '都匹配' (``compare_select`` found a match).

    Uses the module-level ``mongo`` and ``conn`` handles.
    """
    import re

    if pd.isnull(x.companyId):
        return '匹配不到公司'

    if pd.isnull(x.country) and pd.notnull(x.fundingDate):
        collection = mongo.raw.qmp_rz_parser
        if collection.find_one({'product': x.xiniuName}) is None \
                and collection.find_one({'company': x.xiniufullName}) is None:
            return '烯牛独家'
        return '其它'

    # Funding date: full date, then year.month, then the compact
    # ``orderbyrztime`` column as a last resort.
    raw_time = x[u'time'].strip()
    thirdTime = None
    for fmt in ('%Y.%m.%d', '%Y.%m'):
        try:
            thirdTime = datetime.datetime.strptime(raw_time, fmt)
            break
        except ValueError:
            continue
    if thirdTime is None:
        try:
            thirdTime = datetime.datetime.strptime(
                x[u'orderbyrztime'].strip(), '%Y%m%d')
        except Exception:
            # Column missing, not a string, or not a date.
            return '融资时间不对'

    # Amount: first "<number><unit>" token, unit being 万 (1e4) or 亿 (1e8).
    # The number may carry decimals ("1.5亿"), and the multiplier follows the
    # unit of the matched token itself rather than any 万 elsewhere in the
    # text.
    money_match = re.search(u'(\\d+(?:\\.\\d+)?)([万亿])', x[u'money'])
    if money_match is None:
        source_investment = None
    else:
        amount, unit = money_match.groups()
        if unit == u'万':
            source_investment = float(amount) * 10000
        else:
            source_investment = float(amount) * 10000 * 10000

    # Round labels that arrive with the leading "P" missing.
    roundstr = x[u'jieduan']
    if roundstr == "re-A轮":
        roundstr = "Pre-A"
    elif roundstr == "re-IPO":
        roundstr = "Pre-IPO"
    fundingRound, roundStr = itjuzi_helper.getFundingRound(unicode(roundstr))
    if fundingRound == 1011:
        fundingRound = 1010

    source_funding = {
        'id': 1,
        'fundingDate': thirdTime,
        'investment': source_investment,
        'round': fundingRound
    }

    if pd.isnull(x['corporateId']):
        # Diagnostic dump of rows that have no corporate id.
        print(x)

    matched = compare_select(
        source_funding,
        conn.query(
            'select * from funding where corporateId=%s and (active="Y" or active is null)',
            x['corporateId']))
    if matched is False:
        xiniufunding = conn.get(
            'select * from funding where corporateId=%s and (active="Y" or active is null) order by round desc limit 1',
            x['corporateId'])
        if xiniufunding is None:
            return '烯牛无融资'
        xiniuRound = xiniufunding['round']
        if fundingRound > 0 and fundingRound > xiniuRound:
            return '企名片轮次靠后'
        return '烯牛轮次靠后'
    return '都匹配'
def parse(item):
    """Parse one stored funding page into a funding record.

    Args:
        item: mapping with ``"key"`` and ``"content"`` (page HTML handed to
            ``pq``), or None.

    Returns:
        * None when ``item`` is None or the company referenced by the page is
          not present in ``source_company`` for ``SOURCE``;
        * -1 when the page has no ``a.name`` link to take the company key
          from;
        * otherwise a dict with ``sourceCompanyId``, ``fundingDate``,
          ``fundingRound``, ``roundStr``, ``currency``, ``investment``,
          ``precise`` and ``investors`` (list of name/key/url/type dicts).
    """
    if item is None:
        return None

    # Overwritten below by the key taken from the page itself.
    company_key = item["key"]
    d = pq(item["content"])
    logger.info("*** funding ***")

    href = d("a.name").attr("href")
    if href is None:
        return -1
    company_key = href.strip().split("/")[-1]
    logger.info("company_key: %s", company_key)

    conn = db.connect_torndb()
    try:
        source_company = conn.get(
            "select * from source_company where source=%s and sourceId=%s",
            SOURCE, company_key)
    finally:
        # Release the connection even when the query raises.
        conn.close()
    if source_company is None:
        logger.info("this source company doesn't exist yet")
        return None
    source_company_id = source_company["id"]
    logger.info("sourceComapnyId: %s", source_company_id)

    dateStr = d(
        'div.block> div.titlebar-center> p> span.date').text().strip()
    result = util.re_get_result(r'(\d*?)\.(\d*?)\.(\d*?)$', dateStr)
    fundingDate = None
    if result is not None:
        (year, month, day) = result
        # Years 2100-2109 are folded onto 2010-2019.
        y = int(year)
        if y >= 2100 and y <= 2109:
            year = 2010 + y % 10
        # Clamp month and day into ranges ``strptime`` accepts.
        m = int(month)
        if m > 12:
            m = 12
            month = "12"
        # isRunnian presumably reports leap years (runnian = 闰年) — the
        # February limits below rely on that; confirm against the helper.
        if m in (4, 6, 9, 11) and int(day) > 30:
            day = "30"
        elif itjuzi_helper.isRunnian(
                int(year)) and m == 2 and int(day) > 29:
            day = 29
        elif itjuzi_helper.isRunnian(
                int(year)) == False and m == 2 and int(day) > 28:
            day = 28
        elif int(day) > 31:
            day = 31
        fundingDate = datetime.datetime.strptime(
            "%s-%s-%s" % (year, month, day), '%Y-%m-%d')
    logger.info(fundingDate)

    roundStr = d('span.round').text().strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("fundingRound=%d, roundStr=%s", fundingRound, roundStr)

    moneyStr = d('span.fina').text().strip()
    (currency, investment, precise) = itjuzi_helper.getMoney(moneyStr)
    logger.info("%s - %s - %s" % (currency, investment, precise))

    investors = []
    for f in d('h4.person-name> b >a.title'):
        l = pq(f)
        investor_name = l.text().strip()
        if investor_name == "":
            continue
        investor_url = l.attr("href")
        if investor_url is not None and investor_url != "":
            # Linked investor: the key is the last path segment of the link.
            investor_key = investor_url.strip().split("/")[-1]
            investors.append({
                "name": investor_name,
                "key": investor_key,
                "url": investor_url,
                "type": 38001
            })
            logger.info("Investor: %s, %s, %s", investor_key, investor_name,
                        investor_url)
        else:
            # Unlinked text may list several investors separated by ';'.
            investor_key = None
            for name in investor_name.split(";"):
                name = name.strip()
                if name == "":
                    continue
                investors.append({
                    "name": name,
                    "key": None,
                    "url": None,
                    "type": 38001
                })
                logger.info("Investor: %s, %s, %s", investor_key, name,
                            investor_url)

    # The statements that used to follow this return could never execute and
    # have been dropped.
    return {
        "sourceCompanyId": source_company_id,
        "fundingDate": fundingDate,
        "fundingRound": fundingRound,
        "roundStr": roundStr,
        "currency": currency,
        "investment": investment,
        "precise": precise,
        "investors": investors
    }
name_helper.company_name_normalize(unicode(fn))) fullName = name_helper.company_name_normalize(unicode(fullName)) roundstr = names[4] inv = names[5] fdate = names[6] investors = [] if names[7] is not None and names[7].strip() != "": investors.extend(names[7].split("/")) if names[8] is not None and names[8].strip() != "": investors.extend(names[8].split("/")) if len(investors) == 0: continue fundingRound, roundStr = itjuzi_helper.getFundingRound( unicode(roundstr)) if fullName not in namesa: namesa.append(fullName) if fundingRound is not None and fundingRound > 0: if cs.has_key(fullName) is False: cs[fullName] = {fundingRound: [investors]} else: if cs[fullName].has_key(fundingRound) is False: cs[fullName][fundingRound] = [investors] else: cs[fullName][fundingRound].append(investors) # logger.info(json.dumps(cs, ensure_ascii=False, cls=util.CJsonEncoder)) logger.info(len(cs)) logger.info(len(namesa))
def parse(item):
    """Turn one stored funding document into a funding record.

    Args:
        item: mapping with ``"key"`` (funding key, only logged) and
            ``"content"`` — a mapping providing ``com_id``, ``date``
            (``YYYY-MM-DD``), ``round``, ``money``, ``currency`` and
            optionally ``invsest_with``; or None.

    Returns:
        None when ``item`` is None or when
        ``parser_db_util.get_company(13030, com_id)`` finds nothing (the
        numeric company key is then remembered in the module-level
        ``nokeys`` list); otherwise a dict with ``sourceCompanyId``,
        ``fundingDate``, ``fundingRound``, ``roundStr``, ``currency``,
        ``investment``, ``precise`` and ``investors``.
    """
    if item is None:
        return None

    funding_key = item["key"]
    logger.info("funding_key: %s", funding_key)
    data = item["content"]
    logger.info("*** funding ***")

    company_key = data["com_id"]
    logger.info("company_key: %s", company_key)

    source_company = parser_db_util.get_company(13030, company_key)
    if source_company is None:
        logger.info("this source company doesn't exist yet")
        numeric_key = int(company_key)
        if numeric_key not in nokeys:
            nokeys.append(numeric_key)
        return None
    source_company_id = source_company["id"]
    logger.info("sourceComapnyId: %s", source_company_id)

    fundingDate = datetime.datetime.strptime(data["date"], '%Y-%m-%d')
    logger.info(fundingDate)

    roundStr = data["round"]
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("fundingRound=%d, roundStr=%s", fundingRound, roundStr)

    moneyStr = data["money"] + data["currency"]
    (currency, investment, precise) = itjuzi_helper.getMoney(moneyStr)
    logger.info("%s - %s - %s" % (currency, investment, precise))

    investors = []
    # "invsest_with" (sic) is the key as spelled in the stored document.
    if "invsest_with" in data and isinstance(data["invsest_with"], dict):
        for fi in data["invsest_with"]:
            investor_name = data["invsest_with"][fi]["invst_name"]
            if investor_name == "" or investor_name == "未透露":
                continue
            # These documents carry no investor link, so every investor is
            # recorded without key/url.  One entry may list several names
            # separated by ';'.
            for name in investor_name.split(";"):
                name = name.strip()
                if name == "":
                    continue
                investors.append({
                    "name": name,
                    "key": None,
                    "url": None,
                    "type": 38001
                })
                logger.info("Investor: %s, %s, %s", None, name, None)

    return {
        "sourceCompanyId": source_company_id,
        "fundingDate": fundingDate,
        "fundingRound": fundingRound,
        "roundStr": roundStr,
        "currency": currency,
        "investment": investment,
        "precise": precise,
        "investors": investors
    }
def parseFinance_save(source_company_id, item, sourceId, download_crawler):
    """Parse every funding row of a stored page and save each one.

    For each ``.funding-info tbody tr`` row the round, amount, date and
    investors are read and passed to
    ``parser_db_util.save_funding_standard``.

    Args:
        source_company_id: stored as ``sourceCompanyId`` of each funding.
        item: mapping whose ``'content'`` is the UTF-8 encoded page, or None.
        sourceId: only used in log messages.
        download_crawler: forwarded to ``save_funding_standard``.

    Returns:
        None.

    The process is terminated through ``exit()`` when an amount has no
    recognised currency marker or parses to 0, so the unknown format gets
    noticed.  A failure while saving one row is logged and does not stop the
    remaining rows.
    """
    logger.info("parseFinance_save")
    if item is None:
        return None

    d = pq(html.fromstring(item['content'].decode("utf-8")))
    for finance in d('.funding-info tbody tr'):
        row = d(finance)

        roundStr = row('td:nth-child(1)').text()
        fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)

        # Replace the leading currency symbol by the suffix getMoney is
        # given elsewhere in this function's output format.
        fundingInvestment = row('.amount').text()
        if '¥ ' in fundingInvestment:
            fundingInvestment = fundingInvestment.replace('¥ ', '') + '人民币'
        elif '$ ' in fundingInvestment:
            fundingInvestment = fundingInvestment.replace('$ ', '') + '美元'
        else:
            logger.info('not RMB:%s %s', sourceId, fundingInvestment)
            # todo: unknown currency marker — the run is aborted here.
            exit()

        fundingCurrency, fundingInvestment, precise = itjuzi_helper.getMoney(
            fundingInvestment)
        fundingDate = datetime.datetime.strptime(
            row('.date').text(), '%Y-%m-%d')

        source_funding = {
            "sourceCompanyId": source_company_id,
            "preMoney": None,
            "postMoney": None,
            "investment": fundingInvestment,
            "precise": precise,
            "round": fundingRound,
            "roundDesc": roundStr,
            "currency": fundingCurrency,
            "fundingDate": fundingDate,
            "newsUrl": None
        }

        if fundingInvestment == 0:
            logger.info("new invest case: %s", sourceId)
            exit()

        logger.info("%s, %s, %s, %s", roundStr, fundingRound,
                    fundingInvestment, fundingCurrency)

        source_investors = []
        for investor in row('.investor a'):
            link = d(investor)
            entityName = link.text().strip()
            logger.info(entityName)
            # The investor id is whatever follows 'startups/' in the link.
            entityId = str(link.attr('href').split('startups/')[-1])
            source_investors.append({
                "name": entityName,
                "website": None,
                "description": None,
                "logo_url": None,
                "stage": None,
                "field": None,
                "type": 10020,
                "source": SOURCE,
                "sourceId": entityId
            })
        logger.info(
            json.dumps(source_investors, ensure_ascii=False,
                       cls=util.CJsonEncoder))

        try:
            parser_db_util.save_funding_standard(source_funding,
                                                 download_crawler,
                                                 source_investors)
        except Exception:
            # Keep going with the next row, but leave a trace of the failure
            # instead of discarding it silently.
            logger.exception("save_funding_standard failed: %s", sourceId)
def parse(item):
    """Parse one stored funding page into a funding record.

    Args:
        item: mapping with ``"key"`` (funding key, only logged) and
            ``"content"`` (page HTML handed to ``pq``), or None.

    Returns:
        * None when ``item`` is None or ``parser_db_util.get_company`` knows
          no company for the key found in the page (the numeric key is then
          remembered in the module-level ``nokeys`` list);
        * -1 when the page has no ``a.name`` link;
        * otherwise a dict with ``sourceCompanyId``, ``fundingDate``,
          ``fundingRound``, ``roundStr``, ``currency``, ``investment``,
          ``precise`` and ``investors``.
    """
    if item is None:
        return None

    funding_key = item["key"]
    logger.info("funding_key: %s", funding_key)
    d = pq(item["content"])
    logger.info("*** funding ***")

    href = d("a.name").attr("href")
    if href is None:
        return -1
    company_key = href.strip().split("/")[-1]
    logger.info("company_key: %s", company_key)

    source_company = parser_db_util.get_company(SOURCE, company_key)
    if source_company is None:
        logger.info("this source company doesn't exist yet")
        numeric_key = int(company_key)
        if numeric_key not in nokeys:
            nokeys.append(numeric_key)
        return None
    source_company_id = source_company["id"]
    logger.info("sourceComapnyId: %s", source_company_id)

    date_text = d('div.title> h1> span').text().strip()
    date_parts = util.re_get_result(r'(\d*?)\.(\d*?)\.(\d*?)$', date_text)
    fundingDate = None
    if date_parts is not None:
        year, month, day = date_parts

        # Years 2100-2109 are folded onto 2010-2019.
        year_num = int(year)
        if 2100 <= year_num <= 2109:
            year = 2010 + year_num % 10

        month_num = int(month)
        if month_num > 12:
            month_num = 12
            month = "12"

        # Clamp the day to the last day of the month.
        day_num = int(day)
        if month_num in (4, 6, 9, 11) and day_num > 30:
            day = "30"
        elif itjuzi_helper.isRunnian(int(year)) and month_num == 2 \
                and day_num > 29:
            day = 29
        elif itjuzi_helper.isRunnian(int(year)) == False \
                and month_num == 2 and day_num > 28:
            day = 28
        elif day_num > 31:
            day = 31

        fundingDate = datetime.datetime.strptime(
            "%s-%s-%s" % (year, month, day), '%Y-%m-%d')
    logger.info(fundingDate)

    fina_cells = 'div.block-inc-fina> table> tbody> tr> td> '
    roundStr = d(fina_cells + 'span.round').text().strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("fundingRound=%d, roundStr=%s", fundingRound, roundStr)

    moneyStr = d(fina_cells + 'span.fina').text().strip()
    currency, investment, precise = itjuzi_helper.getMoney(moneyStr)
    logger.info("%s - %s - %s" % (currency, investment, precise))

    def make_investor(name, key, url):
        # Shape shared by every investor entry.
        return {"name": name, "key": key, "url": url, "type": 38001}

    investors = []
    # Investor links live in the third cell of the funding-history table.
    history_cell = d('div.pad.finan-history> table >tr> td').eq(2)
    for anchor in pq(history_cell)('span> a'):
        node = pq(anchor)
        investor_name = node.text().strip()
        if investor_name == "":
            continue

        investor_url = node.attr("href")
        if investor_url is None or investor_url == "":
            # No link: the text may list several investors separated by ';'.
            for piece in investor_name.split(";"):
                piece = piece.strip()
                if piece == "":
                    continue
                investors.append(make_investor(piece, None, None))
                logger.info("Investor: %s, %s, %s", None, piece,
                            investor_url)
            continue

        # Linked investor: the key is the last path segment of the link.
        investor_key = investor_url.strip().split("/")[-1]
        investors.append(
            make_investor(investor_name, investor_key, investor_url))
        logger.info("Investor: %s, %s, %s", investor_key, investor_name,
                    investor_url)

    record = {}
    record["sourceCompanyId"] = source_company_id
    record["fundingDate"] = fundingDate
    record["fundingRound"] = fundingRound
    record["roundStr"] = roundStr
    record["currency"] = currency
    record["investment"] = investment
    record["precise"] = precise
    record["investors"] = investors
    return record
def parse_base(item):
    """Extract the basic company profile from a stored company page.

    Args:
        item: mapping with ``"key"`` (stored as ``sourceId``) and
            ``"content"`` (the page HTML handed to ``pq``), or None.

    Returns:
        None when ``item`` is None or when neither a short name nor a full
        name could be read; otherwise a dict describing the company (names,
        description, brief, funding round, status, funding type, location id,
        establish date, logo, field/sub-field, tags, ``type`` 41010 and the
        list of website/app artifacts found in the link area).
    """
    if item is None:
        return None
    logger.info("*** base ***")
    company_key = item["key"]
    d = pq(item["content"])

    def own_text(selector):
        # Text of the selected node itself, child elements stripped first.
        return d(selector).clone().children().remove().end().text().strip()

    # Placeholder texts the page shows when no full name is recorded.
    unknown_names = ("暂无", "暂未收录")
    # Link values that stand for "no website".
    placeholder_urls = ("http://%e6%9a%82%e6%97%a0", "http://tt")

    # The title holds "product / short name"; newer pages use <h1>, older
    # ones <b>.
    company_short_name = ""
    product_name = own_text('div.line-title> span> h1')
    if product_name is None or product_name.strip() == "":
        product_name = own_text('div.line-title> span> b')
    temps = product_name.split("/", 1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)

    company_name = d('div.des-more> div').eq(0).text().strip().replace(
        "公司全称:", "")
    if company_name in unknown_names:
        company_name = ""
    if company_name is None or company_name.strip() == "":
        # Fall back to the <h2> variant of the layout.
        try:
            company_name = d('div.des-more> h2').text().strip()
        except Exception:
            pass
    if company_name in unknown_names:
        company_name = ""
    company_name = name_helper.company_name_normalize(company_name)
    logger.info("company name: %s" % company_name)

    if company_short_name == "" and company_name == "":
        return None

    establish_date = None
    text = d('div.des-more> div').eq(1).text().strip().replace("成立时间:", "")
    result = util.re_get_result(r'(\d*)\.(\d*)', text)
    if result is not None:
        (year, month) = result
        # An unparsable or out-of-range month falls back to January.
        try:
            if int(month) > 12:
                month = "1"
        except (TypeError, ValueError):
            month = "1"
        establish_date = datetime.datetime.strptime(
            "%s-%s-1" % (year, month), '%Y-%m-%d')
    logger.info("establish date: %s" % establish_date)

    locationId = 0
    text = d('span.loca').text().strip()
    result = util.re_get_result(u'(.*?)·(.*?)$', text)
    if result is not None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))
        # Prefer the city; fall back to the province.
        row = parser_db_util.get_location(city)
        if row is None:
            row = parser_db_util.get_location(province)
        if row is not None:
            locationId = row["locationId"]

    if locationId == 0:
        # Last resort: derive a location from the company name.
        loc1, loc2 = name_helper.get_location_from_company_name(company_name)
        if loc1 is not None:
            row = parser_db_util.get_location(loc1)
            if row is not None:
                locationId = row["locationId"]
    logger.info("locationId: %d" % locationId)

    company_status = 2010
    text = d('div.des-more> div').eq(2).text().strip()
    if text == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    funding_type = 0
    text = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % text)
    # Both wordings map to the same code.
    if text in ("融资需求 · 需要融资", "融资需求 · 寻求收购"):
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)

    try:
        brief = d("h2.seo-slogan").text().strip()
    except Exception:
        brief = ""
    logger.info("brief: %s" % brief)
    # Drop the slogan only when it contains the "not recorded yet" marker.
    # (Testing the raw result of ``find`` is truthy for -1, i.e. when the
    # marker is absent, which discarded every real slogan.)
    if "暂未收录" in brief:
        brief = ""

    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s" % field)

    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(
        " ", ",")
    logger.info("tags: %s" % tags)

    desc = d("div.des,div.desc,div.introduction,div.abstract,div.summary"
             ).text().replace("购买数据请联系", "").replace(
                 '*****@*****.**', "").replace("itjuzi是一家数据服务公司",
                                               "").strip()
    logger.info("********desc: %s" % desc)

    # logo
    logo = d("div.pic >img").attr("src")
    logger.info("logo: %s", logo)

    artifacts = []
    # NOTE(review): passes 2 and 3 run the same selector, so whatever they
    # find is appended twice — confirm whether a third selector was intended.
    for ty in [1, 2, 3]:
        if ty == 1:
            was = d('div.link-line> a')
        else:
            was = d('div.link-line> span.weblink,span.webTink> a')

        for wa in was:
            node = pq(wa)
            webs = []
            # Both the href and the visible text may carry the address; each
            # candidate is handled best-effort and skipped on any error
            # (e.g. a missing href).
            for read_candidate in (lambda: node.attr("href"),
                                   lambda: node.text()):
                try:
                    website = read_candidate().strip()
                    if website in placeholder_urls:
                        website = ""
                    website = url_helper.url_normalize(website)
                    logger.info("website: %s" % website)
                    webs.append(website)
                except Exception:
                    pass

            for website in webs:
                artifact_type, app_market, app_id = url_helper.get_market(
                    website)
                if artifact_type == 4010:
                    flag, domain = url_helper.get_domain(website)
                    if flag is not None:
                        if flag is False:
                            domain = None
                        artifacts.append({
                            "type": 4010,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })
                elif artifact_type == 4020:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4020,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": website
                        })
                elif artifact_type == 4030:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4030,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": None
                        })
                elif artifact_type == 4040:
                    if app_id is not None:
                        artifacts.append({
                            "type": 4040,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": app_id
                        })
                elif artifact_type == 4050:
                    domain = None
                    if app_market == 16010 or app_market == 16020:
                        # For these two markets the domain is the apk name
                        # looked up from the market id.
                        android_app = parser_db_util.find_android_market(
                            app_market, app_id)
                        if android_app:
                            domain = android_app["apkname"]
                    else:
                        domain = app_id
                    if domain is not None:
                        artifacts.append({
                            "type": 4050,
                            "name": product_name,
                            "desc": desc,
                            "link": website,
                            "domain": domain
                        })

    # Funding status shown on the page.
    roundStr = d('span.t-small.c-green').text().replace("(", "").replace(
        ")", "").replace("获投状态:", "").strip()
    fundingRound, roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)

    logger.info("")

    has_full_name = company_name is not None and company_name.strip() != ""
    return {
        "shortName": company_short_name,
        "fullName": company_name if has_full_name else None,
        "productName": product_name,
        "description": desc,
        "brief": brief,
        "round": fundingRound,
        "roundDesc": roundStr,
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "type": 41010,
        "artifacts": artifacts
    }