def crawler(sql):
    db = get_db_engine()
    items = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for item in items:
        shop_id = item[0]
        shop_type = item[1]
        item_id = item[2]
        url = item[3]
        try:
            htm = get_item_htm(item_id, url, db)
            if shop_type == 1:
                htm_obj = parse_html(htm, encoding='gb18030')
                discount_url = htm_obj.xpath("//div[@id='promote']/@data-default")
                if discount_url and len(discount_url) > 0:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(discount_url[0], item_headers)
                    if disc_content.strip():
                        disc_obj = parse_html(disc_content, encoding='gb18030')
                        content = disc_obj.xpath("//div[@id='J_MjsData']/h3/text()")[0].strip()
                        dates = disc_obj.xpath("//div[@id='J_MjsData']/h3/span[@class='tb-indate']/text()")[0].strip()
                        st = dates.encode('utf-8').replace("--", "—").split("—")
                        start_time = datetime.datetime.strptime(st[0].strip().replace('年', '-').replace("月", "-").replace("日", ""), '%Y-%m-%d')
                        end_time = datetime.datetime.strptime(st[1].strip().replace('年', '-').replace("月", "-").replace("日", ""), '%Y-%m-%d')
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, discount_url[0])
                        logger.info("taobao shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s no discount.", shop_id, url)
                else:
                    logger.warning("taobao shop %s:%s no discount.", shop_id, url)
            elif shop_type == 2:
                d_url = get_val(htm, "initApi")
                if d_url:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(d_url, item_headers)
                    cjson = loads(disc_content.decode('gb18030').encode('utf8'))
                    shop_prom = cjson['defaultModel']['itemPriceResultDO']['tmallShopProm']
                    if shop_prom:
                        st = int(shop_prom['startTime']) / 1000
                        et = int(shop_prom['endTime']) / 1000
                        start_time = time.strftime("%Y-%m-%d", time.localtime(st))
                        end_time = time.strftime("%Y-%m-%d", time.localtime(et))
                        content = shop_prom['promPlan'][0]['msg']
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, d_url)
                        logger.info("tmall shop %s get discount success", shop_id)
                    else:
                        logger.warning("tmall shop %s:%s no discount.", shop_id, url)
        except:
            logger.error("shop %s:%s xpath failed:%s", shop_id, url, traceback.format_exc())
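# --- Assumed helpers --------------------------------------------------------
# The crawlers in this file lean on small project helpers (download,
# download_page, download_with_referer, parse_html) that are not shown here.
# Below is a minimal, hypothetical sketch of what they might look like, using
# plain urllib2 + lxml; the real helpers presumably add retries, proxies and
# gzip handling, so treat this only as a stand-in for reading the snippets.
import urllib2
from lxml import html as lxml_html

DEFAULT_UA = "Mozilla/5.0 (compatible; crawler)"  # placeholder UA string

def download(url, headers=None):
    """Fetch a URL and return the raw body, or None on failure (sketch)."""
    req = urllib2.Request(url, headers=headers or {'User-Agent': DEFAULT_UA})
    try:
        return urllib2.urlopen(req, timeout=30).read()
    except Exception:
        return None

def download_page(url, headers=None):
    """Alias used by the loan crawlers; same behavior as download() here."""
    return download(url, headers)

def download_with_referer(url, referer):
    """Same as download(), but with an explicit Referer header."""
    headers = {'User-Agent': DEFAULT_UA}
    if referer:
        headers['Referer'] = referer
    return download(url, headers)

def parse_html(data, encoding='utf-8'):
    """Parse an HTML byte string into an lxml tree using the page encoding."""
    parser = lxml_html.HTMLParser(encoding=encoding)
    return lxml_html.fromstring(data, parser=parser)
# ----------------------------------------------------------------------------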
def crawl_one_shop(shop, db):
    shop_id = shop[0]
    shop_type = shop[1]
    shop_url = shop[2]
    first_item_id = shop[3]
    item_html_id = shop[4]
    item_html = shop[5]
    urls = {'shop_url': shop_url}
    try:
        the_shop = ShopExtendInfo(db, shop_id)
        result = False
        logger.info("begin get shop extend info. shop id: %d. shop type: %d." % (shop_id, shop_type))
        logger.debug("first item id: %d. item html id: %d. html length: %d"
                     % (first_item_id, item_html_id, len(item_html)))
        url = url_reg.search(item_html).group(1).encode('utf-8')
        urls['shop_rate_url'] = url
        shop_html = download_with_referer(url, shop_url)
        if shop_html:
            logger.debug("download shop extend html. shop_id: %d, item_id: %d, url: %s. length: %d"
                         % (shop_id, first_item_id, url, len(shop_html)))
            shop_html_obj = parse_html(shop_html, 'gbk')
            item_html_obj = parse_html(item_html, 'gbk')
            if shop_type == 1:
                get_taobao_shop_extend_info(the_shop, shop_html_obj, item_html_obj, urls)
            else:
                get_tmall_shop_extend_info(the_shop, shop_html_obj, item_html_obj, urls)
            the_shop.save()
            result = True
        else:
            logger.error("download shop extend html error. shop_id: %d, item_id: %d, url: %s."
                         % (shop_id, first_item_id, url))
        if result:
            logger.info("success get shop extend info. shop_id: %d. type: %d." % (shop_id, shop_type))
        else:
            logger.error("fail get shop extend info. shop_id: %d. type: %d." % (shop_id, shop_type))
    except:
        logger.error("update_shop_extend_info failed. shop_id: %s. type: %d, error info: %s"
                     % (shop_id, shop_type, traceback.format_exc()))
def get_tmall_shop_collected_count(the_shop, shop_html_obj, item_html_obj, urls):
    """Get the number of users who favorited the Tmall shop."""
    try:
        is_done = False
        if not is_done:
            collected_count = shop_html_obj.xpath(u"//em[@class='j_CollectBrandNum']/text()")
            if collected_count and collected_count[0].isdigit():
                the_shop.favorited_user_count = int(collected_count[0])
                is_done = True
        if not is_done:
            shop_home_html = download_with_referer(urls['shop_url'], None)
            shop_home_obj = parse_html(shop_home_html, 'gbk')
            collected_count = shop_home_obj.xpath(u"//em[@class='j_CollectBrandNum']/text()")
            if collected_count and collected_count[0].isdigit():
                the_shop.favorited_user_count = int(collected_count[0])
                is_done = True
        if not is_done:
            collected_count = item_html_obj.xpath(u"//em[@class='j_CollectBrandNum']/text()")
            if collected_count and collected_count[0].isdigit():
                the_shop.favorited_user_count = int(collected_count[0])
                is_done = True
        if not is_done:
            logger.error("get shop collected count failed. shop_id: %d." % the_shop.get_shop_id())
    except:
        logger.error("get shop favorite count failed. shop_id: %s. error info: %s"
                     % (the_shop.get_shop_id(), traceback.format_exc()))
def crawl_page(self, url):
    retry_count = 1
    while retry_count >= 0:
        try:
            data = self.download_page(url)
            if not data:
                logger.warn("crawl %s %s failed", self.id, url)
                return None, None, None, None
            if FLAGS.dump:
                dumpf = open("%s_%s" % (self.id, url.replace('/', '_').replace(':', '_').replace('&', '_').replace('?', '_')), 'w')
                dumpf.write(data)
                dumpf.close()
            if FLAGS.debug_parser:
                import pdb
                pdb.set_trace()
            # "没有找到相应的店铺信息" means "no matching shop information was found"
            if data.find(u"没有找到相应的店铺信息".encode('gbk')) > 0:
                logger.warn("Shop %s is offline %s", self.id, self.url)
                raise ShopOfflineException(data)
            html_obj = parse_html(data, encoding="gb18030")
            self.level_img = html_obj.xpath("//img[@class='rank']/@src")
            self.nick_url = html_obj.xpath("//a[@class='shop-name']/@href")
            if not self.nick_url:
                self.nick_url = html_obj.xpath("//div[@id='shop-info']//a/@href")
            result = html_obj.xpath("//div[@id='anchor']//div[@class='search-result']//span/text()")
            items = html_obj.xpath("//div[@id='anchor']//div[@class='item']")
            pages = html_obj.xpath("//div[@id='anchor']//div[@class='pagination']/a[@class='J_SearchAsync']/@href")
            if not result:
                result = ITEM_NUMBER_RE.findall(data)
                if result and not items:
                    items = html_obj.xpath("//ul[@class='shop-list']//div[@class='item']")
            if not result:
                result = html_obj.xpath("//div[@id='J_ShopSearchResult']//div[@class='search-result']//span/text()")
                items = html_obj.xpath("//div[@id='J_ShopSearchResult']//dl[contains(@class, 'item')]")
                pages = html_obj.xpath("//div[@id='J_ShopSearchResult']//div[@class='pagination']/a[@class='J_SearchAsync']/@href")
            if not result:
                # pageLen = ['1/107']
                pageLen = html_obj.xpath("//p[@class='ui-page-s']//b[@class='ui-page-s-len']/text()")
                items = html_obj.xpath("//div[@class='J_TItems']//dl[contains(@class, 'item')]")
                c = 0
                if "/" in pageLen[0]:
                    c = int(pageLen[0].split("/")[1].strip()) * len(items)
                else:
                    c = int(pageLen[0].strip()) * len(items)
                result.append(str(c))
                pages = html_obj.xpath("//div[@class='J_TItems']//div[@class='pagination']/a[@class='J_SearchAsync']/@href")
            if not result and not items and not pages:
                logger.warn("crawl %s %s -- 0 items found, page len %s", self.id, url, len(data))
                if retry_count > 0 and len(data) < 1024:
                    retry_count -= 1
                    time.sleep(1.0)
                    continue
            return result, items, pages, data
        except ShopOfflineException, e:
            raise e
        except BannedException, e:
            raise e
def crawl_tao123(shops):
    base_url = "http://dianpu.tao123.com/nvzhuang/%s.php"
    end = 22
    for i in range(1, end + 1):
        url = base_url % i
        html = download(url)
        html_obj = parse_html(html)
        shops.update(html_obj.xpath("//div[@class='cg_shop_info']//a/@href"))
def crawl_dirtbshop(shops):
    base_url = "http://dirtbshop.com/list_shop_%s_1_1.html"
    end = 251
    for i in range(1, end + 1):
        url = base_url % i
        html = download(url)
        html_obj = parse_html(html)
        import pdb; pdb.set_trace()
        urls = html_obj.xpath("//span[@class='grebtn_in']/a/@href")
def crawl_tao123(shops):
    base_url = "http://www.meilishuo.com/shop/top/0/%s"
    end = 203
    for i in range(end):
        logger.debug("processing %s", i)
        url = base_url % i
        html = download(url)
        html_obj = parse_html(html)
        shops.update(html_obj.xpath("//div[@class='shop_item']//a/@href"))
def crawl_tao123(shops):
    for line in open(FLAGS.path):
        try:
            line = line.strip()
            url = "http://www.meilishuo.com%s" % line
            html = download(url)
            html_obj = parse_html(html)
            shop_url = html_obj.xpath("//div[@class='shop_summary']/a/@href")
            logger.debug("processing %s -> %s", line, shop_url)
            shops.update(shop_url)
        except:
            logger.error("processing %s failed", line)
def crawl_title(self):
    try:
        self.data = self.crawl_page(self.url)
        if not self.data:
            logger.warn("download %s %s page failed, possible network connection failure",
                        self.item_id, self.num_id)
            return

        # check tmall
        if not self.is_tmall and len(self.data) < 256 and self.url.find('item.taobao.com') > 0 \
                and self.data.find("window.location.href='http://detail.tmall.com/item.htm'+window.location.search") > 0:
            self.data = self.crawl_page(self.url.replace('item.taobao.com', 'detail.tmall.com'))

        if self.check_offline():
            self.is_offline = True

        self.html_obj = parse_html(self.data, encoding="gb18030")
        title = self.html_obj.xpath("//html/head/title/text()")
        if title and title[0].find(u"转卖") > 0:
            self.is_offline = True
        if title:
            self.title = title[0].encode('utf8').replace("-淘宝网", "").replace("-tmall.com天猫", "")

        #tblogo = self.html_obj.xpath("//*[@id='shop-logo']")
        tmalllogo = self.html_obj.xpath("//*[@id='mallLogo']")
        if not tmalllogo:
            tmalllogo = self.html_obj.xpath("//*[@id='simple-logo']")
        if not self.is_tmall and tmalllogo:
            self.is_tmall = True

        self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
        if not len(self.thumbImages):
            try:
                # try load thumb images for tmall page
                self.thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0]
                                    for x in self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                # taobao @src to @data-src
                if not len(self.thumbImages):
                    self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")
            except:
                logger.warn("No thumbs found %s", self.item_id)

        if self.is_tmall:
            self.cid = get_val(self.data, "categoryId").split('&')[0]
        else:
            self.cid = get_val(self.data, "cid")
        logger.info("Got %s %s html success", self.item_id, self.num_id)
    except:
        logger.error("crawling %s %s unknown exception %s", self.item_id, self.num_id,
                     traceback.format_exc(), extra={'tags': ['crawlItemException', ]})
        raise
def crawl_main():
    for host in open(FLAGS.path):
        url = "http://%s" % (host.strip())
        try:
            html = download(url)
            #import pdb; pdb.set_trace()
            html_obj = parse_html(html, 'gbk')
            if url.find('tmall.com') > 0:
                shop_url = html_obj.xpath("//h3[@class='shop-title']/a/@href")[0]
                shop_name = html_obj.xpath("//h3[@class='shop-title']/a/text()")[0]
                print shop_url, shop_name.encode('utf8')
            else:
                shop_url = html_obj.xpath("//div[@class='shop-info-simple']/a/@href")[0]
                shop_name = html_obj.xpath("//div[@class='shop-info-simple']/a/text()")[0]
                shop_rank = html_obj.xpath("//span[@class='shop-rank']//img/@src")[0]
                #good_rate = html_obj.xpath("//li[@class='goodrate']/text()")[0]
                print shop_url, shop_name.encode('utf8'), shop_rank
        except KeyboardInterrupt:
            raise
        except:
            logger.warn("processing %s failed, %s", url, traceback.format_exc())
def crawl():
    company_id = 18
    url = "https://www.my089.com/Loan/default.aspx?pid=1"
    request_headers = {'Referee': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='yema rt']/span[@class='z_page']/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")
        for p in range(1, int(page) + 1):
            url = "https://www.my089.com/Loan/default.aspx?pid=" + str(p)
            logger.info("page url: %s", url)
            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath("//div[@class='Loan_box']/dl[@class='LoanList']")
            if len(loans) > 0:
                for loan in loans:
                    if str(loan.xpath("dd[last()]/p/span/text()")[0]) == "100%":
                        continue
                    href = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@href")[0])
                    original_id = href.split("=")[1].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "https://www.my089.com/Loan/" + href
                        loan_obj.title = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@title")[0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("dd[4]/span/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(loan.xpath("dd[3]/span/text()")[0].encode("UTF-8")).strip().replace("%/年", "")
                        loan_obj.period = str(loan.xpath("dd[5]/span/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                        s = str(loan.xpath("dd[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("个", "")
                        loan_obj.period_unit = s.split("/")[0].strip()
                        loan_obj.repayment = s.split("/")[1].strip()
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db but no longer online should be taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
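# --- Assumed Loan helper ----------------------------------------------------
# The loan-site crawlers below all share a Loan record object with db_create /
# db_update / db_offline methods and PERIOD_UNIT_* constants. Its real
# definition is not part of these snippets; this is only a hypothetical sketch
# inferred from how the crawlers use it (column names and status values are
# guesses).
class Loan(object):
    # guessed from the ".replace(PERIOD_UNIT_DAY, ...)" calls on utf-8 byte strings
    PERIOD_UNIT_DAY = "天"
    PERIOD_UNIT_MONTH = "月"

    def __init__(self, company_id, original_id=None):
        self.company_id = company_id
        self.original_id = original_id
        self.href = self.title = self.description = ""
        self.borrow_amount = self.rate = self.cast = self.schedule = ""
        self.period = self.period_unit = self.repayment = ""

    def db_create(self, db):
        # insert a newly discovered loan (sketch; the real column list may differ)
        db.execute("insert into loan (company_id, original_id, url, title, borrow_amount, rate, "
                   "period, period_unit, repayment, schedule, status, create_time, update_time) "
                   "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,0,now(),now())",
                   self.company_id, self.original_id, self.href, self.title, self.borrow_amount,
                   self.rate, self.period, self.period_unit, self.repayment, self.schedule)

    def db_update(self, db):
        # refresh the funding progress of a loan that is already tracked
        db.execute("update loan set schedule=%s, update_time=now() "
                   "where company_id=%s and original_id=%s",
                   self.schedule, self.company_id, self.original_id)

    def db_offline(self, db, original_ids):
        # mark loans that disappeared from the listing page as offline
        # (the crawlers query status=0 for online loans, so a non-zero status is assumed here)
        for oid in original_ids:
            db.execute("update loan set status=1, update_time=now() "
                       "where company_id=%s and original_id=%s",
                       self.company_id, oid)
# ----------------------------------------------------------------------------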
def crawl_item2(kwargs):
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']
    item_id = item[0]
    num_id = item[1]
    is_success = False
    crawl_result = ((item_id, {'suc1': 0, 'count1': 0, 'suc': 0, 'count': 0}),)
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            items = conn.execute("select html, desc_content from crawl_html where crawl_html.item_id=%s;" % item_id)
            result = list(items)
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1]

                html_obj = parse_html(html)
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0]
                                   for x in html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                    # taobao @src to @data-src
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")

                if len(thumbImages) == 0:
                    logger.error("crawl item %s %s not found thumb images html size %s",
                                 item_id, num_id, len(html),
                                 extra={'tags': ['crawl_thumb_empty', ]})
                    return crawl_result

                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M | re.S)
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                desc_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!", item_id, num_id,
                                extra={'tags': ['crawl_nodesc', ]})

                images = []
                pos = 1
                for url in thumbImages:
                    images.append((tr.sub(r'\1', url), pos, 1))
                    pos += 1
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                        pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                        pos += 1

                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path, server_path, org_server_path,
                                           kwargs['statshost'], kwargs['statsport'])
                item_crawler.crawl(images, ((710, 10000),), is_commit, conn, is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary),)
        except Exception, e:
            logger.error("crawl item %s %s got exception %s", item_id, num_id, traceback.format_exc(),
                         extra={'tags': ['crawl_exception', ]})
    finally:
        conn.close()

    Statsd.update_stats("guang.crawl.downimgcount",
                        crawl_result[0][1]['suc1'] + crawl_result[0][1]['suc'],
                        host=kwargs['statshost'], port=kwargs['statsport'])
    if is_success:
        logger.info("crawl item %s %s success %s", item_id, num_id, crawl_result)
        Statsd.increment('guang.crawl.itemimg.succ', host=kwargs['statshost'], port=kwargs['statsport'])
    else:
        logger.warn("crawl item %s %s failed %s", item_id, num_id, crawl_result,
                    extra={'tags': ['crawl_failed', ]})
        Statsd.increment('guang.crawl.itemimg.failed', host=kwargs['statshost'], port=kwargs['statsport'])
def crawl_wzdai():
    url = "https://www.wzdai.com/invest/index.html?status=1&page=1&order=-3"
    request_headers = {'Referee': "https://www.wzdai.com", 'User-Agent': DEFAULT_UA}
    company_id = 3

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        pages_obj = htm_obj.xpath("//div[@class='page']/div[@align='center']/span/text()")[0]
        page = int(str(pages_obj.encode("utf-8")).split("条")[1].split("页")[0])
        for p in range(1, page + 1):
            url = "https://www.wzdai.com/invest/index.html?status=1&page=" + str(p) + "&order=-3"
            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath("//div[@class='invest_box']")
            if len(loans) > 0:
                for loan in loans:
                    href = "https://www.wzdai.com" + str(loan.xpath("h1/a[@class='del']/@href")[0])
                    title = loan.xpath("h1/a[@class='del']/text()")[0].strip().encode("UTF-8")
                    borrow_amount = str(loan.xpath("div[@class='invest_box_Info']/div[@class='prize']/span/b/text()")[0])
                    rate = str(loan.xpath("div[@class='invest_box_Info']/div[@class='prize']/font/b/text()")[0])
                    text = loan.xpath("div[@class='invest_box_Info']/div[@class='text']")
                    loan_period = ""
                    repayment = ""
                    for lp in text:
                        p = lxml.html.tostring(lp).strip().replace("\r\n", "").split("<br>")
                        html_parser = HTMLParser.HTMLParser()
                        loan_period = html_parser.unescape(p[0].replace('<div class="text">', "").strip()).encode("UTF-8").replace("借款期限:", "")
                        repayment = html_parser.unescape(p[1].strip()).encode("UTF-8").replace("还款方式:", "")
                    cast = loan.xpath("div[@class='invest_box_Info']/div[@class='text2']/text()")[0].strip()\
                        .encode("UTF-8").replace("已投:¥", "").replace("元", "")
                    schedule = str(loan.xpath("div[@class='invest_box_Info']/div[@class='percent_big']/div[@class='percent_small']/font/text()")[0])
                    logger.info(href, title, borrow_amount, rate, cast, schedule, loan_period, repayment)
                    db = get_db_engine()
                    db.execute("insert into loan (company_id,url,title,borrow_amount,rate,loan_period,"
                               "repayment,cast,schedule,crawl_status,status,create_time,update_time) "
                               "values (1,%s,%s,%s,%s,%s,%s,%s,%s,0,0,now(),now())",
                               href, title, borrow_amount, rate, loan_period, repayment, cast, schedule)
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 26
    url = "http://www.htyd50.com/trade/borrow/bidding.htm"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//div[@class='page_block']/div[@class='page_block_content']/div[@class='min_height_300 mb_30']/div[@class='w980 clearfix']")
        print len(loans)
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[2]/div[1]/div[1]/a/@href")[0].encode("utf-8"))
                original_id = href.replace(".html", "").split("/")[5].strip()
                print href, original_id
                # the commented-out block below is still the template copied from the
                # xinhehui crawler; it has not been adapted to this site yet
                # if original_id:
                #     online_ids_set.add(original_id)
                #
                #     if original_id in db_ids_set:
                #         update_ids_set.add(original_id)
                #
                #         loan_obj = Loan(company_id, original_id)
                #         if loan.xpath("td[7]/div/a"):
                #             loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                #         else:
                #             loan_obj.schedule = "0"
                #         loan_obj.db_update(db)
                #     else:
                #         new_ids_set.add(original_id)
                #
                #         loan_obj = Loan(company_id, original_id)
                #         loan_obj.href = "https://www.xinhehui.com" + href
                #         title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
                #         if loan.xpath("td[1]/p[1]/a/em"):
                #             title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
                #         else:
                #             title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
                #         loan_obj.title = title_1 + title_2
                #         borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "")
                #         if borrow_amount.find("万") > 0:
                #             loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
                #         else:
                #             loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", ""))
                #
                #         if loan.xpath("td[4]/span"):
                #             period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
                #         else:
                #             period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
                #         if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                #             loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                #             loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                #         else:
                #             loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                #             loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                #
                #         loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
                #         loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                #         if loan.xpath("td[7]/div/a"):
                #             loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                #         else:
                #             loan_obj.schedule = "0"
                #
                #         loan_obj.db_create(db)

        # logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))
        #
        # # ids in db but no longer online should be taken offline
        # off_ids_set = db_ids_set - online_ids_set
        # if off_ids_set:
        #     loan_obj = Loan(company_id)
        #     loan_obj.db_offline(db, off_ids_set)
        #     logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 20
    url = "http://www.xiaomabank.com/finance.do"
    request_headers = {"Referee": "http://www.xiaomabank.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='pil_main']/table[@class='pil_table']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8")).replace("width:", "").strip().replace("%;", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.xiaomabank.com/" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[5]/strong/text()")[0].encode("utf-8")).strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).replace("个月", "").strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8")).replace("width:", "").strip().replace("%;", "")

                    # note: the detail page comes back gzip-compressed and must be decompressed
                    resp = urllib2.urlopen(loan_obj.href)
                    respInfo = resp.info()
                    if ("Content-Encoding" in respInfo) and (respInfo["Content-Encoding"] == "gzip"):
                        respHtml = zlib.decompress(resp.read(), 16 + zlib.MAX_WBITS)
                        info_htm_parse = parse_html(respHtml, encoding="utf-8")
                        loan_obj.repayment = str(info_htm_parse.xpath("//div[@id='pi_lt_bottom']/div[1]/div[1]/a/text()")[0].encode("utf-8"))

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db but no longer online should be taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 8
    url = "http://www.eloancn.com/new/loadAllTender.action?page=3&sidx=progress&sord=desc"
    request_headers = {'Referee': "http://www.eloancn.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        for p in range(1, 4):
            url = "http://www.eloancn.com/new/loadAllTender.action?page=%s" % p
            logger.info("page url:%s", url)
            # awkward page: the fields of a single loan are not grouped inside one div
            loan_htm = download_page(url, request_headers)
            loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
            htm_1 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd300 pdl10 fl']")
            htm_2 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd140 fl']")
            htm_3 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd130 fl pdl10']")
            htm_4 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd130 fl']")

            loan_list = []
            for h1 in htm_1:
                loan_obj = Loan(company_id)
                loan_obj.title = str(h1.xpath("h3/a[@class='fl']/text()")[0].encode("utf-8"))
                loan_obj.href = str(h1.xpath("h3/a[@class='fl']/@href")[0]).replace(":80", "")
                loan_obj.original_id = loan_obj.href.split("=")[1]
                loan_list.append(loan_obj)
            for index, h2 in enumerate(htm_2):
                loan_list[index].borrow_amount = str(h2.xpath("p[@class='colorCb mt10']/text()")[0].encode("utf-8")).replace("¥", "").replace(",", "")
                loan_list[index].rate = str(h2.xpath("p[@class='colorE6']/span/text()")[0]).replace("%", "")
            for index, h3 in enumerate(htm_3):
                loan_list[index].period = str(h3.xpath("p/span/text()")[0].encode("utf-8"))
                loan_list[index].period_unit = loan_obj.PERIOD_UNIT_MONTH
                loan_list[index].repayment = str(h3.xpath("p[@class='']/text()")[0].encode("utf-8"))
            for index, h4 in enumerate(htm_4):
                loan_list[index].schedule = str(h4.xpath("p/span/em/text()")[0]).strip().replace("%", "")

            # drop loans that are already fully funded
            new_list = [i for i in loan_list if i.schedule != "100"]
            for loan in new_list:
                online_ids_set.add(loan.original_id)
                if loan.original_id in db_ids_set:
                    update_ids_set.add(loan.original_id)
                    loan.db_update(db)
                else:
                    new_ids_set.add(loan.original_id)
                    loan.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))
            time.sleep(5)

        # ids in db but no longer online should be taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj_off = Loan(company_id)
            loan_obj_off.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 13
    url = "http://www.yirendai.com/loan/list/1?period=12&currRate=&amt=&progress="
    request_headers = {'Referee': "http://www.yirendai.com/loan/list/1", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='bidList']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath("div[2]/div[2]/div/div[@class='bid_empty_errortip']"):
                    continue
                href = str(loan.xpath("div[2]/div/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.yirendai.com" + href
                    loan_obj.title = str(loan.xpath("div[2]/div/h3/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("div[2]/div/div[3]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period = str(loan.xpath("div[2]/div/div[4]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db but no longer online should be taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 3
    url = "http://www.91wangcai.com/invest/index.html"
    request_headers = {'Referee': "http://www.91wangcai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="gb2312")
        loans = loan_htm_parse.xpath("//div[@class='proBoxNew']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='hd']/a/@href")[0])
                original_id = href.split(".")[0].split("/")[2].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.91wangcai.com" + href
                    loan_obj.title = autodecode(str(loan.xpath("div[@class='hd']/a/text()")[0].encode("gb2312"))).encode("utf-8")
                    loan_obj.borrow_amount = autodecode(str(loan.xpath("div[@class='bd']/table/tr[1]/td[1]/em/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("¥", "")
                    loan_obj.rate = str(loan.xpath("div[@class='bd']/table/tr[1]/td[2]/em/text()")[0]).strip().replace("%", "")

                    loan_period_text = lxml.html.tostring(loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0]) \
                        .replace("<em>", "").replace("</em>", "")
                    html_parser = HTMLParser.HTMLParser()
                    period = html_parser.unescape(loan_period_text).encode("utf-8").strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.repayment = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[1]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("还款方式:", "")
                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db but no longer online should be taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def get_taobao_shop_favorite_count(the_shop, shop_html_obj, urls):
    """Get the number of users who favorited the Taobao shop."""
    try:
        favorite_count_success = False
        favorite_param = shop_html_obj.xpath(u"//div[@class='item collect-num']/span[contains(@data-info,'SCCP')]/@data-info")
        if favorite_param:
            the_param = favorite_param[0].split('&')
            favorite_url = "%s?callback=jsonp%d&t=%s&keys=%s" % (
                the_param[1].split('=')[1],
                random.randint(1000, 9999),
                str(int(time.time() * 1000)),
                the_param[0].split('=')[1])
            urls['favorite_url'] = favorite_url
            favorite_html = download_with_referer(favorite_url, urls['shop_rate_url'])
            if favorite_html:
                logger.debug("download shop favorite html. shop_id: %d, url: %s. html length: %d." % (
                    the_shop.get_shop_id(), favorite_url, len(favorite_html)))
                the_shop.favorited_user_count = int(favorite_num_reg.search(favorite_html).group(1))
                favorite_count_success = True
            else:
                logger.error("download shop favorite html error. shop_id: %d, url: %s." % (
                    the_shop.get_shop_id(), favorite_url))

        if not favorite_count_success:
            logger.debug("use pattern left edition to get favorite data ")
            favorite_param = shop_html_obj.xpath(u"//li[@id='J_SCollCount'][@data-info]/@data-info")
            if favorite_param:
                the_param = favorite_param[0].split('&')
                favorite_url = "%s?t=%s&keys=%s&callback=setShopStat" % (
                    the_param[1].split('=')[1],
                    str(int(time.time() * 1000)),
                    the_param[0].split('=')[1])
                favorite_html = download_with_referer(favorite_url, urls['shop_rate_url'])
                if favorite_html:
                    the_shop.favorited_user_count = int(favorite_num_reg.search(favorite_html).group(1))
                    favorite_count_success = True

        if not favorite_count_success:
            logger.debug("use pattern for old edition to get favorite data ")
            shop_description_url = shop_html_obj.xpath(u"//a[@title='店铺介绍']/@href")
            if shop_description_url:
                shop_description_html = download_with_referer(shop_description_url[0], urls['shop_rate_url'])
                if shop_description_html:
                    shop_description_html_obj = parse_html(shop_description_html, 'gbk')
                    favorite_param = shop_description_html_obj.xpath(u"//li[@id='J_SCollCount'][@data-info]/@data-info")
                    if favorite_param:
                        the_param = favorite_param[0].split('&')
                        favorite_url = "%s?t=%s&keys=%s&callback=setShopStat" % (
                            the_param[1].split('=')[1],
                            str(int(time.time() * 1000)),
                            the_param[0].split('=')[1])
                        favorite_html = download_with_referer(favorite_url, shop_description_url)
                        if favorite_html:
                            the_shop.favorited_user_count = int(favorite_num_reg.search(favorite_html).group(1))
                            favorite_count_success = True

        if not favorite_count_success:
            logger.error("get shop favorite count failed. shop_id: %d." % the_shop.get_shop_id())
    except:
        logger.error("get shop favorite count failed. shop_id: %s. error info: %s"
                     % (the_shop.get_shop_id(), traceback.format_exc()))
def crawler(sql):
    db = get_db_engine()
    shops = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for shop in shops:
        shop_id = shop[0]
        url = str(shop[1])
        type = shop[2]
        if url[-1] != '/':
            url += "/"
        try:
            shop_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
            dongtai_url = url + "dongtai.htm"
            dongtai_data = download(dongtai_url, shop_headers)
            if dongtai_data:
                dongtai_obj = parse_html(dongtai_data, encoding="gb18030")
                dongtai_title = dongtai_obj.xpath("//title/text()")[0].encode('utf-8')
                if '店铺动态' in dongtai_title:
                    microscope_data = dongtai_obj.xpath("//*[@name='microscope-data']/@content")
                    userId = get_val(str(microscope_data), "userId")
                    if userId:
                        dongtai_headers = {'Referer': dongtai_url, 'User-Agent': DEFAULT_UA}
                        promotion_url = "http://shuo.taobao.com/feed/v_front_feeds.htm?_input_charset=utf-8&page=1" \
                                        "&userId=%s&vfeedTabId=115" % userId
                        promotion_data = download(promotion_url, dongtai_headers)
                        if promotion_data:
                            promotion_obj = parse_html(promotion_data, encoding="gb18030")
                            i = 0
                            while i < 10:
                                feedInfo = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-text']//span[@class='J_FeedInfo']/text()")[i].encode('utf-8')
                                if '店铺促销中' in feedInfo or '限时折扣' in feedInfo or '折扣限时' in feedInfo:
                                    #title = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dt//a/text()")[i]
                                    link = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//a[@class='fd-view-detail']/@href")[i]
                                    promotion_price = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price']/strong/text()")[i]
                                    price = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price g_price-original']/strong/text()")[i]
                                    promotion_time = promotion_obj.xpath(u"//div[@class='fd-item show-detail']//div[@class='fd-container']//dd[contains(text(),'起止日期')]/text()")[i]
                                    pt = promotion_time.encode('utf-8').replace("起止日期:", "").split(" - ")
                                    start_time = pt[0].replace(".", "-")
                                    end_time = pt[1].replace(".", "-")
                                    if '2013' not in pt[1] or '2014' not in pt[1]:
                                        end_time = '2013-' + end_time
                                    if start_time > end_time:
                                        end_time = end_time.replace("2013", "2014")
                                    num_id = get_numiid(link, dongtai_headers)
                                    if num_id:
                                        sql = "select id from shop_promotion where shop_id=%s and num_id=%s" % (shop_id, num_id)
                                        re = list(db.execute(sql))
                                        if not re:
                                            db.execute("insert into shop_promotion (shop_id, num_id, price, "
                                                       "promotion_price, start_time, end_time, create_time, "
                                                       "last_update_time) values (%s,'%s',%s,%s,'%s','%s',now(),now())" % (
                                                           shop_id, num_id, price.replace(',', ''),
                                                           promotion_price.replace(',', ''), start_time, end_time))
                                    else:
                                        logger.error("shop %s:%s crawler num_id failed", shop_id, url)
                                i += 1
                            logger.info("shop %s:%s crawler promotion item num=%s", shop_id, url, i)
                        else:
                            logger.warning("shop %s:%s not promotion info", shop_id, url)
                    else:
                        logger.error("shop %s:%s crawler userId failed", shop_id, url)
                else:
                    logger.error("shop %s:%s not dongtai page", shop_id, url)
        except:
            logger.error("shop %s:%s crawler failed %s", shop_id, url, traceback.format_exc())
def crawl():
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if loan.xpath("td[last()]/a/@href")[0].encode("utf-8") == "javascript:;":
                    # skip loans that have already finished
                    continue
                href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
                original_id = href.split("id%3D")[1].encode("utf-8").strip()
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.xinhehui.com" + href
                    title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
                    if loan.xpath("td[1]/p[1]/a/em"):
                        title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
                    else:
                        title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.title = title_1 + title_2
                    borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "")
                    if borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
                    else:
                        loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", ""))

                    if loan.xpath("td[4]/span"):
                        period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
                    else:
                        period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
                    loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db but no longer online should be taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 15
    url = "https://www.iqianbang.com/invest"
    request_headers = {"Referee": "https://www.iqianbang.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='item']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                # only keep loans still being funded ("融资中")
                if str(loan.xpath("td[7]/text()")[0].encode("utf-8")).strip() != "融资中":
                    continue
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("-")[3].replace(".shtml", "")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.iqianbang.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace(",", "").replace("元", "")
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = str(loan.xpath("td[2]/span/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")

                    # the repayment type is only on the detail page, so fetch it
                    loan_info_htm = download_page(loan_obj.href, headers={"Referee": url, "User-Agent": DEFAULT_UA})
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="UTF-8")
                    loan_obj.repayment = str(loan_info_htm_parse.xpath("//div[@class='inright']/table[@class='idetable']")[0]
                                             .xpath("tr[2]/td[2]/span/text()")[0].encode("utf-8")).strip()

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db but no longer online should be taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    request_headers = {'Referee': "http://www.jimubox.com", 'User-Agent': DEFAULT_UA}
    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently listed online
    online_ids_set = set()
    # newly listed ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/@href")[0])
                if not href.find("Index") > 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/text()")[0].encode("utf-8"))
                    loan_obj.description = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-detail']/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-sum-money']/text()")[0].encode("utf-8"))\
                        .strip() + "0000"
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")

                    rate = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-left']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    # the rate cell can read "base+bonus"; add the two parts together
                    if rate.find("+") > 0:
                        rate_list = rate.split("+")
                        loan_obj.rate = str(float(rate_list[0]) + float(rate_list[1]))
                    else:
                        loan_obj.rate = rate
                    loan_obj.repayment = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h6/span/text()")[0].encode("utf-8"))
                    loan_obj.period = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-right']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # ids in db but no longer listed online -> take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
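
# The crawl() functions in this file all share the same bookkeeping: ids already
# stored in the db versus ids currently listed on the site decide what gets
# created, updated, or taken offline. A minimal, self-contained sketch of that
# set arithmetic (plain sets stand in for the real Loan/db objects):
def diff_loan_ids(db_ids, online_ids):
    """Return (new, updated, offline) id sets for one crawl pass."""
    new_ids = online_ids - db_ids        # listed online, not yet stored
    update_ids = online_ids & db_ids     # listed online and already stored
    off_ids = db_ids - online_ids        # stored, but no longer listed
    return new_ids, update_ids, off_ids

# Example: "1" and "2" are in the db, the site currently lists "2" and "3",
# so "3" is created, "2" is updated and "1" is taken offline.
print diff_loan_ids(set(["1", "2"]), set(["2", "3"]))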
def crawl(): company_id = 16 # url = "http://www.itouzi.com/dinvest/invest/index" url = "http://www.itouzi.com/dinvest/debt/index" request_headers = {"Referee": "http://www.itouzi.com", "User-Agent": DEFAULT_UA} db = get_db_engine() db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id)) # db all db_ids_set = set() # 在线的所有id online_ids_set = set() # new new_ids_set = set() # update update_ids_set = set() for id in db_ids: db_ids_set.add(id[0].encode("utf-8")) # debug if FLAGS.debug_parser: import pdb pdb.set_trace() try: loan_htm = download_page(url, request_headers) loan_htm_parse = parse_html(loan_htm, encoding="UTF-8") # 注意ul的class后面有个空格 loans = loan_htm_parse.xpath("//ul[@class='invest-product-case-list mtn btn clearfix ']/li") if len(loans) > 0: for loan in loans: if not loan.xpath("div[@class='i-p-c-subscription']/ul[@class='i-p-c-s-detail']"): continue href = str(loan.xpath("h2/a[@class='fl']/@href")[0]) original_id = href.split("id=")[1] if original_id: online_ids_set.add(original_id) if original_id in db_ids_set: update_ids_set.add(original_id) # loan_obj = Loan(company_id, original_id) # loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "") # loan_obj.db_update(db) else: new_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.href = "http://www.itouzi.com" + href loan_obj.title = str(loan.xpath("h2/a[@class='fl']/text()")[0].encode("utf-8")).strip() loan_obj.repayment = ( str(loan.xpath("p/span[2]/text()")[0].encode("utf-8")).strip().replace("还款方式:", "") ) loan_obj.borrow_amount = int(loan.xpath("p/span[3]/strong/text()")[0]) * 10000 loan_obj.rate = ( str(loan.xpath("p/span[5]/em[1]/text()")[0].encode("utf-8")).strip().replace("%", "") ) period = str(loan.xpath("p/span[4]/strong/text()")[0].encode("utf-8")).strip() if period.find(loan_obj.PERIOD_UNIT_DAY) > 0: loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "") loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY else: loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "") loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH # 这个进度这块还不确定,需等有标时检查一遍 if loan.xpath("div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']"): loan_obj.schedule = ( str( loan.xpath( "div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']/span[1]/span[last()]/text()" )[0].encode("utf-8") ) .strip() .replace("%", "") ) print loan_obj.schedule # loan_obj.db_create(db) # # logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set)) # ## db - 新抓取的 = 就是要下线的 # off_ids_set = db_ids_set - online_ids_set # if off_ids_set: # loan_obj = Loan(company_id) # loan_obj.db_offline(db, off_ids_set) # logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set)) except: logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl_title(self): try: self.data = self.crawl_page(self.url) if not self.data: logger.warn( "download %s %s page failed, possible network connection failure", self.item_id, self.num_id) return # check tmall if not self.is_tmall and len(self.data) < 256 and self.url.find( 'item.taobao.com' ) > 0 and self.data.find( "window.location.href='http://detail.tmall.com/item.htm'+window.location.search" ) > 0: self.data = self.crawl_page( self.url.replace('item.taobao.com', 'detail.tmall.com')) if self.check_offline(): self.is_offline = True self.html_obj = parse_html(self.data, encoding="gb18030") title = self.html_obj.xpath("//html/head/title/text()") if title and title[0].find(u"转卖") > 0: self.is_offline = True if title: self.title = title[0].encode('utf8').replace( "-淘宝网", "").replace("-tmall.com天猫", "") #tblogo = self.html_obj.xpath("//*[@id='shop-logo']") tmalllogo = self.html_obj.xpath("//*[@id='mallLogo']") if not tmalllogo: tmalllogo = self.html_obj.xpath("//*[@id='simple-logo']") if not self.is_tmall and tmalllogo: self.is_tmall = True self.thumbImages = self.html_obj.xpath( "//ul[@id='J_UlThumb']//img/@src") if not len(self.thumbImages): try: # try load thumb images for tmall page self.thumbImages = [ IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style") ] # taobao @src to @data-src if not len(self.thumbImages): self.thumbImages = self.html_obj.xpath( "//ul[@id='J_UlThumb']//img/@data-src") except: logger.warn("No thumbs found %s", self.item_id) if self.is_tmall: self.cid = get_val(self.data, "categoryId").split('&')[0] else: self.cid = get_val(self.data, "cid") logger.info("Got %s %s html success", self.item_id, self.num_id) except: logger.error("crawling %s %s unknown exception %s", self.item_id, self.num_id, traceback.format_exc(), extra={'tags': [ 'crawlItemException', ]}) raise
def crawl_item2(kwargs): #signal.signal(signal.SIGINT, signal.SIG_IGN) item = kwargs['item'] is_commit = kwargs['is_commit'] crawl_path = kwargs['crawl_path'] server_path = kwargs['server_path'] org_server_path = kwargs['org_server_path'] is_remove = kwargs['is_remove'] item_id = item[0] num_id = item[1] is_success = False crawl_result = ((item_id, {'suc1': 0, 'count1': 0, 'suc': 0, 'count': 0}),) try: conn = get_db_engine(**kwargs).connect() try: items = conn.execute("select html, desc_content from crawl_html where crawl_html.item_id=%s;" % item_id) result = list(items) if len(result) == 1: html = result[0][0] desc_content = result[0][1] html_obj = parse_html(html) thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src") if len(thumbImages) == 0: thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")] # taobao @src to @data-src if not len(thumbImages): thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src") if len(thumbImages) == 0: logger.error("crawl item %s %s not found thumb images html size %s", item_id, num_id, len(html), extra={'tags':['crawl_thumb_empty',]}) return crawl_result r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M|re.S) tr = re.compile("(.*)_\d+x\d+\.jpg$") tr_new = re.compile("(.+\.(jpg|png|gif))[^$]*.jpg$") desc_thumbs = desc_table_thumbs = lazy_desc_thumbs = [] if desc_content: desc_html = r.subn(r'\2', desc_content)[0] desc_html_obj = parse_html(desc_html) if desc_html_obj is not None: desc_table_thumbs = desc_html_obj.xpath("//table/@background") desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img[not(@data-ks-lazyload)]/@src") lazy_desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img/@data-ks-lazyload") else: logger.warn("crawl item %s %s desc content is empty!", item_id, num_id, extra={'tags':['crawl_nodesc',]}) images = [] pos = 1 for url in thumbImages: ori_url = None if tr.match(url): ori_url = tr.sub(r'\1', url) else: if tr_new.match(url): ori_url = tr_new.sub(r'\1', url) else: logger.error("crawl item %s %s thumb image urls can not be parsed!", item_id, num_id, extra={'tags':['crawl_exception',]}) images.append((ori_url, pos, 1)) pos += 1 for url in desc_table_thumbs: images.append((url, pos, 2)) pos += 1 for url in desc_thumbs: if "js/ckeditor" not in url: images.append((url, pos, 2)) pos += 1 for url in lazy_desc_thumbs: if "js/ckeditor" not in url: images.append((url, pos, 3)) pos += 1 logger.debug("crawling %s %s %s", item_id, num_id, images) item_crawler = ItemCrawler(item_id, num_id, crawl_path, server_path, org_server_path, kwargs['statshost'], kwargs['statsport']) item_crawler.crawl(images, ((710,10000),), is_commit, conn, is_remove) is_success = item_crawler.success crawl_result = ((item_id, item_crawler.summary),) except Exception, e: logger.error("crawl item %s %s got exception %s", item_id, num_id, traceback.format_exc(), extra={'tags':['crawl_exception',]}) finally: conn.close() Statsd.update_stats("guang.crawl.downimgcount", crawl_result[0][1]['suc1'] + crawl_result[0][1]['suc'], host = kwargs['statshost'], port = kwargs['statsport']) if is_success: logger.info("crawl item %s %s success %s", item_id, num_id, crawl_result) Statsd.increment('guang.crawl.itemimg.succ', host = kwargs['statshost'], port = kwargs['statsport']) else: logger.warn("crawl item %s %s failed %s", item_id, num_id, crawl_result, extra={'tags':['crawl_failed',]}) Statsd.increment('guang.crawl.itemimg.failed', host = kwargs['statshost'], port = kwargs['statsport'])
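
# crawl_item2() falls back between two regexes to recover the original image
# URL from a Taobao thumbnail URL. A standalone illustration of that clean-up;
# the sample URLs are invented, only the regexes are the ones used above:
import re

tr = re.compile(r"(.*)_\d+x\d+\.jpg$")                  # "..._60x60.jpg" style suffix
tr_new = re.compile(r"(.+\.(jpg|png|gif))[^$]*.jpg$")   # "....png_q90.jpg" style suffix

def original_image_url(url):
    """Strip the thumbnail suffix appended to an image URL, if one is recognised."""
    if tr.match(url):
        return tr.sub(r'\1', url)
    if tr_new.match(url):
        return tr_new.sub(r'\1', url)
    return None  # unknown layout; the crawler logs an error in this case

print original_image_url("http://img.example.com/i1/T1abc.jpg_60x60.jpg")   # .../T1abc.jpg
print original_image_url("http://img.example.com/i1/T1abc.png_q90.jpg")     # .../T1abc.png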
def crawl(): company_id = 12 url = "http://www.renrendai.com/lend/loanList.action" request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA} db = get_db_engine() db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id)) # db all db_ids_set = set() # 在线的所有id online_ids_set = set() # new new_ids_set = set() # update update_ids_set = set() for id in db_ids: db_ids_set.add(id[0].encode("utf-8")) # debug if FLAGS.debug_parser: import pdb pdb.set_trace() try: htm = download_page(url, request_headers) htm_obj = parse_html(htm) loans_script = htm_obj.xpath("//script[@id='loan-list-rsp']/text()")[0].encode("utf-8") loans_json = loads(loans_script, encoding="UTF-8") loan_size = len(loans_json["data"]["loans"]) if loan_size > 0: for i in range(0, loan_size): if loans_json["data"]["loans"][i]["status"] != "OPEN": #放弃已经结束的 continue original_id = str(int(loans_json["data"]["loans"][i]["loanId"])) href = "http://www.renrendai.com/lend/detailPage.action?loanId=%s" % original_id if original_id: online_ids_set.add(original_id) if original_id in db_ids_set: update_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.schedule = str(int(loans_json["data"]["loans"][i]["finishedRatio"])).split(".")[0] loan_obj.cast = str(float(loans_json["data"]["loans"][i]["amount"]) - float(loans_json["data"]["loans"][i]["surplusAmount"])) loan_obj.db_update(db) else: pass new_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.href = href loan_obj.title = str(loans_json["data"]["loans"][i]["title"].encode("utf-8")) loan_obj.borrow_amount = str(loans_json["data"]["loans"][i]["amount"]) loan_obj.period = str(int(loans_json["data"]["loans"][i]["months"])) loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH loan_obj.rate = str(loans_json["data"]["loans"][i]["interest"]) loan_obj.cast = str(float(loans_json["data"]["loans"][i]["amount"]) - float(loans_json["data"]["loans"][i]["surplusAmount"])) loan_obj.schedule = str(int(loans_json["data"]["loans"][i]["finishedRatio"])).split(".")[0] loan_obj.db_create(db) logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set)) # db - 新抓取的 = 就是要下线的 off_ids_set = db_ids_set - online_ids_set if off_ids_set: loan_obj = Loan(company_id) loan_obj.db_offline(db, off_ids_set) logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set)) except: logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
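
# renrendai ships its loan list as JSON inside a <script id="loan-list-rsp">
# tag, so the crawler above parses that blob instead of scraping table rows.
# A trimmed, self-contained sketch of the extraction; the sample document is
# invented, the field names mirror the ones read above:
from json import loads
from lxml import html as lxml_html

sample = """<html><body>
<script id="loan-list-rsp" type="text/json">
{"data": {"loans": [{"loanId": 123456, "status": "OPEN", "title": "demo",
 "amount": 50000.0, "surplusAmount": 12000.0, "months": 6,
 "interest": 12.0, "finishedRatio": 76.0}]}}
</script></body></html>"""

doc = lxml_html.fromstring(sample)
blob = doc.xpath("//script[@id='loan-list-rsp']/text()")[0]
for loan in loads(blob)["data"]["loans"]:
    if loan["status"] != "OPEN":  # skip loans that are no longer open, as above
        continue
    cast = float(loan["amount"]) - float(loan["surplusAmount"])
    schedule = str(int(loan["finishedRatio"]))
    print loan["loanId"], cast, schedule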
def crawl(): company_id = 23 url = "https://member.niwodai.com/xiangmu/" request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA} db = get_db_engine() db_ids = list( db.execute( "select original_id from loan where company_id=%s and status=0", company_id)) # db all db_ids_set = set() # 在线的所有id online_ids_set = set() # new new_ids_set = set() # update update_ids_set = set() for id in db_ids: db_ids_set.add(id[0].encode("utf-8")) # debug if FLAGS.debug_parser: import pdb pdb.set_trace() try: htm = download_page(url, request_headers) htm_obj = parse_html(htm, encoding="utf-8") loan_size = int(str(htm_obj.xpath("//div[@class='biaoList']/table/tbody/tr[1]/th[last()]/text()")[0].encode("utf-8"))\ .replace("共", "").replace("个标", "").strip()) if loan_size > 0: page = loan_size / 10 if loan_size % 10 > 0: page += 1 for p in range(1, page + 1): page_url = "https://member.niwodai.com/loan/loan.do?pageNo=%d&totalCount=%d" % ( p, loan_size) page_html = download_page(page_url, request_headers) page_obj = parse_html(page_html, encoding="utf-8") loans = page_obj.xpath( "//div[@class='biaoList']/table/tbody/tr") for loan in loans: if lxml.html.tostring(loan).find("<th>") > 0: continue href = str(loan.xpath("td[1]/a/@href")[0]) original_id = href.replace(".html", "").split("/")[2] if original_id: online_ids_set.add(original_id) if original_id in db_ids_set: update_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.schedule = str( loan.xpath("td[5]/text()")[0].encode( "utf-8")).strip().replace("%", "") loan_obj.db_update(db) else: new_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.href = REFEREE + href loan_obj.title = str( loan.xpath("td[1]/a/text()")[0].encode( "utf-8")).strip() loan_obj.borrow_amount = str( loan.xpath("td[4]/em/text()")[0].encode( "utf-8")).strip().replace(",", "") loan_obj.rate = str( loan.xpath("td[2]/em/text()")[0].encode( "utf-8")).strip().replace("%", "") loan_obj.period = str( loan.xpath("td[3]/em/text()")[0].encode( "utf-8")).strip() loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY loan_obj.schedule = str( loan.xpath("td[5]/text()")[0].encode( "utf-8")).strip().replace("%", "") loan_obj.db_create(db) logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set)) # db - 新抓取的 = 就是要下线的 off_ids_set = db_ids_set - online_ids_set if off_ids_set: loan_obj = Loan(company_id) loan_obj.db_offline(db, off_ids_set) logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set)) except: logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 21
    url = "http://www.id68.cn/invest/index/borrow_status/9/p/1.html"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}
    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently listed online
    online_ids_set = set()
    # newly listed ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//ul[@class='ideal_con']/li")
        if len(loans) > 0:
            for loan in loans:
                if str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]) == "100.00%":
                    # skip loans that are already fully funded
                    continue
                href = str(loan.xpath("table/tr[1]/td[1]/a/@href")[0].encode("utf-8"))
                original_id = href.replace(".html", "").split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = REFEREE + href
                    loan_obj.title = str(loan.xpath("table/tr[1]/td[1]/a/@title")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("table/tr[2]/td[last()]/span/text()")[0].encode("utf-8"))\
                        .strip().replace(" ", "").replace(",", "")
                    loan_obj.repayment = str(loan.xpath("table/tr[2]/td[4]/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.rate = str(loan.xpath("table/tr[2]/td[2]/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("table/tr[2]/td[3]/span/text()")[0].encode("utf-8")).strip()\
                        .replace(" ", "").replace("个月", "")
                    # the period text is "N个月", so the unit is months, not days
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # ids in db but no longer listed online -> take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl(): company_id = 8 url = "http://www.eloancn.com/new/loadAllTender.action?page=3&sidx=progress&sord=desc" request_headers = { 'Referee': "http://www.eloancn.com", 'User-Agent': DEFAULT_UA } db = get_db_engine() db_ids = list( db.execute( "select original_id from loan where company_id=%s and status=0", company_id)) # db all db_ids_set = set() # 在线的所有id online_ids_set = set() # new new_ids_set = set() # update update_ids_set = set() for id in db_ids: db_ids_set.add(id[0].encode("utf-8")) # debug if FLAGS.debug_parser: import pdb pdb.set_trace() try: for p in range(1, 4): url = "http://www.eloancn.com/new/loadAllTender.action?page=%s" % p logger.info("page url:%s", url) # 这个页面比较恶心,一个标的的属性不在一个div内 loan_htm = download_page(url, request_headers) loan_htm_parse = parse_html(loan_htm, encoding="UTF-8") htm_1 = loan_htm_parse.xpath( "//div[@class='lendtable']/dl/dd[@class='wd300 pdl10 fl']") htm_2 = loan_htm_parse.xpath( "//div[@class='lendtable']/dl/dd[@class='wd140 fl']") htm_3 = loan_htm_parse.xpath( "//div[@class='lendtable']/dl/dd[@class='wd130 fl pdl10']") htm_4 = loan_htm_parse.xpath( "//div[@class='lendtable']/dl/dd[@class='wd130 fl']") loan_list = [] for h1 in htm_1: loan_obj = Loan(company_id) loan_obj.title = str( h1.xpath("h3/a[@class='fl']/text()")[0].encode("utf-8")) loan_obj.href = str( h1.xpath("h3/a[@class='fl']/@href")[0]).replace(":80", "") loan_obj.original_id = loan_obj.href.split("=")[1] loan_list.append(loan_obj) for index, h2 in enumerate(htm_2): loan_list[index].borrow_amount = str( h2.xpath("p[@class='colorCb mt10']/text()")[0].encode( "utf-8")).replace("¥", "").replace(",", "") loan_list[index].rate = str( h2.xpath("p[@class='colorE6']/span/text()")[0]).replace( "%", "") for index, h3 in enumerate(htm_3): loan_list[index].period = str( h3.xpath("p/span/text()")[0].encode("utf-8")) loan_list[index].period_unit = loan_obj.PERIOD_UNIT_MONTH loan_list[index].repayment = str( h3.xpath("p[@class='']/text()")[0].encode("utf-8")) for index, h4 in enumerate(htm_4): loan_list[index].schedule = str( h4.xpath("p/span/em/text()")[0]).strip().replace("%", "") # 去掉已经满标的 new_list = [i for i in loan_list if i.schedule != "100"] for loan in new_list: online_ids_set.add(loan.original_id) if loan.original_id in db_ids_set: update_ids_set.add(loan.original_id) loan.db_update(db) else: new_ids_set.add(loan.original_id) loan.db_create(db) logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set)) time.sleep(5) # db - 新抓取的 = 就是要下线的 off_ids_set = db_ids_set - online_ids_set if off_ids_set: loan_obj_off = Loan(company_id) loan_obj_off.db_offline(db, off_ids_set) logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set)) except: logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
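
# eloancn spreads one loan's fields over four parallel <dd> columns, which is
# why the function above first builds loan_list from column 1 and then fills
# the remaining fields by list index. The same idea expressed with zip(), on
# made-up column values:
titles    = ["loan A", "loan B"]   # from htm_1
amounts   = ["10000", "25000"]     # from htm_2
periods   = ["3", "6"]             # from htm_3
schedules = ["40", "100"]          # from htm_4

merged = []
for title, amount, period, schedule in zip(titles, amounts, periods, schedules):
    merged.append({"title": title, "borrow_amount": amount,
                   "period": period, "schedule": schedule})

# drop fully funded loans, mirroring the schedule != "100" filter above
open_loans = [l for l in merged if l["schedule"] != "100"]
print open_loans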
def crawl(): company_id = 3 url = "http://www.91wangcai.com/invest/index.html" request_headers = {'Referee': "http://www.91wangcai.com", 'User-Agent': DEFAULT_UA} db = get_db_engine() db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id)) # db all db_ids_set = set() # 在线的所有id online_ids_set = set() # new new_ids_set = set() # update update_ids_set = set() for id in db_ids: db_ids_set.add(id[0].encode("utf-8")) # debug if FLAGS.debug_parser: import pdb pdb.set_trace() try: loan_htm = download_page(url, request_headers) loan_htm_parse = parse_html(loan_htm, encoding="gb2312") loans = loan_htm_parse.xpath("//div[@class='proBoxNew']") if len(loans) > 0: for loan in loans: href = str(loan.xpath("div[@class='hd']/a/@href")[0]) original_id = href.split(".")[0].split("/")[2].encode("utf-8") if original_id: online_ids_set.add(original_id) if original_id in db_ids_set: update_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \ .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "") loan_obj.db_update(db) else: new_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.href = "http://www.91wangcai.com" + href loan_obj.title = autodecode(str(loan.xpath("div[@class='hd']/a/text()")[0].encode("gb2312"))).encode("utf-8") loan_obj.borrow_amount = autodecode(str(loan.xpath("div[@class='bd']/table/tr[1]/td[1]/em/text()")[0].encode("gb2312"))) \ .encode("utf-8").replace("¥", "") loan_obj.rate = str(loan.xpath("div[@class='bd']/table/tr[1]/td[2]/em/text()")[0]).strip().replace("%", "") loan_period_text = lxml.html.tostring(loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0]) \ .replace("<em>", "").replace("</em>", "") html_parser = HTMLParser.HTMLParser() period = html_parser.unescape(loan_period_text).encode("utf-8").strip() if period.find(loan_obj.PERIOD_UNIT_DAY) > 0: loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "") loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY else: loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "") loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH loan_obj.repayment = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[1]/text()")[0].encode("gb2312"))) \ .encode("utf-8").replace("还款方式:", "") loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \ .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "") loan_obj.db_create(db) logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set)) # db - 新抓取的 = 就是要下线的 off_ids_set = db_ids_set - online_ids_set if off_ids_set: loan_obj = Loan(company_id) loan_obj.db_offline(db, off_ids_set) logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set)) except: logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
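
# Almost every crawl() above parses the loan period the same way: if the text
# contains the day unit it is a day count, otherwise "N个月" is a month count.
# A standalone sketch of that branch; the unit strings "天"/"月" are an
# assumption standing in for the Loan.PERIOD_UNIT_* constants, whose values
# are not shown in this file:
PERIOD_UNIT_DAY = "天"
PERIOD_UNIT_MONTH = "月"

def parse_period(text):
    """Return (period, unit) from strings like '30天' or '3个月'."""
    text = text.strip()
    if text.find(PERIOD_UNIT_DAY) > 0:
        return text.replace(PERIOD_UNIT_DAY, ""), PERIOD_UNIT_DAY
    return text.replace("个", "").replace(PERIOD_UNIT_MONTH, ""), PERIOD_UNIT_MONTH

print parse_period("30天")   # ('30', '天')
print parse_period("3个月")  # ('3', '月')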
def crawl(): company_id = 26 url = "http://www.htyd50.com/trade/borrow/bidding.htm" request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA} db = get_db_engine() db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id)) # db all db_ids_set = set() # 在线的所有id online_ids_set = set() # new new_ids_set = set() # update update_ids_set = set() for id in db_ids: db_ids_set.add(id[0].encode("utf-8")) # debug if FLAGS.debug_parser: import pdb pdb.set_trace() try: htm = download_page(url, request_headers) htm_obj = parse_html(htm) loans = htm_obj.xpath("//div[@class='page_block']/div[@class='page_block_content']/div[@class='min_height_300 mb_30']/div[@class='w980 clearfix']") print len(loans) if len(loans) > 0: for loan in loans: href = str(loan.xpath("div[2]/div[1]/div[1]/a/@href")[0].encode("utf-8")) original_id = href.replace(".html", "").split("/")[5].strip() print href, original_id # if original_id: # online_ids_set.add(original_id) # # if original_id in db_ids_set: # update_ids_set.add(original_id) # # loan_obj = Loan(company_id, original_id) # if loan.xpath("td[7]/div/a"): # loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "") # else: # loan_obj.schedule = "0" # loan_obj.db_update(db) # else: # new_ids_set.add(original_id) # # loan_obj = Loan(company_id, original_id) # loan_obj.href = "https://www.xinhehui.com" + href # title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip() # if loan.xpath("td[1]/p[1]/a/em"): # title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip() # else: # title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip() # loan_obj.title = title_1 + title_2 # borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "") # if borrow_amount.find("万") > 0: # loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000 # else: # loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", "")) # # if loan.xpath("td[4]/span"): # period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip() # else: # period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip() # if period.find(loan_obj.PERIOD_UNIT_DAY) > 0: # loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "") # loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY # else: # loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "") # loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH # # loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "") # loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip() # if loan.xpath("td[7]/div/a"): # loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "") # else: # loan_obj.schedule = "0" # # loan_obj.db_create(db) # #logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set)) # ## db - 新抓取的 = 就是要下线的 #off_ids_set = db_ids_set - online_ids_set #if off_ids_set: # loan_obj = Loan(company_id) # loan_obj.db_offline(db, off_ids_set) # logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set)) except: logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl_wzdai(): url = "https://www.wzdai.com/invest/index.html?status=1&page=1&order=-3" request_headers = {'Referee': "https://www.wzdai.com", 'User-Agent': DEFAULT_UA} company_id = 3 db = get_db_engine() db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id)) # db all db_ids_set = set() # 在线的所有id online_ids_set = set() # new new_ids_set = set() # update update_ids_set = set() for id in db_ids: db_ids_set.add(id[0].encode("utf-8")) # debug if FLAGS.debug_parser: import pdb pdb.set_trace() try: htm = download_page(url, request_headers) htm_obj = parse_html(htm) pages_obj = htm_obj.xpath("//div[@class='page']/div[@align='center']/span/text()")[0] page = int(str(pages_obj.encode("utf-8")).split("条")[1].split("页")[0]) for p in range(1, page + 1): url = "https://www.wzdai.com/invest/index.html?status=1&page=" + str(p) + "&order=-3" loan_htm = download_page(url, request_headers) loan_obj = parse_html(loan_htm) loans = loan_obj.xpath("//div[@class='invest_box']") if len(loans) > 0: for loan in loans: href = "https://www.wzdai.com" + str(loan.xpath("h1/a[@class='del']/@href")[0]) title = loan.xpath("h1/a[@class='del']/text()")[0].strip().encode("UTF-8") borrow_amount = str(loan.xpath("div[@class='invest_box_Info']/div[@class='prize']/span/b/text()")[0]) rate = str(loan.xpath("div[@class='invest_box_Info']/div[@class='prize']/font/b/text()")[0]) text = loan.xpath("div[@class='invest_box_Info']/div[@class='text']") loan_period = "" repayment = "" for lp in text: p = lxml.html.tostring(lp).strip().replace("\r\n", "").split("<br>") html_parser = HTMLParser.HTMLParser() loan_period = html_parser.unescape(p[0].replace('<div class="text">', "").strip()).encode("UTF-8").replace("借款期限:", "") repayment = html_parser.unescape(p[1].strip()).encode("UTF-8").replace("还款方式:", "") cast = loan.xpath("div[@class='invest_box_Info']/div[@class='text2']/text()")[0].strip()\ .encode("UTF-8").replace("已投:¥", "").replace("元","") schedule = str(loan.xpath("div[@class='invest_box_Info']/div[@class='percent_big']/div[@class='percent_small']/font/text()")[0]) logger.info(href,title,borrow_amount,rate,cast,schedule,loan_period, repayment) db = get_db_engine() db.execute("insert into loan (company_id,url,title,borrow_amount,rate,loan_period," "repayment,cast,schedule,crawl_status,status,create_time,update_time) " "values (1,%s,%s,%s,%s,%s,%s,%s,%s,0,0,now(),now())", href, title, borrow_amount, rate,loan_period,repayment,cast,schedule) except: logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl(): company_id = 14 url = "http://www.licaifan.com" request_headers = {'User-Agent': DEFAULT_UA} db = get_db_engine() db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id)) # db all db_ids_set = set() # 在线的所有id online_ids_set = set() # new new_ids_set = set() # update update_ids_set = set() for id in db_ids: db_ids_set.add(id[0].encode("utf-8")) # debug if FLAGS.debug_parser: import pdb pdb.set_trace() try: loan_htm = download_page(url, request_headers) loan_htm_parse = parse_html(loan_htm, encoding="UTF-8") loans = loan_htm_parse.xpath("//ul[@class='main-list tab-con2']/li[1]/table/tr") if len(loans) > 0: # 这里注意第一行是表单标题,不需要,所以从1开始 for i in range(1, len(loans)): if str(loans[i].xpath("td[last()]/a/text()")[0].encode("utf-8")) == "投资满额": continue href = str(loans[i].xpath("td[1]/h3/a/@href")[0]) original_id = href.split("/")[3] if original_id: online_ids_set.add(original_id) if original_id in db_ids_set: update_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip()\ .replace("%", "") loan_obj.db_update(db) else: new_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.href = "http://www.licaifan.com" + href loan_obj.title = str(loans[i].xpath("td[1]/h3/a/text()")[0].encode("utf-8")).strip() loan_obj.borrow_amount = str(loans[i].xpath("td[3]/text()")[0].encode("utf-8"))\ .strip().replace(",", "") if loan_obj.borrow_amount.find("万") > 0: loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000 loan_obj.rate = str(loans[i].xpath("td[2]/text()")[0].encode("utf-8")).strip().replace("%", "") period = str(loans[i].xpath("td[4]/text()")[0].encode("utf-8")).strip() if period.find(loan_obj.PERIOD_UNIT_DAY) > 0: loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "") loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY else: loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "") loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip()\ .replace("%", "") loan_obj.db_create(db) logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set)) # db - 新抓取的 = 就是要下线的 off_ids_set = db_ids_set - online_ids_set if off_ids_set: loan_obj = Loan(company_id) loan_obj.db_offline(db, off_ids_set) logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set)) except: logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def get_taobao_shop_favorite_count(the_shop, shop_html_obj, urls): """获取淘宝店被收藏数目""" try: favorite_count_success = False favorite_param = shop_html_obj.xpath( u"//div[@class='item collect-num']/span[contains(@data-info,'SCCP')]/@data-info" ) if favorite_param: the_param = favorite_param[0].split('&') favorite_url = "%s?callback=jsonp%d&t=%s&keys=%s" % ( the_param[1].split('=')[1], random.randint(1000, 9999), str(int(time.time() * 1000)), the_param[0].split('=')[1]) urls['favorite_url'] = favorite_url favorite_html = download_with_referer(favorite_url, urls['shop_rate_url']) if favorite_html: logger.debug( "download shop favorite html. shop_id: %d, url: %s. html length: %d." % (the_shop.get_shop_id(), favorite_url, len(favorite_html))) the_shop.favorited_user_count = int( favorite_num_reg.search(favorite_html).group(1)) favorite_count_success = True else: logger.error( "download shop favorite html error. shop_id: %d, url: %s." % (the_shop.get_shop_id(), favorite_url)) if not favorite_count_success: logger.debug("use pattern left edition to get favorite data ") favorite_param = shop_html_obj.xpath( u"//li[@id='J_SCollCount'][@data-info]/@data-info") if favorite_param: the_param = favorite_param[0].split('&') favorite_url = "%s?t=%s&keys=%s&callback=setShopStat" % ( the_param[1].split('=')[1], str(int( time.time() * 1000)), the_param[0].split('=')[1]) favorite_html = download_with_referer(favorite_url, urls['shop_rate_url']) if favorite_html: the_shop.favorited_user_count = int( favorite_num_reg.search(favorite_html).group(1)) favorite_count_success = True if not favorite_count_success: logger.debug("use pattern for old edition to get favorite data ") shop_description_url = shop_html_obj.xpath( u"//a[@title='店铺介绍']/@href") if shop_description_url: shop_description_html = download_with_referer( shop_description_url[0], urls['shop_rate_url']) if shop_description_html: shop_description_html_obj = parse_html( shop_description_html, 'gbk') favorite_param = shop_description_html_obj.xpath( u"//li[@id='J_SCollCount'][@data-info]/@data-info") if favorite_param: the_param = favorite_param[0].split('&') favorite_url = "%s?t=%s&keys=%s&callback=setShopStat" % ( the_param[1].split('=')[1], str(int(time.time() * 1000)), the_param[0].split('=')[1]) favorite_html = download_with_referer( favorite_url, shop_description_url) if favorite_html: the_shop.favorited_user_count = int( favorite_num_reg.search(favorite_html).group( 1)) favorite_count_success = True if not favorite_count_success: logger.error("get shop favorite count failed. shop_id: %d." % the_shop.get_shop_id()) except: logger.error( "get shop favorite count failed. shop_id: %s. error info: %s" % (the_shop.get_shop_id(), traceback.format_exc()))
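
# get_taobao_shop_favorite_count() rebuilds a JSONP counter URL from the
# @data-info attribute it finds on the shop page. A sketch of just the string
# handling; the data-info value below is invented and the real attribute
# layout may differ:
import random
import time

def build_favorite_url(data_info):
    """data_info is assumed to be '<k1>=<keys>&<k2>=<counter endpoint>'."""
    the_param = data_info.split('&')
    return "%s?callback=jsonp%d&t=%s&keys=%s" % (
        the_param[1].split('=')[1],           # counter endpoint
        random.randint(1000, 9999),           # jsonp callback id
        str(int(time.time() * 1000)),         # cache-busting timestamp
        the_param[0].split('=')[1])           # counter keys

print build_favorite_url("keys=SCCP_2_12345&host=//count.example.com/counter3")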
def crawl(): company_id = 11 url = "https://www.tzydb.com" request_headers = {'User-Agent': DEFAULT_UA} db = get_db_engine() db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id)) # db all db_ids_set = set() # 在线的所有id online_ids_set = set() # new new_ids_set = set() # update update_ids_set = set() for id in db_ids: db_ids_set.add(id[0].encode("utf-8")) # debug if FLAGS.debug_parser: import pdb pdb.set_trace() try: htm = download_page(url, request_headers) htm_obj = parse_html(htm) loans = htm_obj.xpath("//div[@id='proList']/ul[@class='item_li']") if len(loans) > 0: for loan in loans: schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip() if schedule == "100%" or schedule == "100.0%": #放弃已经结束的 continue # link = https://www.tzydb.com/boot/lookup/971,1017 a_script = str(loan.xpath("li/div[1]/div[1]/div/a/@href")[0].encode("utf-8")) o_id = ID_RE.findall(a_script)[0] original_id = o_id.replace(",", "-") if original_id: online_ids_set.add(original_id) if original_id in db_ids_set: update_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip().replace("%", "") loan_obj.db_update(db) else: new_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.href = "https://www.tzydb.com/boot/lookup/" + o_id loan_obj.title = str(loan.xpath("li/div[1]/div[1]/div/a/text()")[0].encode("utf-8")) loan_obj.borrow_amount = str(loan.xpath("li/div[2]/div[1]/span/text()")[0].encode("utf-8")).strip()\ .replace(" ", "").replace(",", "") loan_obj.period = str(loan.xpath("li/div[2]/div[3]/span/text()")[0].encode("UTF-8")).strip() loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH loan_obj.rate = str(loan.xpath("li/div[2]/div[2]/span/text()")[0]).strip().replace("%", "") loan_obj.schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip().replace("%", "") loan_obj.db_create(db) logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set)) # db - 新抓取的 = 就是要下线的 off_ids_set = db_ids_set - online_ids_set if off_ids_set: loan_obj = Loan(company_id) loan_obj.db_offline(db, off_ids_set) logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set)) except: logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl(): company_id = 26 url = "http://www.longlongweb.com/invests" request_headers = {'Referee': "http://www.longlongweb.com", 'User-Agent': DEFAULT_UA} db = get_db_engine() db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id)) # db all db_ids_set = set() # 在线的所有id online_ids_set = set() # new new_ids_set = set() # update update_ids_set = set() for id in db_ids: db_ids_set.add(id[0].encode("utf-8")) # debug if FLAGS.debug_parser: import pdb pdb.set_trace() try: loan_htm = download_page(url, request_headers) loan_htm_parse = parse_html(loan_htm, encoding="utf-8") loans = loan_htm_parse.xpath("//div[@class='main01']/span/dl") if len(loans) > 0: for loan in loans: if not lxml.html.tostring(loan.xpath("dd/p[1]")[0]).find("href") > 0: continue href = str(loan.xpath("dd/table/tr[1]/td[1]/a/@href")[0]) original_id = href.split("/")[2] if original_id: online_ids_set.add(original_id) if original_id in db_ids_set: update_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))\ .replace(",", "").replace("¥", "").strip() loan_obj.href = "http://www.longlongweb.com" + href loan_info_htm = download_page(loan_obj.href, request_headers) loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8") loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0] .encode("utf-8")).replace(",", "").replace("¥", "").strip() loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100) loan_obj.db_update(db) else: new_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.href = "http://www.longlongweb.com" + href loan_obj.title = str(loan.xpath("dd/table/tr[1]/td[1]/a/@title")[0].encode("utf-8")) loan_obj.rate = str(loan.xpath("dd/table/tr[2]/td[2]/span/text()")[0].encode("utf-8"))\ .replace("%", "") loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))\ .replace(",", "").replace("¥", "").strip() loan_obj.period = str(loan.xpath("dd/table/tr[2]/td[3]/span/text()")[0]) loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH loan_info_htm = download_page(loan_obj.href, request_headers) loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8") loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0] .encode("utf-8")).replace(",", "").replace("¥", "").strip() loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100) loan_obj.repayment = str(loan_info_htm_parse.xpath("//div[@class='enterprise-botton']/span[2]/text()")[0] .encode("utf-8")).strip().replace("还款方式:", "") loan_obj.db_create(db) logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set)) # db - 新抓取的 = 就是要下线的 off_ids_set = db_ids_set - online_ids_set if off_ids_set: loan_obj = Loan(company_id) loan_obj.db_offline(db, off_ids_set) logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set)) except: logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawler(sql): db = get_db_engine() shops = list(db.execute(sql)) # debug if FLAGS.debug_parser: import pdb pdb.set_trace() for shop in shops: shop_id = shop[0] url = str(shop[1]) type = shop[2] if url[-1] != '/': url += "/" try: shop_headers = {'Referer': url, 'User-Agent': DEFAULT_UA} dongtai_url = url + "dongtai.htm" dongtai_data = download(dongtai_url, shop_headers) if dongtai_data: dongtai_obj = parse_html(dongtai_data, encoding="gb18030") dongtai_title = dongtai_obj.xpath("//title/text()")[0].encode('utf-8') if '店铺动态' in dongtai_title: microscope_data = dongtai_obj.xpath("//*[@name='microscope-data']/@content") userId = get_val(str(microscope_data), "userId") if userId: dongtai_headers = {'Referer': dongtai_url, 'User-Agent': DEFAULT_UA} promotion_url = "http://shuo.taobao.com/feed/v_front_feeds.htm?_input_charset=utf-8&page=1" \ "&userId=%s&vfeedTabId=115" % userId promotion_data = download(promotion_url, dongtai_headers) if promotion_data: promotion_obj = parse_html(promotion_data, encoding="gb18030") i = 0 while i < 10: feedInfo = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-text']//span[@class='J_FeedInfo']/text()")[i].encode('utf-8') if '店铺促销中' in feedInfo or '限时折扣' in feedInfo or '折扣限时' in feedInfo: #title = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dt//a/text()")[i] link = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//a[@class='fd-view-detail']/@href")[i] promotion_price = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price']/strong/text()")[i] price = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price g_price-original']/strong/text()")[i] promotion_time = promotion_obj.xpath(u"//div[@class='fd-item show-detail']//div[@class='fd-container']//dd[contains(text(),'起止日期')]/text()")[i] pt = promotion_time.encode('utf-8').replace("起止日期:","").split(" - ") start_time = pt[0].replace(".", "-") end_time = pt[1].replace(".", "-") if '2013' not in pt[1] or '2014' not in pt[1]: end_time = '2013-' + end_time if start_time > end_time: end_time = end_time.replace("2013", "2014") num_id = get_numiid(link, dongtai_headers) if num_id: sql = "select id from shop_promotion where shop_id=%s and num_id=%s" % (shop_id, num_id) re = list(db.execute(sql)) if not re: db.execute("insert into shop_promotion (shop_id, num_id, price, " "promotion_price, start_time, end_time, create_time, " "last_update_time) values (%s,'%s',%s,%s,'%s','%s',now(),now())" % (shop_id, num_id, price.replace(',', ''), promotion_price.replace(',', ''), start_time, end_time)) else: logger.error("shop %s:%s crawler num_id failed", shop_id, url) i += 1 logger.info("shop %s:%s crawler promotiom item num=%s", shop_id, url, i) else: logger.warning("shop %s:%s not promotion info", shop_id, url) else: logger.error("shop %s:%s crawler userId failed", shop_id, url) else: logger.error("shop %s:%s not dongtai page", shop_id, url) except: logger.error("shop %s:%s crawler failed %s", shop_id, url, traceback.format_exc())
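
# The shop-promotion crawler above turns a "起止日期:2013.12.28 - 01.03" style
# string into a start/end date, re-adding the missing year to the end date and
# bumping it into the next year when the range wraps. A standalone sketch of
# that parsing; note it uses `and` where the code above uses `or`, so an end
# date that already carries a year is left untouched:
def parse_promotion_dates(text):
    pt = text.replace("起止日期:", "").split(" - ")
    start_time = pt[0].replace(".", "-")
    end_time = pt[1].replace(".", "-")
    if '2013' not in pt[1] and '2014' not in pt[1]:
        end_time = '2013-' + end_time      # end date usually omits the year
    if start_time > end_time:              # range wraps past New Year
        end_time = end_time.replace("2013", "2014")
    return start_time, end_time

print parse_promotion_dates("起止日期:2013.12.28 - 01.03")   # ('2013-12-28', '2014-01-03')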
def crawl(): company_id = 2 url = "http://www.ppdai.com/lend/12_s1_p1" request_headers = {'Referee': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA} db = get_db_engine() db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id)) # db all db_ids_set = set() # 在线的所有id online_ids_set = set() # new new_ids_set = set() # update update_ids_set = set() for id in db_ids: db_ids_set.add(id[0].encode("utf-8")) # debug if FLAGS.debug_parser: import pdb pdb.set_trace() try: htm = download_page(url, request_headers) htm_obj = parse_html(htm) page = str(htm_obj.xpath("//div[@class='fen_ye_nav']/table/tr/td[last()]/text()")[0].encode("UTF-8"))\ .replace("共", "").replace("页", "") for p in range(1, int(page) + 1): url = "http://www.ppdai.com/lend/12_s1_p" + str(p) logger.info("page url: %s", url) loan_htm = download_page(url, request_headers) loan_obj = parse_html(loan_htm) loans = loan_obj.xpath("//div[@class='lend_nav']/table/tr") if len(loans) > 0: for loan in loans: if lxml.html.tostring(loan).find("tit_nav") > 0: continue href = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@href")[0]) original_id = href.split("/")[2].encode("utf-8") if original_id: online_ids_set.add(original_id) if original_id in db_ids_set: update_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.schedule = str(loan.xpath("td[last()]/p[1]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("%", "").split("完成")[1] loan_obj.db_update(db) else: new_ids_set.add(original_id) loan_obj = Loan(company_id) loan_obj.original_id = original_id loan_obj.href = "http://www.ppdai.com" + href loan_obj.title = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@title")[0].encode("UTF-8")) loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("UTF-8")).strip().replace("¥", "")\ .replace(",", "") loan_obj.rate = str(loan.xpath("td[4]/text()")[0]).strip().replace("%", "") period = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "") if period.find(loan_obj.PERIOD_UNIT_DAY) > 0: loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "") loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY else: loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "") loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH loan_obj.schedule = float(str(loan.xpath("td[last()]/p[1]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("%", "").split("完成")[1]) loan_obj.db_create(db) logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set)) # db - 新抓取的 = 就是要下线的 off_ids_set = db_ids_set - online_ids_set if off_ids_set: loan_obj = Loan(company_id) loan_obj.db_offline(db, off_ids_set) logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set)) except: logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
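
# ppdai's progress cell reads like "完成 83%" (the exact wording is a guess),
# so the function above strips spaces and '%' and keeps what follows "完成".
# A tiny sketch of that parsing:
def parse_ppdai_schedule(text):
    return text.strip().replace(" ", "").replace("%", "").split("完成")[1]

print parse_ppdai_schedule("完成 83%")   # '83'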
def crawl(): company_id = 15 url = "https://www.iqianbang.com/invest" request_headers = { 'Referee': "https://www.iqianbang.com", 'User-Agent': DEFAULT_UA } db = get_db_engine() db_ids = list( db.execute( "select original_id from loan where company_id=%s and status=0", company_id)) # db all db_ids_set = set() # 在线的所有id online_ids_set = set() # new new_ids_set = set() # update update_ids_set = set() for id in db_ids: db_ids_set.add(id[0].encode("utf-8")) # debug if FLAGS.debug_parser: import pdb pdb.set_trace() try: loan_htm = download_page(url, request_headers) loan_htm_parse = parse_html(loan_htm, encoding="UTF-8") loans = loan_htm_parse.xpath("//div[@class='item']/table/tbody/tr") if len(loans) > 0: for loan in loans: if str(loan.xpath("td[7]/text()")[0].encode( "utf-8")).strip() != "融资中": continue href = str(loan.xpath("td[1]/a/@href")[0]) original_id = href.split("-")[3].replace(".shtml", "") if original_id: online_ids_set.add(original_id) if original_id in db_ids_set: update_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.schedule = str( loan.xpath("td[6]/text()")[0].encode( "utf-8")).strip().replace("%", "") loan_obj.db_update(db) else: new_ids_set.add(original_id) loan_obj = Loan(company_id, original_id) loan_obj.href = "https://www.iqianbang.com" + href loan_obj.title = str( loan.xpath("td[1]/a/text()")[0].encode( "utf-8")).strip() loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("utf-8"))\ .strip().replace(",", "").replace("元", "") if loan_obj.borrow_amount.find("万") > 0: loan_obj.borrow_amount = int( loan_obj.borrow_amount.replace("万", "")) * 10000 loan_obj.rate = str( loan.xpath("td[2]/span/span/text()")[0].encode( "utf-8")).strip().replace("%", "") period = str( loan.xpath("td[4]/text()")[0].encode("utf-8")).strip() if period.find(loan_obj.PERIOD_UNIT_DAY) > 0: loan_obj.period = period.replace( loan_obj.PERIOD_UNIT_DAY, "") loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY else: loan_obj.period = period.replace("个", "").replace( loan_obj.PERIOD_UNIT_MONTH, "") loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH loan_obj.schedule = str( loan.xpath("td[6]/text()")[0].encode( "utf-8")).strip().replace("%", "") # 这里需要进入详情页 loan_info_htm = download_page(loan_obj.href, headers={ 'Referee': url, 'User-Agent': DEFAULT_UA }) loan_info_htm_parse = parse_html(loan_info_htm, encoding="UTF-8") loan_obj.repayment = str( loan_info_htm_parse.xpath( "//div[@class='inright']/table[@class='idetable']") [0].xpath("tr[2]/td[2]/span/text()")[0].encode( "utf-8")).strip() loan_obj.db_create(db) logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set)) # db - 新抓取的 = 就是要下线的 off_ids_set = db_ids_set - online_ids_set if off_ids_set: loan_obj = Loan(company_id) loan_obj.db_offline(db, off_ids_set) logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set)) except: logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 18
    url = "https://www.my089.com/Loan/default.aspx?pid=1"
    request_headers = {'Referer': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all IDs currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='yema rt']/span[@class='z_page']/text()")[0].encode("UTF-8")) \
            .replace("共", "").replace("页", "")
        for p in range(1, int(page) + 1):
            url = "https://www.my089.com/Loan/default.aspx?pid=" + str(p)
            logger.info("page url: %s", url)
            loan_htm = download_page(url, request_headers)
            loan_htm_obj = parse_html(loan_htm)
            loans = loan_htm_obj.xpath("//div[@class='Loan_box']/dl[@class='LoanList']")
            if len(loans) > 0:
                for loan in loans:
                    # skip loans that are already fully funded
                    if str(loan.xpath("dd[last()]/p/span/text()")[0]) == "100%":
                        continue
                    href = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@href")[0])
                    original_id = href.split("=")[1].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")) \
                            .strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "https://www.my089.com/Loan/" + href
                        loan_obj.title = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@title")[0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("dd[4]/span/text()")[0].encode("UTF-8")).strip() \
                            .replace("¥", "").replace(",", "")
                        loan_obj.rate = str(loan.xpath("dd[3]/span/text()")[0].encode("UTF-8")).strip().replace("%/年", "")
                        loan_obj.period = str(loan.xpath("dd[5]/span/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                        # remaining dd[5] text holds "<period unit>/<repayment type>"
                        s = str(loan.xpath("dd[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("个", "")
                        loan_obj.period_unit = s.split("/")[0].strip()
                        loan_obj.repayment = s.split("/")[1].strip()
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")) \
                            .strip().replace("%", "")
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # IDs in db but not seen online this crawl = IDs to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 4
    url = "https://www.yinhu.com/loan/loan_list.bl"
    request_headers = {'Referer': "https://www.yinhu.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all IDs currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    # offline
    off_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@id='loan_list']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/p/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")
                try:
                    loan_status = str(loan.xpath("td[last()]/em/span/text()")[0].encode("utf-8")).strip()
                except:
                    loan_status = str(loan.xpath("td[last()]/a/span/text()")[0].encode("utf-8")).strip()

                if original_id and loan_status != "还款中":
                    online_ids_set.add(original_id)

                # loans that are repaying ("还款中") or fully funded ("满标") are skipped
                if loan_status == "还款中" or loan_status == "满标":
                    if original_id in db_ids_set:
                        off_ids_set.add(original_id)
                    continue

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/div[@class='bar_bg']/div/span/span/text()")[0]
                                            .encode("utf-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.yinhu.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/p/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip() \
                        .replace(",", "").replace("元", "")
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip()
                    period = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/div[@class='bar_bg']/div/span/span/text()")[0]
                                            .encode("utf-8")).strip().replace("%", "")
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # IDs in db but not seen online this crawl = IDs to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    request_headers = {'Referer': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all IDs currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                # skip loans that have already closed
                if loan.xpath("td[last()]/a/@href")[0].encode("utf-8") == "javascript:;":
                    continue
                href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
                original_id = href.split("id%3D")[1].encode("utf-8").strip()
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")) \
                            .strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.xinhehui.com" + href
                    title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
                    if loan.xpath("td[1]/p[1]/a/em"):
                        title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
                    else:
                        title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.title = title_1 + title_2
                    borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "")
                    if borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
                    else:
                        loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", ""))
                    if loan.xpath("td[4]/span"):
                        period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
                    else:
                        period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
                    loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")) \
                            .strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # IDs in db but not seen online this crawl = IDs to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 14
    url = "http://www.licaifan.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all IDs currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='main-list tab-con2']/li[1]/table/tr")
        if len(loans) > 0:
            # note: the first row is the table header, so start from 1
            for i in range(1, len(loans)):
                # skip loans that are already fully invested
                if str(loans[i].xpath("td[last()]/a/text()")[0].encode("utf-8")) == "投资满额":
                    continue
                href = str(loans[i].xpath("td[1]/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")) \
                        .strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.licaifan.com" + href
                    loan_obj.title = str(loans[i].xpath("td[1]/h3/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loans[i].xpath("td[3]/text()")[0].encode("utf-8")).strip() \
                        .replace(",", "")
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = str(loans[i].xpath("td[2]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    period = str(loans[i].xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")) \
                        .strip().replace("%", "")
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # IDs in db but not seen online this crawl = IDs to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
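# Every crawl() above does the same ID bookkeeping: compare the IDs already stored
# in the db with the IDs seen online to decide what is new, what gets updated, and
# what goes offline (db minus online). A minimal standalone sketch of that set
# arithmetic; classify_ids is a hypothetical name, not a helper in this repo.
def classify_ids(db_ids, online_ids):
    """Split IDs into (new, update, offline) sets."""
    db_ids = set(db_ids)
    online_ids = set(online_ids)
    new_ids = online_ids - db_ids        # seen online, not yet stored
    update_ids = online_ids & db_ids     # seen online and already stored
    off_ids = db_ids - online_ids        # stored but no longer online
    return new_ids, update_ids, off_ids

# Example: classify_ids({"1", "2"}, {"2", "3"}) -> ({"3"}, {"2"}, {"1"})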