# -*- coding: utf-8 -*-
# Stdlib imports used across these crawlers. Project helpers such as
# download_page, parse_html, get_db_engine, Loan, autodecode, logger,
# FLAGS, DEFAULT_UA and REFEREE come from the project's own modules.
import time
import traceback
import urllib2
import zlib
import HTMLParser

import lxml.html
from json import loads


def crawl():
    company_id = 14
    url = "http://www.licaifan.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='main-list tab-con2']/li[1]/table/tr")
        if len(loans) > 0:
            # the first row is the table header, so start from 1
            for i in range(1, len(loans)):
                if str(loans[i].xpath("td[last()]/a/text()")[0].encode("utf-8")) == "投资满额":
                    continue
                href = str(loans[i].xpath("td[1]/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip()\
                        .replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.licaifan.com" + href
                    loan_obj.title = str(loans[i].xpath("td[1]/h3/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loans[i].xpath("td[3]/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = str(loans[i].xpath("td[2]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    period = str(loans[i].xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip()\
                        .replace("%", "")
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 4
    url = "https://www.yinhu.com/loan/loan_list.bl"
    request_headers = {'Referee': "https://www.yinhu.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    # offline
    off_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@id='loan_list']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/p/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")
                try:
                    loan_status = str(loan.xpath("td[last()]/em/span/text()")[0].encode("utf-8")).strip()
                except:
                    loan_status = str(loan.xpath("td[last()]/a/span/text()")[0].encode("utf-8")).strip()

                if original_id and loan_status != "还款中":
                    online_ids_set.add(original_id)

                if loan_status == "还款中" or loan_status == "满标":
                    if original_id in db_ids_set:
                        off_ids_set.add(original_id)
                    continue

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/div[@class='bar_bg']/div/span/span/text()")[0].encode("utf-8"))\
                        .strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.yinhu.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/p/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip().replace(",", "")\
                        .replace("元", "")
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip()
                    period = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/div[@class='bar_bg']/div/span/span/text()")[0].encode("utf-8"))\
                        .strip().replace("%", "")
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 19
    url = "https://www.qian360.com/bq/queryProductList.html?currentPage=1&pernum=12&type=0"
    request_headers = {'Referee': "https://www.qian360.com/tl/select.html", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        if loans_json["list"]:
            for i in range(0, len(loans_json["list"])):
                if int(loans_json["list"][i]["status"]) != 1:
                    continue
                original_id = str(loans_json["list"][i]["borrowId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.qian360.com/ti/detail.html?borrowId=%s" % original_id
                    loan_obj.title = loans_json["list"][i]["name"]
                    loan_obj.rate = str(loans_json["list"][i]["apr"])
                    loan_obj.period = str(loans_json["list"][i]["totalPeriod"])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.borrow_amount = str(int(loans_json["list"][i]["account"]))
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 26
    url = "http://www.longlongweb.com/invests"
    request_headers = {'Referee': "http://www.longlongweb.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="utf-8")
        loans = loan_htm_parse.xpath("//div[@class='main01']/span/dl")
        if len(loans) > 0:
            for loan in loans:
                if not lxml.html.tostring(loan.xpath("dd/p[1]")[0]).find("href") > 0:
                    continue
                href = str(loan.xpath("dd/table/tr[1]/td[1]/a/@href")[0])
                original_id = href.split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))\
                        .replace(",", "").replace("¥", "").strip()
                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0]
                                        .encode("utf-8")).replace(",", "").replace("¥", "").strip()
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_obj.title = str(loan.xpath("dd/table/tr[1]/td[1]/a/@title")[0].encode("utf-8"))
                    loan_obj.rate = str(loan.xpath("dd/table/tr[2]/td[2]/span/text()")[0].encode("utf-8"))\
                        .replace("%", "")
                    loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))\
                        .replace(",", "").replace("¥", "").strip()
                    loan_obj.period = str(loan.xpath("dd/table/tr[2]/td[3]/span/text()")[0])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0]
                                        .encode("utf-8")).replace(",", "").replace("¥", "").strip()
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.repayment = str(loan_info_htm_parse.xpath("//div[@class='enterprise-botton']/span[2]/text()")[0]
                                             .encode("utf-8")).strip().replace("还款方式:", "")
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 15
    url = "https://www.iqianbang.com/invest"
    request_headers = {'Referee': "https://www.iqianbang.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='item']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if str(loan.xpath("td[7]/text()")[0].encode("utf-8")).strip() != "融资中":
                    continue
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("-")[3].replace(".shtml", "")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.iqianbang.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "").replace("元", "")
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = str(loan.xpath("td[2]/span/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    # the repayment method only appears on the detail page
                    loan_info_htm = download_page(loan_obj.href,
                                                  headers={'Referee': url, 'User-Agent': DEFAULT_UA})
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="UTF-8")
                    loan_obj.repayment = str(loan_info_htm_parse.xpath("//div[@class='inright']/table[@class='idetable']")[0]
                                             .xpath("tr[2]/td[2]/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 18
    url = "https://www.my089.com/Loan/default.aspx?pid=1"
    request_headers = {'Referee': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='yema rt']/span[@class='z_page']/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")
        for p in range(1, int(page) + 1):
            url = "https://www.my089.com/Loan/default.aspx?pid=" + str(p)
            logger.info("page url: %s", url)
            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath("//div[@class='Loan_box']/dl[@class='LoanList']")
            if len(loans) > 0:
                for loan in loans:
                    if str(loan.xpath("dd[last()]/p/span/text()")[0]) == "100%":
                        continue
                    href = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@href")[0])
                    original_id = href.split("=")[1].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "https://www.my089.com/Loan/" + href
                        loan_obj.title = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@title")[0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("dd[4]/span/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(loan.xpath("dd[3]/span/text()")[0].encode("UTF-8")).strip().replace("%/年", "")
                        loan_obj.period = str(loan.xpath("dd[5]/span/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                        # e.g. "个月/按月还款": unit before the slash, repayment method after it
                        s = str(loan.xpath("dd[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("个", "")
                        loan_obj.period_unit = s.split("/")[0].strip()
                        loan_obj.repayment = s.split("/")[1].strip()
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 21
    url = "http://www.id68.cn/invest/index/borrow_status/9/p/1.html"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//ul[@class='ideal_con']/li")
        if len(loans) > 0:
            for loan in loans:
                # skip loans that have already finished
                if str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]) == "100.00%":
                    continue
                href = str(loan.xpath("table/tr[1]/td[1]/a/@href")[0].encode("utf-8"))
                original_id = href.replace(".html", "").split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = REFEREE + href
                    loan_obj.title = str(loan.xpath("table/tr[1]/td[1]/a/@title")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("table/tr[2]/td[last()]/span/text()")[0].encode("utf-8"))\
                        .strip().replace(" ", "").replace(",", "")
                    loan_obj.repayment = str(loan.xpath("table/tr[2]/td[4]/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.rate = str(loan.xpath("table/tr[2]/td[2]/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("table/tr[2]/td[3]/span/text()")[0].encode("utf-8")).strip()\
                        .replace(" ", "").replace("个月", "")
                    # the period text is expressed in months ("个月")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 23
    url = "https://member.niwodai.com/xiangmu/"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm, encoding="utf-8")
        loan_size = int(str(htm_obj.xpath("//div[@class='biaoList']/table/tbody/tr[1]/th[last()]/text()")[0].encode("utf-8"))
                        .replace("共", "").replace("个标", "").strip())
        if loan_size > 0:
            page = loan_size / 10
            if loan_size % 10 > 0:
                page += 1
            for p in range(1, page + 1):
                page_url = "https://member.niwodai.com/loan/loan.do?pageNo=%d&totalCount=%d" % (p, loan_size)
                page_html = download_page(page_url, request_headers)
                page_obj = parse_html(page_html, encoding="utf-8")
                loans = page_obj.xpath("//div[@class='biaoList']/table/tbody/tr")
                for loan in loans:
                    # skip the header row
                    if lxml.html.tostring(loan).find("<th>") > 0:
                        continue
                    href = str(loan.xpath("td[1]/a/@href")[0])
                    original_id = href.replace(".html", "").split("/")[2]
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = REFEREE + href
                        loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                        loan_obj.borrow_amount = str(loan.xpath("td[4]/em/text()")[0].encode("utf-8")).strip().replace(",", "")
                        loan_obj.rate = str(loan.xpath("td[2]/em/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.period = str(loan.xpath("td[3]/em/text()")[0].encode("utf-8")).strip()
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 24
    url = "http://www.he-pai.cn/investmentDetail/investmentDetails/ajaxInvmentList.do?pageNo=1"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["list"]
        if len(loans) > 0:
            for loan in loans:
                # skip loans that have already finished
                if str(loan["bID_SCHD"]) == "100":
                    continue
                original_id = loan["lN_NO"]
                href = "http://www.he-pai.cn/investmentDetail/memberCenter/transferView.do?ln_no=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = loan["lN_NM"]
                    loan_obj.borrow_amount = loan["lN_AMT"]
                    loan_obj.rate = loan["lN_RATE"]
                    loan_obj.period = loan["lN_TERM"]
                    loan_obj.period_unit = loan["lN_TERM_UNIT_DESC"]
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.repayment = loan["pAY_METH_DESC"]
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 9
    url = "https://list.lufax.com/list/service/product/fuying-product-list/listing/1"
    request_headers = {'Referee': "https://list.lufax.com/list/listing/fuying", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        loan_num = loans_json["totalCount"]
        if loans_json and loan_num:
            for i in range(0, loan_num):
                original_id = str(loans_json["data"][i]["productId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://list.lufax.com/list/productDetail?productId=%s" % original_id
                    loan_obj.title = loans_json["data"][i]["productNameDisplay"]
                    loan_obj.rate = str(float(loans_json["data"][i]["interestRate"]) * 100)
                    period = str(loans_json["data"][i]["investPeriodDisplay"].encode("utf-8"))
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = loans_json["data"][i]["collectionModeDisplay"]
                    loan_obj.borrow_amount = str(int(loans_json["data"][i]["price"]))
                    loan_obj.schedule = str(float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 22
    url = "https://www.weidai.com.cn/?m=Biao&t=today&pageIndex=1&pageSize=8&sortField=b.verify_time&sortOrder=desc&data=null"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["data"]
        if len(loans) > 0:
            for loan in loans:
                # skip loans that have already finished
                if str(loan["borrow_account_scale"]) == "100.00":
                    continue
                original_id = loan["uid"]
                href = "https://www.weidai.com.cn/page/borinfo.html?uid=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan["borrow_account_scale"])
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = loan["name"]
                    loan_obj.borrow_amount = loan["account"]
                    loan_obj.rate = loan["borrow_apr"]
                    loan_obj.period = loan["borrow_period"]
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan["borrow_account_scale"])
                    loan_obj.cast = loan["borrow_account_yes"]
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 17
    #url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=1&size=10&subtime=1411910662511&_=1411910662512"
    s = int(time.time() * 1000)
    e = s + 1
    url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=%d&size=10&subtime=%d&_=%d"
    url_1 = url % (0, s, e)
    request_headers = {'Referee': "http://www.touna.cn/invest-list.html", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url_1, request_headers)
        htm_json = loads(htm, encoding="UTF-8")
        page_count = htm_json["result"]["pages"]["count"]
        page = page_count / 10
        if page_count % 10 > 0:
            page += 1
        if page > 0:
            for p in range(0, page):
                # recompute the current timestamp for each page request
                s = int(time.time() * 1000)
                e = s + 1
                page_url = url % (p, s, e)
                loan_htm = download_page(page_url, request_headers)
                loans_json = loads(loan_htm, encoding="UTF-8")
                loans = loans_json["result"]["list"]
                for loan in loans:
                    original_id = str(loan["id"])
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.touna.cn/invest-page.html?id=%d" % int(original_id)
                        loan_obj.title = loan["name"]
                        loan_obj.borrow_amount = loan["account"]
                        loan_obj.rate = loan["apr"]
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.repayment = loan["style_name"]
                        period = str(loan["time_limit_name"].encode("utf-8"))
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    request_headers = {'Referee': "http://www.jimubox.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/@href")[0])
                if not href.find("Index") > 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/text()")[0].encode("utf-8"))
                    loan_obj.description = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-detail']/text()")[0].encode("utf-8")).strip()
                    # the listed amount is in units of 10,000 yuan (万), so append "0000"
                    loan_obj.borrow_amount = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-sum-money']/text()")[0].encode("utf-8"))\
                        .strip() + "0000"
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    rate = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-left']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    # a rate like "8+2" is a base rate plus a bonus; sum the two parts
                    if rate.find("+") > 0:
                        rate_list = rate.split("+")
                        loan_obj.rate = str(float(rate_list[0]) + float(rate_list[1]))
                    else:
                        loan_obj.rate = rate
                    loan_obj.repayment = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h6/span/text()")[0].encode("utf-8"))
                    loan_obj.period = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-right']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 2
    url = "http://www.ppdai.com/lend/12_s1_p1"
    request_headers = {'Referee': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='fen_ye_nav']/table/tr/td[last()]/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")
        for p in range(1, int(page) + 1):
            url = "http://www.ppdai.com/lend/12_s1_p" + str(p)
            logger.info("page url: %s", url)
            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath("//div[@class='lend_nav']/table/tr")
            if len(loans) > 0:
                for loan in loans:
                    # skip the header row
                    if lxml.html.tostring(loan).find("tit_nav") > 0:
                        continue
                    href = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@href")[0])
                    original_id = href.split("/")[2].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("td[last()]/p[1]/text()")[0].encode("UTF-8")).strip()\
                            .replace(" ", "").replace("%", "").split("完成")[1]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id)
                        loan_obj.original_id = original_id
                        loan_obj.href = "http://www.ppdai.com" + href
                        loan_obj.title = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@title")[0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(loan.xpath("td[4]/text()")[0]).strip().replace("%", "")
                        period = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = float(str(loan.xpath("td[last()]/p[1]/text()")[0].encode("UTF-8")).strip()
                                                  .replace(" ", "").replace("%", "").split("完成")[1])
                        loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                # skip loans that have already finished
                if loan.xpath("td[last()]/a/@href")[0].encode("utf-8") == "javascript:;":
                    continue
                href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
                original_id = href.split("id%3D")[1].encode("utf-8").strip()
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.xinhehui.com" + href
                    title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
                    if loan.xpath("td[1]/p[1]/a/em"):
                        title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
                    else:
                        title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.title = title_1 + title_2
                    borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "")
                    if borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
                    else:
                        loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", ""))
                    if loan.xpath("td[4]/span"):
                        period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
                    else:
                        period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
                    loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 13
    url = "http://www.yirendai.com/loan/list/1?period=12&currRate=&amt=&progress="
    request_headers = {'Referee': "http://www.yirendai.com/loan/list/1", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='bidList']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath("div[2]/div[2]/div/div[@class='bid_empty_errortip']"):
                    continue
                href = str(loan.xpath("div[2]/div/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.yirendai.com" + href
                    loan_obj.title = str(loan.xpath("div[2]/div/h3/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("div[2]/div/div[3]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period = str(loan.xpath("div[2]/div/div[4]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 3
    url = "http://www.91wangcai.com/invest/index.html"
    request_headers = {'Referee': "http://www.91wangcai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids already in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="gb2312")
        loans = loan_htm_parse.xpath("//div[@class='proBoxNew']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='hd']/a/@href")[0])
                original_id = href.split(".")[0].split("/")[2].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312")))\
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.91wangcai.com" + href
                    loan_obj.title = autodecode(str(loan.xpath("div[@class='hd']/a/text()")[0].encode("gb2312"))).encode("utf-8")
                    loan_obj.borrow_amount = autodecode(str(loan.xpath("div[@class='bd']/table/tr[1]/td[1]/em/text()")[0].encode("gb2312")))\
                        .encode("utf-8").replace("¥", "")
                    loan_obj.rate = str(loan.xpath("div[@class='bd']/table/tr[1]/td[2]/em/text()")[0]).strip().replace("%", "")
                    # the period cell mixes markup and entities; strip the <em> tags
                    # and unescape entities before parsing the unit
                    loan_period_text = lxml.html.tostring(loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0])\
                        .replace("<em>", "").replace("</em>", "")
                    html_parser = HTMLParser.HTMLParser()
                    period = html_parser.unescape(loan_period_text).encode("utf-8").strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[1]/text()")[0].encode("gb2312")))\
                        .encode("utf-8").replace("还款方式:", "")
                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312")))\
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())


def crawl():
    company_id = 20
    url = "http://www.xiaomabank.com/finance.do"
    request_headers = {'Referee': "http://www.xiaomabank.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='pil_main']/table[@class='pil_table']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))\
                        .replace("width:", "").strip().replace("%;", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.xiaomabank.com/" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[5]/strong/text()")[0].encode("utf-8")).strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).replace("个月", "").strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))\
                        .replace("width:", "").strip().replace("%;", "")

                    # note: the detail page comes back gzip-compressed and must be decompressed
                    resp = urllib2.urlopen(loan_obj.href)
                    respInfo = resp.info()
                    if ("Content-Encoding" in respInfo) and (respInfo['Content-Encoding'] == "gzip"):
                        respHtml = zlib.decompress(resp.read(), 16 + zlib.MAX_WBITS)
                        info_htm_parse = parse_html(respHtml, encoding="utf-8")
                        loan_obj.repayment = str(info_htm_parse.xpath("//div[@id='pi_lt_bottom']/div[1]/div[1]/a/text()")[0].encode("utf-8"))

                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
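# The detail-page fetch above inlines the gzip handling. The same logic as a
# reusable sketch (hypothetical helper, not part of the original crawler;
# 16 + zlib.MAX_WBITS tells zlib to expect a gzip header):
def read_page_body(resp):
    import zlib  # local import keeps the sketch self-contained
    # resp is a urllib2 response; decompress only when the server says gzip
    body = resp.read()
    if resp.info().get("Content-Encoding") == "gzip":
        body = zlib.decompress(body, 16 + zlib.MAX_WBITS)
    return body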
def crawl():
    company_id = 16
    #url = "http://www.itouzi.com/dinvest/invest/index"
    url = "http://www.itouzi.com/dinvest/debt/index"
    request_headers = {'Referee': "http://www.itouzi.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        # note: the ul class name ends with a trailing space
        loans = loan_htm_parse.xpath("//ul[@class='invest-product-case-list mtn btn clearfix ']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath("div[@class='i-p-c-subscription']/ul[@class='i-p-c-s-detail']"):
                    continue
                href = str(loan.xpath("h2/a[@class='fl']/@href")[0])
                original_id = href.split("id=")[1]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)
                    #loan_obj = Loan(company_id, original_id)
                    #loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    #loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.itouzi.com" + href
                    loan_obj.title = str(loan.xpath("h2/a[@class='fl']/text()")[0].encode("utf-8")).strip()
                    loan_obj.repayment = str(loan.xpath("p/span[2]/text()")[0].encode("utf-8"))\
                        .strip().replace("还款方式:", "")
                    loan_obj.borrow_amount = int(loan.xpath("p/span[3]/strong/text()")[0]) * 10000
                    loan_obj.rate = str(loan.xpath("p/span[5]/em[1]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    period = str(loan.xpath("p/span[4]/strong/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    # the schedule parsing is still unverified; re-check once a live loan is available
                    if loan.xpath("div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']"):
                        loan_obj.schedule = str(loan.xpath("div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']/span[1]/span[last()]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    print loan_obj.schedule
                    #loan_obj.db_create(db)
        #logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))
        #
        ## db ids - freshly crawled ids = ids to take offline
        #off_ids_set = db_ids_set - online_ids_set
        #if off_ids_set:
        #    loan_obj = Loan(company_id)
        #    loan_obj.db_offline(db, off_ids_set)
        #    logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 17
    #url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=1&size=10&subtime=1411910662511&_=1411910662512"
    s = int(time.time() * 1000)
    e = s + 1
    url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=%d&size=10&subtime=%d&_=%d"
    url_1 = url % (0, s, e)
    request_headers = {'Referee': "http://www.touna.cn/invest-list.html", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url_1, request_headers)
        htm_json = loads(htm, encoding="UTF-8")
        page_count = htm_json["result"]["pages"]["count"]
        page = page_count / 10
        if page_count % 10 > 0:
            page += 1
        if page > 0:
            for p in range(0, page):
                # recompute the current timestamp for each page request
                s = int(time.time() * 1000)
                e = s + 1
                page_url = url % (p, s, e)
                loan_htm = download_page(page_url, request_headers)
                loans_json = loads(loan_htm, encoding="UTF-8")
                loans = loans_json["result"]["list"]
                for loan in loans:
                    original_id = str(loan["id"])
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.touna.cn/invest-page.html?id=%d" % int(original_id)
                        loan_obj.title = loan["name"]
                        loan_obj.borrow_amount = loan["account"]
                        loan_obj.rate = loan["apr"]
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.repayment = loan["style_name"]
                        period = str(loan["time_limit_name"].encode("utf-8"))
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
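# The page arithmetic above (integer-divide, then add one page on a remainder)
# is ceiling division. An equivalent one-liner, shown as an illustrative
# sketch only (`total` and `size` are hypothetical names):
def num_pages(total, size=10):
    # number of pages needed to cover `total` items at `size` per page
    return (total + size - 1) // size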
def crawl():
    company_id = 23
    url = "https://member.niwodai.com/xiangmu/"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm, encoding="utf-8")
        loan_size = int(str(htm_obj.xpath("//div[@class='biaoList']/table/tbody/tr[1]/th[last()]/text()")[0].encode("utf-8"))\
            .replace("共", "").replace("个标", "").strip())
        if loan_size > 0:
            page = loan_size / 10
            if loan_size % 10 > 0:
                page += 1
            for p in range(1, page + 1):
                page_url = "https://member.niwodai.com/loan/loan.do?pageNo=%d&totalCount=%d" % (p, loan_size)
                page_html = download_page(page_url, request_headers)
                page_obj = parse_html(page_html, encoding="utf-8")
                loans = page_obj.xpath("//div[@class='biaoList']/table/tbody/tr")
                for loan in loans:
                    # skip the header row
                    if lxml.html.tostring(loan).find("<th>") > 0:
                        continue
                    href = str(loan.xpath("td[1]/a/@href")[0])
                    original_id = href.replace(".html", "").split("/")[2]
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = REFEREE + href
                        loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                        loan_obj.borrow_amount = str(loan.xpath("td[4]/em/text()")[0].encode("utf-8")).strip().replace(",", "")
                        loan_obj.rate = str(loan.xpath("td[2]/em/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.period = str(loan.xpath("td[3]/em/text()")[0].encode("utf-8")).strip()
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 25
    url = "http://www.tuandai.com/pages/ajax/invest_list.ashx?Cmd=GetInvest_List" \
          "&RepaymentTypeId=0&pagesize=5&pageindex=%s&type=3&status=1&DeadLine=0" \
          "&beginDeadLine=0&endDeadLine=0&rate=0&beginRate=0&endRate=0&strkey=&orderby=0"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page((url % 1), request_headers)
        obj = loads(htm, encoding="utf-8")
        total = int(obj["total"])
        if total > 0:
            page = total / 5
            if total % 5 > 0:
                page += 1
            for p in range(1, page + 1):
                htm = download_page((url % p), request_headers)
                htm_obj = loads(htm, encoding="utf-8")
                loans = htm_obj["projectList"]
                for loan in loans:
                    original_id = loan["ID"]
                    href = "http://www.tuandai.com/pages/invest/jing_detail.aspx?id=" + original_id
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = href
                        loan_obj.title = loan["Title"]
                        loan_obj.borrow_amount = loan["TotalAmount"]
                        loan_obj.rate = loan["YearRate"]
                        loan_obj.period = loan["Deadline"]
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.repayment = loan["RepaymentTypeDesc"]
                        loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 9
    url = "https://list.lufax.com/list/service/product/fuying-product-list/listing/1"
    request_headers = {'Referee': "https://list.lufax.com/list/listing/fuying", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        loan_num = loans_json["totalCount"]
        if loans_json and loan_num:
            for i in range(0, loan_num):
                original_id = str(loans_json["data"][i]["productId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://list.lufax.com/list/productDetail?productId=%s" % original_id
                    loan_obj.title = loans_json["data"][i]["productNameDisplay"]
                    loan_obj.rate = str(float(loans_json["data"][i]["interestRate"]) * 100)
                    period = str(loans_json["data"][i]["investPeriodDisplay"].encode("utf-8"))
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = loans_json["data"][i]["collectionModeDisplay"]
                    loan_obj.borrow_amount = str(int(loans_json["data"][i]["price"]))
                    loan_obj.schedule = str(float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
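# lufax reports progress as a fraction, so str(float(progress) * 100) above
# stores values like "100.0" with a trailing ".0", while the HTML crawlers
# store bare digit strings. If a uniform format were wanted, one hedged
# alternative (sketch only, not part of the original crawler):
def percent_str(fraction):
    # %g drops the trailing ".0" that str() keeps (e.g. 100.0 -> "100")
    return "%g" % (float(fraction) * 100)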
def crawl():
    company_id = 11
    url = "https://www.tzydb.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//div[@id='proList']/ul[@class='item_li']")
        if len(loans) > 0:
            for loan in loans:
                schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip()
                # skip loans that have already finished
                if schedule == "100%" or schedule == "100.0%":
                    continue
                # link = https://www.tzydb.com/boot/lookup/971,1017
                a_script = str(loan.xpath("li/div[1]/div[1]/div/a/@href")[0].encode("utf-8"))
                o_id = ID_RE.findall(a_script)[0]
                original_id = o_id.replace(",", "-")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.tzydb.com/boot/lookup/" + o_id
                    loan_obj.title = str(loan.xpath("li/div[1]/div[1]/div/a/text()")[0].encode("utf-8"))
                    loan_obj.borrow_amount = str(loan.xpath("li/div[2]/div[1]/span/text()")[0].encode("utf-8")).strip()\
                        .replace(" ", "").replace(",", "")
                    loan_obj.period = str(loan.xpath("li/div[2]/div[3]/span/text()")[0].encode("UTF-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.rate = str(loan.xpath("li/div[2]/div[2]/span/text()")[0]).strip().replace("%", "")
                    loan_obj.schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    request_headers = {'Referee': "http://www.jimubox.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/@href")[0])
                if not href.find("Index") > 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/text()")[0].encode("utf-8"))
                    loan_obj.description = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-detail']/text()")[0].encode("utf-8")).strip()
                    # the listed sum is in units of 万 (10,000 yuan); appending four zeros converts it to yuan
                    loan_obj.borrow_amount = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-sum-money']/text()")[0].encode("utf-8"))\
                        .strip() + "0000"
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    rate = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-left']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    # the rate may be quoted as "base+bonus", e.g. "12+2"
                    if rate.find("+") > 0:
                        rate_list = rate.split("+")
                        loan_obj.rate = str(float(rate_list[0]) + float(rate_list[1]))
                    else:
                        loan_obj.rate = rate
                    loan_obj.repayment = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h6/span/text()")[0].encode("utf-8"))
                    loan_obj.period = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-right']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
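# jimubox sometimes quotes the rate as "12+2" (base rate plus a subsidy),
# which the branch above sums. The same rule as a standalone sketch
# (hypothetical helper, mirroring the original semantics):
def parse_rate(text):
    if "+" in text:
        base, bonus = text.split("+", 1)
        return str(float(base) + float(bonus))
    return text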
def crawl():
    company_id = 12
    url = "http://www.renrendai.com/lend/loanList.action"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans_script = htm_obj.xpath("//script[@id='loan-list-rsp']/text()")[0].encode("utf-8")
        loans_json = loads(loans_script, encoding="UTF-8")
        loan_size = len(loans_json["data"]["loans"])
        if loan_size > 0:
            for i in range(0, loan_size):
                if loans_json["data"]["loans"][i]["status"] != "OPEN":
                    # skip loans that have already finished
                    continue
                original_id = str(int(loans_json["data"]["loans"][i]["loanId"]))
                href = "http://www.renrendai.com/lend/detailPage.action?loanId=%s" % original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(int(loans_json["data"]["loans"][i]["finishedRatio"]))
                    loan_obj.cast = str(float(loans_json["data"]["loans"][i]["amount"]) - float(loans_json["data"]["loans"][i]["surplusAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = str(loans_json["data"]["loans"][i]["title"].encode("utf-8"))
                    loan_obj.borrow_amount = str(loans_json["data"]["loans"][i]["amount"])
                    loan_obj.period = str(int(loans_json["data"]["loans"][i]["months"]))
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.rate = str(loans_json["data"]["loans"][i]["interest"])
                    loan_obj.cast = str(float(loans_json["data"]["loans"][i]["amount"]) - float(loans_json["data"]["loans"][i]["surplusAmount"]))
                    loan_obj.schedule = str(int(loans_json["data"]["loans"][i]["finishedRatio"]))
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if loan.xpath("td[last()]/a/@href")[0].encode("utf-8") == "javascript:;":
                    # skip loans that have already finished
                    continue
                href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
                original_id = href.split("id%3D")[1].encode("utf-8").strip()
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.xinhehui.com" + href
                    title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
                    if loan.xpath("td[1]/p[1]/a/em"):
                        title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
                    else:
                        title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.title = title_1 + title_2
                    borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "")
                    if borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
                    else:
                        loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", ""))

                    if loan.xpath("td[4]/span"):
                        period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
                    else:
                        period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
                    loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
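# xinhehui lists amounts either as "12.5万" or "125,000元"; the branch above
# converts the former into yuan. The same conversion condensed into one sketch
# (hypothetical helper; assumes these are the only two unit spellings):
def parse_amount(text):
    text = text.strip().replace(" ", "").replace(",", "")
    if "万" in text:
        return float(text.replace("万", "")) * 10000
    return float(text.replace("元", ""))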
def crawl():
    company_id = 24
    url = "http://www.he-pai.cn/investmentDetail/investmentDetails/ajaxInvmentList.do?pageNo=1"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["list"]
        if len(loans) > 0:
            for loan in loans:
                if str(loan["bID_SCHD"]) == "100":
                    # skip loans that have already finished
                    continue
                original_id = loan["lN_NO"]
                href = "http://www.he-pai.cn/investmentDetail/memberCenter/transferView.do?ln_no=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = loan["lN_NM"]
                    loan_obj.borrow_amount = loan["lN_AMT"]
                    loan_obj.rate = loan["lN_RATE"]
                    loan_obj.period = loan["lN_TERM"]
                    loan_obj.period_unit = loan["lN_TERM_UNIT_DESC"]
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.repayment = loan["pAY_METH_DESC"]
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
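# Every crawl() above ends with the same diff: any loan that is in the db but
# was not seen online this run gets taken offline. A condensed sketch of that
# shared tail (hypothetical helper; the real crawlers inline this logic):
def offline_missing(db, company_id, db_ids_set, online_ids_set):
    off_ids_set = db_ids_set - online_ids_set
    if off_ids_set:
        Loan(company_id).db_offline(db, off_ids_set)
        logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))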