def crawl():
    """Sync the Lufax "fuying" product listing into the loan table (company 9).

    Downloads the JSON listing at ``url`` and walks ``totalCount`` entries of
    its ``data`` array.  Ids already returned by the db query get their
    ``schedule`` and ``cast`` refreshed through ``Loan.db_update``; any other
    id gets a fully populated ``Loan`` stored through ``Loan.db_create``.
    Ids returned by the db query but not seen in this crawl are handed to
    ``Loan.db_offline``.

    Any ``Exception`` raised while downloading, parsing or storing is logged
    with its traceback and not re-raised.  Returns ``None``.
    """
    company_id = 9
    url = "https://list.lufax.com/list/service/product/fuying-product-list/listing/1"
    # NOTE(review): the standard HTTP header name is 'Referer'; the key is
    # kept as found -- confirm how download_page uses it before renaming.
    request_headers = {
        'Referee': "https://list.lufax.com/list/listing/fuying",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_rows = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # ids returned by the db query above
    db_ids_set = set(row[0].encode("utf-8") for row in db_rows)
    # every id seen during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids refreshed during this crawl
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        loan_num = loans_json["totalCount"]
        if loans_json and loan_num:
            # NOTE(review): the loop runs totalCount times; if "data" ever
            # holds fewer entries the IndexError lands in the handler below
            # and the offline step is skipped.
            for i in range(0, loan_num):
                product = loans_json["data"][i]
                original_id = str(product["productId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(float(product["progress"]) * 100)
                    loan_obj.cast = str(int(product["raisedAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://list.lufax.com/list/productDetail?productId=%s" % original_id
                    loan_obj.title = product["productNameDisplay"]
                    loan_obj.rate = str(float(product["interestRate"]) * 100)

                    # The displayed period carries its unit as a suffix;
                    # strip the unit and record it separately.
                    period = str(product["investPeriodDisplay"].encode("utf-8"))
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.repayment = product["collectionModeDisplay"]
                    loan_obj.borrow_amount = str(int(product["price"]))
                    loan_obj.schedule = str(float(product["progress"]) * 100)
                    loan_obj.cast = str(int(product["raisedAmount"]))
                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # ids from the db minus the ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate and the crawler can be stopped.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    """Sync the longlongweb.com invest listing into the loan table (company 26).

    Parses the listing page at ``url``; entries whose first ``dd/p`` markup
    does not contain ``href`` are skipped.  For every remaining entry the
    detail page is downloaded as well to read the invested amount.  Ids
    already returned by the db query are refreshed through
    ``Loan.db_update``; other ids are stored through ``Loan.db_create``.
    Ids returned by the db query but not seen in this crawl are handed to
    ``Loan.db_offline``.

    Any ``Exception`` raised along the way is logged with its traceback and
    not re-raised.  Returns ``None``.
    """
    company_id = 26
    url = "http://www.longlongweb.com/invests"
    # NOTE(review): the standard HTTP header name is 'Referer'; the key is
    # kept as found -- confirm how download_page uses it before renaming.
    request_headers = {'Referee': "http://www.longlongweb.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_rows = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids returned by the db query above
    db_ids_set = set(row[0].encode("utf-8") for row in db_rows)
    # every id seen during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids refreshed during this crawl
    update_ids_set = set()

    def _amount(raw):
        # Drop thousands separators and the currency sign from an amount text.
        return str(raw.encode("utf-8")).replace(",", "").replace("¥", "").strip()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="utf-8")
        loans = loan_htm_parse.xpath("//div[@class='main01']/span/dl")
        if len(loans) > 0:
            for loan in loans:
                if lxml.html.tostring(loan.xpath("dd/p[1]")[0]).find("href") <= 0:
                    continue
                href = str(loan.xpath("dd/table/tr[1]/td[1]/a/@href")[0])
                original_id = href.split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = _amount(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0])

                    # The invested amount is only read from the detail page.
                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = _amount(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0])
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_obj.title = str(loan.xpath("dd/table/tr[1]/td[1]/a/@title")[0].encode("utf-8"))
                    loan_obj.rate = str(loan.xpath("dd/table/tr[2]/td[2]/span/text()")[0].encode("utf-8"))\
                        .replace("%", "")
                    loan_obj.borrow_amount = _amount(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0])
                    loan_obj.period = str(loan.xpath("dd/table/tr[2]/td[3]/span/text()")[0])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = _amount(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0])
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.repayment = str(loan_info_htm_parse.xpath("//div[@class='enterprise-botton']/span[2]/text()")[0]
                                             .encode("utf-8")).strip().replace("还款方式:", "")
                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids from the db minus the ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate and the crawler can be stopped.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    """Sync the qian360.com product list into the loan table (company 19).

    Downloads the JSON product list at ``url`` and processes every entry of
    its ``list`` array whose ``status`` equals 1.  Ids already returned by
    the db query get ``schedule`` and ``cast`` refreshed through
    ``Loan.db_update``; other ids are stored through ``Loan.db_create``.
    Ids returned by the db query but not seen in this crawl are handed to
    ``Loan.db_offline``.

    Any ``Exception`` raised along the way is logged with its traceback and
    not re-raised.  Returns ``None``.
    """
    company_id = 19
    url = "https://www.qian360.com/bq/queryProductList.html?currentPage=1&pernum=12&type=0"
    # NOTE(review): the standard HTTP header name is 'Referer'; the key is
    # kept as found -- confirm how download_page uses it before renaming.
    request_headers = {
        'Referee': "https://www.qian360.com/tl/select.html",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_rows = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # ids returned by the db query above
    db_ids_set = set(row[0].encode("utf-8") for row in db_rows)
    # every id seen during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids refreshed during this crawl
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")

        if loans_json["list"]:
            for item in loans_json["list"]:
                # Only entries with status 1 are processed.
                if int(item["status"]) != 1:
                    continue
                original_id = str(item["borrowId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(item["percent"])
                    loan_obj.cast = str(int(item["accountYes"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.qian360.com/ti/detail.html?borrowId=%s" % original_id
                    loan_obj.title = item["name"]
                    loan_obj.rate = str(item["apr"])
                    loan_obj.period = str(item["totalPeriod"])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.borrow_amount = str(int(item["account"]))
                    loan_obj.schedule = str(item["percent"])
                    loan_obj.cast = str(int(item["accountYes"]))
                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # ids from the db minus the ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate and the crawler can be stopped.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    """Sync the jimubox.com project list into the loan table (company 7).

    Parses the project cards on the page at ``url``.  Cards whose link does
    not contain ``Index`` past position 0 are skipped.  Ids already returned
    by the db query get ``schedule`` and ``cast`` refreshed through
    ``Loan.db_update``; other ids are stored through ``Loan.db_create``.
    Ids returned by the db query but not seen in this crawl are handed to
    ``Loan.db_offline``.

    Any ``Exception`` raised along the way is logged with its traceback and
    not re-raised.  Returns ``None``.
    """
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    # NOTE(review): the standard HTTP header name is 'Referer'; the key is
    # kept as found -- confirm how download_page uses it before renaming.
    request_headers = {
        'Referee': "http://www.jimubox.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_rows = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # ids returned by the db query above
    db_ids_set = set(row[0].encode("utf-8") for row in db_rows)
    # every id seen during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids refreshed during this crawl
    update_ids_set = set()

    # Every field lives below this node of a project card.
    content = "div[@class='project-item']/div[@class='project-item-content']"
    progress_path = content + "/div[@class='progress project-progress']/div/@style"
    current_money_path = content + "/p[@class='project-info']/span[@class='project-current-money']/text()"

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath(
            "//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath(content + "/h4/a/@href")[0])
                if href.find("Index") <= 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    # The progress is read from the bar's inline
                    # "width:NN%" style.
                    loan_obj.schedule = str(loan.xpath(progress_path)[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.cast = str(loan.xpath(current_money_path)[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = str(
                        loan.xpath(content + "/h4/a/text()")[0].encode("utf-8"))
                    loan_obj.description = str(
                        loan.xpath(content + "/p[@class='project-detail']/text()")[0]
                        .encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(
                        loan.xpath(content + "/p[@class='project-info']/span[@class='project-sum-money']/text()")[0]
                        .encode("utf-8")).strip() + "0000"
                    loan_obj.cast = str(loan.xpath(current_money_path)[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")

                    rate = str(
                        loan.xpath(content + "/div[@class='project-other']/div[@class='project-other-left']/span/text()")[0]
                        .encode("utf-8")).strip()
                    if rate.find("+") > 0:
                        # A rate shown as "a+b" is stored as the sum.
                        rate_list = rate.split("+")
                        loan_obj.rate = str(
                            float(rate_list[0]) + float(rate_list[1]))
                    else:
                        loan_obj.rate = rate

                    loan_obj.repayment = str(
                        loan.xpath(content + "/h6/span/text()")[0].encode("utf-8"))
                    loan_obj.period = str(
                        loan.xpath(content + "/div[@class='project-other']/div[@class='project-other-right']/span/text()")[0]
                        .encode("utf-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath(progress_path)[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # ids from the db minus the ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate and the crawler can be stopped.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    """Sync the renrendai.com loan list into the loan table (company 12).

    Downloads the page at ``url``, extracts the JSON embedded in the
    ``loan-list-rsp`` script tag and processes every entry of
    ``data.loans`` whose ``status`` is ``"OPEN"``.  Ids already returned by
    the db query get ``schedule`` and ``cast`` refreshed through
    ``Loan.db_update``; other ids are stored through ``Loan.db_create``.
    Ids returned by the db query but not seen in this crawl are handed to
    ``Loan.db_offline``.

    Any ``Exception`` raised along the way is logged with its traceback and
    not re-raised.  Returns ``None``.
    """
    company_id = 12
    url = "http://www.renrendai.com/lend/loanList.action"
    # NOTE(review): the standard HTTP header name is 'Referer'; the key is
    # kept as found -- confirm how download_page uses it before renaming.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_rows = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids returned by the db query above
    db_ids_set = set(row[0].encode("utf-8") for row in db_rows)
    # every id seen during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids refreshed during this crawl
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans_script = htm_obj.xpath("//script[@id='loan-list-rsp']/text()")[0].encode("utf-8")
        loans_json = loads(loans_script, encoding="UTF-8")
        loans = loans_json["data"]["loans"]
        if len(loans) > 0:
            for item in loans:
                if item["status"] != "OPEN":
                    # skip loans that have already finished
                    continue
                original_id = str(int(item["loanId"]))
                href = "http://www.renrendai.com/lend/detailPage.action?loanId=%s" % original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(int(item["finishedRatio"]))
                    # invested so far = total amount - amount still open
                    loan_obj.cast = str(float(item["amount"]) - float(item["surplusAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = str(item["title"].encode("utf-8"))
                    loan_obj.borrow_amount = str(item["amount"])
                    loan_obj.period = str(int(item["months"]))
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.rate = str(item["interest"])
                    loan_obj.cast = str(float(item["amount"]) - float(item["surplusAmount"]))
                    loan_obj.schedule = str(int(item["finishedRatio"]))
                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids from the db minus the ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate and the crawler can be stopped.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    """Sync the tuandai.com invest list into the loan table (company 25).

    The listing is paged with 5 entries per page (``pagesize=5`` in the
    URL).  Page 1 is fetched first to read ``total``; then every page from 1
    up to the computed page count is downloaded and its ``projectList``
    processed.  Ids already returned by the db query get ``schedule`` and
    ``cast`` refreshed through ``Loan.db_update``; other ids are stored
    through ``Loan.db_create``.  Ids returned by the db query but not seen
    in this crawl are handed to ``Loan.db_offline``.

    Any ``Exception`` raised along the way is logged with its traceback and
    not re-raised.  Returns ``None``.
    """
    company_id = 25
    url = "http://www.tuandai.com/pages/ajax/invest_list.ashx?Cmd=GetInvest_List" \
          "&RepaymentTypeId=0&pagesize=5&pageindex=%s&type=3&status=1&DeadLine=0" \
          "&beginDeadLine=0&endDeadLine=0&rate=0&beginRate=0&endRate=0&strkey=&orderby=0"
    # NOTE(review): the standard HTTP header name is 'Referer'; the key is
    # kept as found -- confirm how download_page uses it before renaming.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_rows = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # ids returned by the db query above
    db_ids_set = set(row[0].encode("utf-8") for row in db_rows)
    # every id seen during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids refreshed during this crawl
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page((url % 1), request_headers)
        obj = loads(htm, encoding="utf-8")
        total = int(obj["total"])
        if total > 0:
            # Ceiling division by the page size; floor division keeps the
            # result an int even when "/" means true division.
            page = (total + 4) // 5
            for p in range(1, page + 1):
                htm = download_page((url % p), request_headers)
                htm_obj = loads(htm, encoding="utf-8")
                loans = htm_obj["projectList"]
                for loan in loans:
                    original_id = loan["ID"]
                    href = "http://www.tuandai.com/pages/invest/jing_detail.aspx?id=" + original_id
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = href
                        loan_obj.title = loan["Title"]
                        loan_obj.borrow_amount = loan["TotalAmount"]
                        loan_obj.rate = loan["YearRate"]
                        loan_obj.period = loan["Deadline"]
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.repayment = loan["RepaymentTypeDesc"]
                        loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # ids from the db minus the ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate and the crawler can be stopped.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    """Sync the yirendai.com loan list into the loan table (company 13).

    Parses the ``bidList`` items on the page at ``url``; items without a
    ``bid_empty_errortip`` node are skipped.  Ids already returned by the db
    query are refreshed through ``Loan.db_update``; other ids are stored
    through ``Loan.db_create``.  ``schedule`` is computed as
    ``cast / borrow_amount * 100``.  Ids returned by the db query but not
    seen in this crawl are handed to ``Loan.db_offline``.

    Any ``Exception`` raised along the way is logged with its traceback and
    not re-raised.  Returns ``None``.
    """
    company_id = 13
    url = "http://www.yirendai.com/loan/list/1?period=12&currRate=&amt=&progress="
    # NOTE(review): the standard HTTP header name is 'Referer'; the key is
    # kept as found -- confirm how download_page uses it before renaming.
    request_headers = {'Referee': "http://www.yirendai.com/loan/list/1", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_rows = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids returned by the db query above
    db_ids_set = set(row[0].encode("utf-8") for row in db_rows)
    # every id seen during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids refreshed during this crawl
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='bidList']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath("div[2]/div[2]/div/div[@class='bid_empty_errortip']"):
                    continue
                href = str(loan.xpath("div[2]/div/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    # Keep only what follows the "已投" marker, without the
                    # trailing "元".
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.yirendai.com" + href
                    loan_obj.title = str(loan.xpath("div[2]/div/h3/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("div[2]/div/div[3]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period = str(loan.xpath("div[2]/div/div[4]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids from the db minus the ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate and the crawler can be stopped.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    """Sync the yirendai.com loan list into the loan table (company 13).

    Parses the ``bidList`` items on the page at ``url``; items without a
    ``bid_empty_errortip`` node are skipped.  Ids already returned by the db
    query are refreshed through ``Loan.db_update``; other ids are stored
    through ``Loan.db_create``.  ``schedule`` is computed as
    ``cast / borrow_amount * 100``.  Ids returned by the db query but not
    seen in this crawl are handed to ``Loan.db_offline``.

    Any ``Exception`` raised along the way is logged with its traceback and
    not re-raised.  Returns ``None``.
    """
    company_id = 13
    url = "http://www.yirendai.com/loan/list/1?period=12&currRate=&amt=&progress="
    # NOTE(review): the standard HTTP header name is 'Referer'; the key is
    # kept as found -- confirm how download_page uses it before renaming.
    request_headers = {
        'Referee': "http://www.yirendai.com/loan/list/1",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_rows = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # ids returned by the db query above
    db_ids_set = set(row[0].encode("utf-8") for row in db_rows)
    # every id seen during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids refreshed during this crawl
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='bidList']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath(
                        "div[2]/div[2]/div/div[@class='bid_empty_errortip']"):
                    continue
                href = str(loan.xpath("div[2]/div/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(
                        loan.xpath("div[2]/div/div[2]/h4/span/text()")[0]
                        .encode("utf-8")).strip().replace(",", "")
                    # Keep only what follows the "已投" marker, without the
                    # trailing "元".
                    loan_obj.cast = str(
                        loan.xpath("div[2]/div/div[1]/p/text()")[0]
                        .encode("utf-8")).strip().replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(
                        float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.yirendai.com" + href
                    loan_obj.title = str(
                        loan.xpath("div[2]/div/h3/a/text()")[0]
                        .encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(
                        loan.xpath("div[2]/div/div[2]/h4/span/text()")[0]
                        .encode("utf-8")).strip().replace(",", "")
                    loan_obj.rate = str(
                        loan.xpath("div[2]/div/div[3]/h4/span/text()")[0]
                        .encode("utf-8")).strip()
                    loan_obj.period = str(
                        loan.xpath("div[2]/div/div[4]/h4/span/text()")[0]
                        .encode("utf-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.cast = str(
                        loan.xpath("div[2]/div/div[1]/p/text()")[0]
                        .encode("utf-8")).strip().replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(
                        float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # ids from the db minus the ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate and the crawler can be stopped.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    """Sync the tuandai.com invest list into the loan table (company 25).

    The listing is paged with 5 entries per page (``pagesize=5`` in the
    URL).  Page 1 is fetched first to read ``total``; then every page from 1
    up to the computed page count is downloaded and its ``projectList``
    processed.  Ids already returned by the db query get ``schedule`` and
    ``cast`` refreshed through ``Loan.db_update``; other ids are stored
    through ``Loan.db_create``.  Ids returned by the db query but not seen
    in this crawl are handed to ``Loan.db_offline``.

    Any ``Exception`` raised along the way is logged with its traceback and
    not re-raised.  Returns ``None``.
    """
    company_id = 25
    url = "http://www.tuandai.com/pages/ajax/invest_list.ashx?Cmd=GetInvest_List" \
          "&RepaymentTypeId=0&pagesize=5&pageindex=%s&type=3&status=1&DeadLine=0" \
          "&beginDeadLine=0&endDeadLine=0&rate=0&beginRate=0&endRate=0&strkey=&orderby=0"
    # NOTE(review): the standard HTTP header name is 'Referer'; the key is
    # kept as found -- confirm how download_page uses it before renaming.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_rows = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids returned by the db query above
    db_ids_set = set(row[0].encode("utf-8") for row in db_rows)
    # every id seen during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids refreshed during this crawl
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        first_page = loads(download_page((url % 1), request_headers), encoding="utf-8")
        total = int(first_page["total"])
        if total > 0:
            # Ceiling division by the page size; floor division keeps the
            # result an int even when "/" means true division.
            page = (total + 4) // 5
            for p in range(1, page + 1):
                htm = download_page((url % p), request_headers)
                htm_obj = loads(htm, encoding="utf-8")
                for loan in htm_obj["projectList"]:
                    original_id = loan["ID"]
                    href = "http://www.tuandai.com/pages/invest/jing_detail.aspx?id=" + original_id
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = href
                        loan_obj.title = loan["Title"]
                        loan_obj.borrow_amount = loan["TotalAmount"]
                        loan_obj.rate = loan["YearRate"]
                        loan_obj.period = loan["Deadline"]
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.repayment = loan["RepaymentTypeDesc"]
                        loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids from the db minus the ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate and the crawler can be stopped.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    """Sync the jimubox.com project list into the loan table (company 7).

    Parses the project cards on the page at ``url``.  Cards whose link does
    not contain ``Index`` past position 0 are skipped.  Ids already returned
    by the db query get ``schedule`` and ``cast`` refreshed through
    ``Loan.db_update``; other ids are stored through ``Loan.db_create``.
    Ids returned by the db query but not seen in this crawl are handed to
    ``Loan.db_offline``.

    Any ``Exception`` raised along the way is logged with its traceback and
    not re-raised.  Returns ``None``.
    """
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    # NOTE(review): the standard HTTP header name is 'Referer'; the key is
    # kept as found -- confirm how download_page uses it before renaming.
    request_headers = {'Referee': "http://www.jimubox.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_rows = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids returned by the db query above
    db_ids_set = set(row[0].encode("utf-8") for row in db_rows)
    # every id seen during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids refreshed during this crawl
    update_ids_set = set()

    # Every field lives below this node of a project card.
    card = "div[@class='project-item']/div[@class='project-item-content']"
    style_xpath = card + "/div[@class='progress project-progress']/div/@style"
    invested_xpath = card + "/p[@class='project-info']/span[@class='project-current-money']/text()"
    total_xpath = card + "/p[@class='project-info']/span[@class='project-sum-money']/text()"
    rate_xpath = card + "/div[@class='project-other']/div[@class='project-other-left']/span/text()"
    period_xpath = card + "/div[@class='project-other']/div[@class='project-other-right']/span/text()"

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath(card + "/h4/a/@href")[0])
                if href.find("Index") <= 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    # The progress is read from the bar's inline
                    # "width:NN%" style.
                    loan_obj.schedule = str(loan.xpath(style_xpath)[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.cast = str(loan.xpath(invested_xpath)[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = str(loan.xpath(card + "/h4/a/text()")[0].encode("utf-8"))
                    loan_obj.description = str(loan.xpath(card + "/p[@class='project-detail']/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath(total_xpath)[0].encode("utf-8"))\
                        .strip() + "0000"
                    loan_obj.cast = str(loan.xpath(invested_xpath)[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")

                    rate = str(loan.xpath(rate_xpath)[0].encode("utf-8"))\
                        .strip()
                    if rate.find("+") > 0:
                        # A rate shown as "a+b" is stored as the sum.
                        rate_list = rate.split("+")
                        loan_obj.rate = str(float(rate_list[0]) + float(rate_list[1]))
                    else:
                        loan_obj.rate = rate

                    loan_obj.repayment = str(loan.xpath(card + "/h6/span/text()")[0].encode("utf-8"))
                    loan_obj.period = str(loan.xpath(period_xpath)[0].encode("utf-8"))\
                        .strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath(style_xpath)[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids from the db minus the ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate and the crawler can be stopped.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    """Sync the qian360.com product list into the loan table (company 19).

    Downloads the JSON product list at ``url`` and processes every entry of
    its ``list`` array whose ``status`` equals 1.  Ids already returned by
    the db query get ``schedule`` and ``cast`` refreshed through
    ``Loan.db_update``; other ids are stored through ``Loan.db_create``.
    Ids returned by the db query but not seen in this crawl are handed to
    ``Loan.db_offline``.

    Any ``Exception`` raised along the way is logged with its traceback and
    not re-raised.  Returns ``None``.
    """
    company_id = 19
    url = "https://www.qian360.com/bq/queryProductList.html?currentPage=1&pernum=12&type=0"
    # NOTE(review): the standard HTTP header name is 'Referer'; the key is
    # kept as found -- confirm how download_page uses it before renaming.
    request_headers = {'Referee': "https://www.qian360.com/tl/select.html", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_rows = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids returned by the db query above
    db_ids_set = set(row[0].encode("utf-8") for row in db_rows)
    # every id seen during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids refreshed during this crawl
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")

        if loans_json["list"]:
            for entry in loans_json["list"]:
                # Only entries with status 1 are processed.
                if int(entry["status"]) != 1:
                    continue
                original_id = str(entry["borrowId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(entry["percent"])
                    loan_obj.cast = str(int(entry["accountYes"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.qian360.com/ti/detail.html?borrowId=%s" % original_id
                    loan_obj.title = entry["name"]
                    loan_obj.rate = str(entry["apr"])
                    loan_obj.period = str(entry["totalPeriod"])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.borrow_amount = str(int(entry["account"]))
                    loan_obj.schedule = str(entry["percent"])
                    loan_obj.cast = str(int(entry["accountYes"]))
                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids from the db minus the ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate and the crawler can be stopped.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    """Sync the Lufax "fuying" product listing into the loan table (company 9).

    Downloads the JSON listing at ``url`` and walks ``totalCount`` entries of
    its ``data`` array.  Ids already returned by the db query get their
    ``schedule`` and ``cast`` refreshed through ``Loan.db_update``; any other
    id gets a fully populated ``Loan`` stored through ``Loan.db_create``.
    Ids returned by the db query but not seen in this crawl are handed to
    ``Loan.db_offline``.

    Any ``Exception`` raised while downloading, parsing or storing is logged
    with its traceback and not re-raised.  Returns ``None``.
    """
    company_id = 9
    url = "https://list.lufax.com/list/service/product/fuying-product-list/listing/1"
    # NOTE(review): the standard HTTP header name is 'Referer'; the key is
    # kept as found -- confirm how download_page uses it before renaming.
    request_headers = {'Referee': "https://list.lufax.com/list/listing/fuying", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_rows = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids returned by the db query above
    db_ids_set = set(row[0].encode("utf-8") for row in db_rows)
    # every id seen during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids refreshed during this crawl
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        loan_num = loans_json["totalCount"]
        if loans_json and loan_num:
            # NOTE(review): the loop runs totalCount times; if "data" ever
            # holds fewer entries the IndexError lands in the handler below
            # and the offline step is skipped.
            for i in range(0, loan_num):
                entry = loans_json["data"][i]
                original_id = str(entry["productId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(float(entry["progress"]) * 100)
                    loan_obj.cast = str(int(entry["raisedAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://list.lufax.com/list/productDetail?productId=%s" % original_id
                    loan_obj.title = entry["productNameDisplay"]
                    loan_obj.rate = str(float(entry["interestRate"]) * 100)

                    # The displayed period carries its unit as a suffix;
                    # strip the unit and record it separately.
                    period = str(entry["investPeriodDisplay"].encode("utf-8"))
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.repayment = entry["collectionModeDisplay"]
                    loan_obj.borrow_amount = str(int(entry["price"]))
                    loan_obj.schedule = str(float(entry["progress"]) * 100)
                    loan_obj.cast = str(int(entry["raisedAmount"]))
                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids from the db minus the ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate and the crawler can be stopped.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    """Sync today's Weidai loan listing (company 22) with the loan table.

    Downloads the JSON listing at ``url`` and walks ``data``:

    * entries whose ``borrow_account_scale`` renders as ``"100.00"`` are
      skipped (already fully funded);
    * entries without a ``uid`` are skipped;
    * ids already stored with ``status=0`` get their ``schedule`` refreshed
      through ``Loan.db_update``;
    * all other ids are stored through ``Loan.db_create``.

    Stored ids that were not seen in the listing are handed to
    ``Loan.db_offline``.  Any ``Exception`` is logged with its traceback and
    swallowed.  Returns nothing.
    """
    company_id = 22
    url = "https://www.weidai.com.cn/?m=Biao&t=today&pageIndex=1&pageSize=8&sortField=b.verify_time&sortOrder=desc&data=null"
    # NOTE(review): "Referee" looks like a misspelling of the HTTP "Referer"
    # header. Left untouched because the effect on the remote site is unknown;
    # confirm before renaming.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    rows = db.execute("select original_id from loan where company_id=%s and status=0", company_id)

    # ids stored in the db with status=0 for this company
    db_ids_set = set(row[0].encode("utf-8") for row in rows)
    # every id seen in the freshly downloaded listing
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids refreshed during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["data"]

        # Iterating directly keeps the old behaviour: an empty list is a
        # no-op and a non-iterable value still raises into the handler below.
        for loan in loans:
            if str(loan["borrow_account_scale"]) == "100.00":
                # drop loans that are already finished
                continue

            original_id = loan["uid"]
            # Guard before building the href: concatenating a missing uid
            # would raise and abort the whole crawl instead of skipping it.
            if not original_id:
                continue
            href = "https://www.weidai.com.cn/page/borinfo.html?uid=" + original_id

            online_ids_set.add(original_id)
            loan_obj = Loan(company_id, original_id)

            if original_id in db_ids_set:
                update_ids_set.add(original_id)
                loan_obj.schedule = str(loan["borrow_account_scale"])
                loan_obj.db_update(db)
                continue

            new_ids_set.add(original_id)
            loan_obj.href = href
            loan_obj.title = loan["name"]
            loan_obj.borrow_amount = loan["account"]
            loan_obj.rate = loan["borrow_apr"]
            loan_obj.period = loan["borrow_period"]
            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
            loan_obj.schedule = str(loan["borrow_account_scale"])
            loan_obj.cast = loan["borrow_account_yes"]
            loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    """Sync the longlongweb.com investment listing (company 26) with the loan table.

    Parses the listing page at ``url``; each ``dl`` entry that carries a link
    yields an id taken from the third ``/``-separated segment of its href.
    For every such entry the detail page is downloaded to read the raised
    amount (``cast``), and ``schedule`` is derived as cast / borrow_amount * 100.

    * ids already stored with ``status=0`` are refreshed via ``Loan.db_update``;
    * all other ids additionally get title, rate, period and repayment filled
      in and are stored via ``Loan.db_create``.

    Stored ids that were not seen in the listing are handed to
    ``Loan.db_offline``.  Any ``Exception`` is logged with its traceback and
    swallowed.  Returns nothing.
    """
    company_id = 26
    url = "http://www.longlongweb.com/invests"
    # NOTE(review): "Referee" looks like a misspelling of the HTTP "Referer"
    # header. Left untouched because the effect on the remote site is unknown;
    # confirm before renaming.
    request_headers = {"Referee": "http://www.longlongweb.com", "User-Agent": DEFAULT_UA}

    def _clean_amount(text):
        # Drop thousands separators, the currency sign and outer whitespace.
        return str(text.encode("utf-8")).replace(",", "").replace("¥", "").strip()

    db = get_db_engine()
    rows = db.execute("select original_id from loan where company_id=%s and status=0", company_id)

    # ids stored in the db with status=0 for this company
    db_ids_set = set(row[0].encode("utf-8") for row in rows)
    # every id seen in the freshly downloaded listing
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids refreshed during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="utf-8")
        loans = loan_htm_parse.xpath("//div[@class='main01']/span/dl")

        for loan in loans:
            # Skip entries whose first dd/p markup contains no "href".
            if lxml.html.tostring(loan.xpath("dd/p[1]")[0]).find("href") <= 0:
                continue

            href = str(loan.xpath("dd/table/tr[1]/td[1]/a/@href")[0])
            original_id = href.split("/")[2]
            if not original_id:
                continue
            online_ids_set.add(original_id)

            # Fields needed for both the update and the create path.
            loan_obj = Loan(company_id, original_id)
            loan_obj.href = "http://www.longlongweb.com" + href
            loan_obj.borrow_amount = _clean_amount(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0])

            # The raised amount is read from the loan's detail page.
            loan_info_htm = download_page(loan_obj.href, request_headers)
            loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
            loan_obj.cast = _clean_amount(
                loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0]
            )
            loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)

            if original_id in db_ids_set:
                update_ids_set.add(original_id)
                loan_obj.db_update(db)
                continue

            new_ids_set.add(original_id)
            loan_obj.title = str(loan.xpath("dd/table/tr[1]/td[1]/a/@title")[0].encode("utf-8"))
            loan_obj.rate = str(
                loan.xpath("dd/table/tr[2]/td[2]/span/text()")[0].encode("utf-8")
            ).replace("%", "")
            loan_obj.period = str(loan.xpath("dd/table/tr[2]/td[3]/span/text()")[0])
            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
            loan_obj.repayment = (
                str(
                    loan_info_htm_parse.xpath(
                        "//div[@class='enterprise-botton']/span[2]/text()"
                    )[0].encode("utf-8")
                )
                .strip()
                .replace("还款方式:", "")
            )
            loan_obj.db_create(db)

        logger.info(
            "company %s crawler loan: new size %s, update size %s",
            company_id,
            len(new_ids_set),
            len(update_ids_set),
        )

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())