def main():
    print('========== Exercise 2.1.4 ==========\n')
    print('Testing Loan classmethod for monthly payment: {}'.format(Loan.calcMonthlyPmt(100000, .025, 360)))
    print('Testing Loan classmethod for balance: {}'.format(Loan.calcBal(100000, .025, 360, 60)))

    myLoan = Loan(360, .025, 100000)
    print("Monthly Payment: {}".format(myLoan.monthlyPayment()))
    print("Balance after 60 periods: {}".format(myLoan.balance(60)))
    print("Interest due on period 60: {}".format(myLoan.interestDue(60)))
    # principlaDue matches the (misspelled) method name defined on the Loan class
    print("Principal due on period 60: {}".format(myLoan.principlaDue(60)))
    print("The total payment should equal interest plus principal, which is {}".format(myLoan.interestDue(5) + myLoan.principlaDue(5)))
    print("Total Interest paid is {}".format(myLoan.totalInterest()))
    print("Total Payment is {}".format(myLoan.totalPayments()))

    """
    The benefit of the class-level method is that it lets us compute a
    payment or balance without instantiating an object.
    """

    print("Old rate {}".format(myLoan.rate))
    print("Old term {}".format(myLoan.term))
    print("Old face {}".format(myLoan.face))
    myLoan.rate = .035
    myLoan.term = 60
    myLoan.face = 20000
    print("New rate {}".format(myLoan.rate))
    print("New term {}".format(myLoan.term))
    print("New face {}".format(myLoan.face))
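# A minimal sketch (not the exercise's actual implementation) of the two
# classmethods called above, assuming the standard annuity formulas and a
# monthly-compounding convention (annual rate / 12). The name LoanSketch is
# hypothetical, to avoid shadowing the real Loan class.
class LoanSketch(object):
    @classmethod
    def calcMonthlyPmt(cls, face, rate, term):
        r = rate / 12.0
        # level annuity payment on a fully amortizing loan
        return face * r / (1 - (1 + r) ** -term)

    @classmethod
    def calcBal(cls, face, rate, term, period):
        r = rate / 12.0
        pmt = cls.calcMonthlyPmt(face, rate, term)
        # remaining balance = grown principal minus the grown payments
        return face * (1 + r) ** period - pmt * ((1 + r) ** period - 1) / r


# Callable without instantiating anything, which is the point made above:
# LoanSketch.calcMonthlyPmt(100000, .025, 360) is roughly 395.12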
def main():
    print('========== Exercise 2.1.3 ==========\n')
    myLoan = Loan(360, .025, 100000)
    print("Monthly Payment: {}".format(myLoan.monthlyPayment()))

    t = timer()
    t.start()
    print("Balance after 60 periods: {}".format(myLoan.balance(60)))
    t.end()
    t.start()
    print('Balance in period 60 computed recursively: {}'.format(myLoan.balanceRecursive(60, myLoan.face)))
    t.end()
    t.start()
    print("Interest due on period 60: {}".format(myLoan.interestDue(60)))
    t.end()
    t.start()
    print('Interest in period 60 computed recursively: {}'.format(myLoan.interestDueRecursive(60, myLoan.face)))
    t.end()
    t.start()
    print("Principal due on period 60: {}".format(myLoan.principlaDue(60)))
    t.end()
    t.start()
    print('Principal in period 60 computed recursively: {}'.format(myLoan.principalDueRecursive(60, myLoan.face)))
    t.end()

    """
    On my system, both the direct and the recursive versions of each function
    run too fast to report any time other than 0. The recursive versions are
    still likely to be much slower.
    """

    print("The total payment should equal interest plus principal, which is {}".format(myLoan.interestDue(5) + myLoan.principlaDue(5)))
    print("Total Interest paid is {}".format(myLoan.totalInterest()))
    print("Total Payment is {}".format(myLoan.totalPayments()))

    print("Old rate {}".format(myLoan.rate))
    print("Old term {}".format(myLoan.term))
    print("Old face {}".format(myLoan.face))
    myLoan.rate = .035
    myLoan.term = 60
    myLoan.face = 20000
    print("New rate {}".format(myLoan.rate))
    print("New term {}".format(myLoan.term))
    print("New face {}".format(myLoan.face))
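# timer() above is not defined in this section; a minimal stand-in with the
# same start()/end() interface might look like the hypothetical class below.
# time.time() has coarse resolution, which is consistent with the note about
# both versions reporting 0; the timeit module is the better tool for
# micro-benchmarks like these.
import time


class TimerSketch(object):
    def start(self):
        self._t0 = time.time()

    def end(self):
        # report wall-clock seconds since the matching start()
        print('elapsed: {}s'.format(time.time() - self._t0))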
def main():
    print('========== Exercise 2.1.2 ==========')
    myLoan = Loan(360, .025, 100000)
    print("Monthly Payment: {}".format(myLoan.monthlyPayment()))
    print("Balance after 360 periods: {}".format(myLoan.balance(360)))
    print("Interest due on period 360: {}".format(myLoan.interestDue(360)))
    print("Principal due on period 360: {}".format(myLoan.principlaDue(360)))
    print("The total payment should equal interest plus principal, which is {}".format(myLoan.interestDue(5) + myLoan.principlaDue(5)))
    print("Total Interest paid is {}".format(myLoan.totalInterest()))
    print("Total Payment is {}".format(myLoan.totalPayments()))

    print("Old rate {}".format(myLoan.rate))
    print("Old term {}".format(myLoan.term))
    print("Old face {}".format(myLoan.face))
    myLoan.rate = .035
    myLoan.term = 60
    myLoan.face = 20000
    print("New rate {}".format(myLoan.rate))
    print("New term {}".format(myLoan.term))
    print("New face {}".format(myLoan.face))
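# Quick sanity check of the figures printed above, assuming the standard
# annuity formula with monthly compounding (annual rate / 12):
_r = 0.025 / 12
_pmt = 100000 * _r / (1 - (1 + _r) ** -360)
assert abs(_pmt - 395.12) < 0.05  # 30 years at 2.5% on a 100,000 face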
def main():
    print('========== Exercise 2.1.5 ==========\n')
    print('Testing static method for monthly rate: {}'.format(Loan.monthlyRate(.025)))
    print('Testing static method for annual rate: {}'.format(Loan.annualRate(.01)))

    """
    The benefit of the static method is that neither the class nor the
    instance is passed in. This lets us attach functions that are useful to
    the class but do not depend on class or instance state. It also helps
    organize the code: a rate conversion is not logically part of a loan
    object, but the Loan class is the most sensible place to keep it.
    """

    myLoan = Loan(360, .025, 100000)
    print("Monthly Payment: {}".format(myLoan.monthlyPayment()))
    print("Balance after 60 periods: {}".format(myLoan.balance(60)))
    print("Interest due on period 60: {}".format(myLoan.interestDue(60)))
    print("Principal due on period 60: {}".format(myLoan.principlaDue(60)))
    print("The total payment should equal interest plus principal, which is {}".format(myLoan.interestDue(5) + myLoan.principlaDue(5)))
    print("Total Interest paid is {}".format(myLoan.totalInterest()))
    print("Total Payment is {}".format(myLoan.totalPayments()))

    print("Old rate {}".format(myLoan.rate))
    print("Old term {}".format(myLoan.term))
    print("Old face {}".format(myLoan.face))
    myLoan.rate = .035
    myLoan.term = 60
    myLoan.face = 20000
    print("New rate {}".format(myLoan.rate))
    print("New term {}".format(myLoan.term))
    print("New face {}".format(myLoan.face))
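# A minimal sketch of the two static methods tested above, assuming the
# simple divide/multiply-by-12 convention; a compounding-consistent variant
# would use (1 + annual) ** (1 / 12.0) - 1 instead. RateSketch is a
# hypothetical holder; in the exercise these live on Loan itself.
class RateSketch(object):
    @staticmethod
    def monthlyRate(annual_rate):
        return annual_rate / 12.0

    @staticmethod
    def annualRate(monthly_rate):
        return monthly_rate * 12.0


assert abs(RateSketch.annualRate(RateSketch.monthlyRate(.025)) - .025) < 1e-12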
def crawl():
    company_id = 9
    url = "https://list.lufax.com/list/service/product/fuying-product-list/listing/1"
    request_headers = {'Referee': "https://list.lufax.com/list/listing/fuying", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        loan_num = loans_json["totalCount"]
        if loans_json and loan_num:
            for i in range(0, loan_num):
                original_id = str(loans_json["data"][i]["productId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://list.lufax.com/list/productDetail?productId=%s" % original_id
                    loan_obj.title = loans_json["data"][i]["productNameDisplay"]
                    loan_obj.rate = str(float(loans_json["data"][i]["interestRate"]) * 100)
                    period = str(loans_json["data"][i]["investPeriodDisplay"].encode("utf-8"))
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = loans_json["data"][i]["collectionModeDisplay"]
                    loan_obj.borrow_amount = str(int(loans_json["data"][i]["price"]))
                    loan_obj.schedule = str(float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
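# Every crawl() in this section does the same set bookkeeping; a standalone
# illustration of the three derived sets, using made-up ids:
db_ids_demo = {"a1", "a2", "a3"}      # status=0 rows already stored
online_ids_demo = {"a2", "a3", "a4"}  # ids scraped on this run

assert online_ids_demo & db_ids_demo == {"a2", "a3"}  # seen before -> db_update
assert online_ids_demo - db_ids_demo == {"a4"}        # first sighting -> db_create
assert db_ids_demo - online_ids_demo == {"a1"}        # gone from site -> db_offline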
def crawl():
    company_id = 23
    url = "https://member.niwodai.com/xiangmu/"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm, encoding="utf-8")
        loan_size = int(str(htm_obj.xpath("//div[@class='biaoList']/table/tbody/tr[1]/th[last()]/text()")[0].encode("utf-8"))
                        .replace("共", "").replace("个标", "").strip())
        if loan_size > 0:
            page = loan_size / 10
            if loan_size % 10 > 0:
                page += 1
            for p in range(1, page + 1):
                page_url = "https://member.niwodai.com/loan/loan.do?pageNo=%d&totalCount=%d" % (p, loan_size)
                page_html = download_page(page_url, request_headers)
                page_obj = parse_html(page_html, encoding="utf-8")
                loans = page_obj.xpath("//div[@class='biaoList']/table/tbody/tr")
                for loan in loans:
                    # skip the header row
                    if lxml.html.tostring(loan).find("<th>") > 0:
                        continue
                    href = str(loan.xpath("td[1]/a/@href")[0])
                    original_id = href.replace(".html", "").split("/")[2]
                    if original_id:
                        online_ids_set.add(original_id)

                        if original_id in db_ids_set:
                            update_ids_set.add(original_id)

                            loan_obj = Loan(company_id, original_id)
                            loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")
                            loan_obj.db_update(db)
                        else:
                            new_ids_set.add(original_id)

                            loan_obj = Loan(company_id, original_id)
                            loan_obj.href = REFEREE + href
                            loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                            loan_obj.borrow_amount = str(loan.xpath("td[4]/em/text()")[0].encode("utf-8")).strip().replace(",", "")
                            loan_obj.rate = str(loan.xpath("td[2]/em/text()")[0].encode("utf-8")).strip().replace("%", "")
                            loan_obj.period = str(loan.xpath("td[3]/em/text()")[0].encode("utf-8")).strip()
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                            loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")
                            loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
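# The page count above is a ceiling division spelled out as "/ 10" plus a
# remainder check; the same result as a single integer-division expression:
def ceil_pages(total, per_page=10):
    return (total + per_page - 1) // per_page


assert ceil_pages(0) == 0
assert ceil_pages(10) == 1
assert ceil_pages(11) == 2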
def crawl():
    company_id = 17
    # url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=1&size=10&subtime=1411910662511&_=1411910662512"
    s = int(time.time() * 1000)
    e = s + 1
    url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=%d&size=10&subtime=%d&_=%d"
    url_1 = url % (0, s, e)
    request_headers = {'Referee': "http://www.touna.cn/invest-list.html", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url_1, request_headers)
        htm_json = loads(htm, encoding="UTF-8")
        page_count = htm_json["result"]["pages"]["count"]
        page = page_count / 10
        if page_count % 10 > 0:
            page += 1
        if page > 0:
            for p in range(0, page):
                # recompute the current timestamp for each page request
                s = int(time.time() * 1000)
                e = s + 1
                page_url = url % (p, s, e)
                loan_htm = download_page(page_url, request_headers)
                loans_json = loads(loan_htm, encoding="UTF-8")
                loans = loans_json["result"]["list"]
                for loan in loans:
                    original_id = str(loan["id"])
                    if original_id:
                        online_ids_set.add(original_id)

                        if original_id in db_ids_set:
                            update_ids_set.add(original_id)

                            loan_obj = Loan(company_id, original_id)
                            loan_obj.schedule = str(loan["score"])
                            loan_obj.db_update(db)
                        else:
                            new_ids_set.add(original_id)

                            loan_obj = Loan(company_id, original_id)
                            loan_obj.href = "http://www.touna.cn/invest-page.html?id=%d" % int(original_id)
                            loan_obj.title = loan["name"]
                            loan_obj.borrow_amount = loan["account"]
                            loan_obj.rate = loan["apr"]
                            loan_obj.schedule = str(loan["score"])
                            loan_obj.repayment = loan["style_name"]
                            period = str(loan["time_limit_name"].encode("utf-8"))
                            if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                                loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                                loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                            else:
                                loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                                loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                            loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 16
    # url = "http://www.itouzi.com/dinvest/invest/index"
    url = "http://www.itouzi.com/dinvest/debt/index"
    request_headers = {'Referee': "http://www.itouzi.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        # note the trailing space inside the ul class attribute
        loans = loan_htm_parse.xpath("//ul[@class='invest-product-case-list mtn btn clearfix ']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath("div[@class='i-p-c-subscription']/ul[@class='i-p-c-s-detail']"):
                    continue
                href = str(loan.xpath("h2/a[@class='fl']/@href")[0])
                original_id = href.split("id=")[1]
                if original_id:
                    online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)
                        # loan_obj = Loan(company_id, original_id)
                        # loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        # loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.itouzi.com" + href
                        loan_obj.title = str(loan.xpath("h2/a[@class='fl']/text()")[0].encode("utf-8")).strip()
                        loan_obj.repayment = str(loan.xpath("p/span[2]/text()")[0].encode("utf-8")) \
                            .strip().replace("还款方式:", "")
                        loan_obj.borrow_amount = int(loan.xpath("p/span[3]/strong/text()")[0]) * 10000
                        loan_obj.rate = str(loan.xpath("p/span[5]/em[1]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        period = str(loan.xpath("p/span[4]/strong/text()")[0].encode("utf-8")).strip()
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        # the schedule parsing is still unverified; re-check once a live loan is listed
                        if loan.xpath("div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']"):
                            loan_obj.schedule = str(loan.xpath("div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']/span[1]/span[last()]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        print(loan_obj.schedule)
                        # loan_obj.db_create(db)

        # logger.info("company %s crawler loan: new size %s, update size %s",
        #             company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        # off_ids_set = db_ids_set - online_ids_set
        # if off_ids_set:
        #     loan_obj = Loan(company_id)
        #     loan_obj.db_offline(db, off_ids_set)
        #     logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 3
    url = "http://www.91wangcai.com/invest/index.html"
    request_headers = {'Referee': "http://www.91wangcai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="gb2312")
        loans = loan_htm_parse.xpath("//div[@class='proBoxNew']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='hd']/a/@href")[0])
                original_id = href.split(".")[0].split("/")[2].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                            .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.91wangcai.com" + href
                        loan_obj.title = autodecode(str(loan.xpath("div[@class='hd']/a/text()")[0].encode("gb2312"))).encode("utf-8")
                        loan_obj.borrow_amount = autodecode(str(loan.xpath("div[@class='bd']/table/tr[1]/td[1]/em/text()")[0].encode("gb2312"))) \
                            .encode("utf-8").replace("¥", "")
                        loan_obj.rate = str(loan.xpath("div[@class='bd']/table/tr[1]/td[2]/em/text()")[0]).strip().replace("%", "")
                        # strip the <em> wrapper, then unescape HTML entities in the period cell
                        loan_period_text = lxml.html.tostring(loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0]) \
                            .replace("<em>", "").replace("</em>", "")
                        html_parser = HTMLParser.HTMLParser()
                        period = html_parser.unescape(loan_period_text).encode("utf-8").strip()
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.repayment = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[1]/text()")[0].encode("gb2312"))) \
                            .encode("utf-8").replace("还款方式:", "")
                        loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                            .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
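# autodecode() is not defined in this section; a hypothetical stand-in that
# tries the encodings these pages are known to use, in order:
def autodecode_sketch(raw):
    for enc in ("utf-8", "gb2312", "gbk"):
        try:
            return raw.decode(enc)
        except UnicodeDecodeError:
            continue
    return raw.decode("utf-8", "replace")  # last resort: never raise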
def crawl():
    company_id = 12
    url = "http://www.renrendai.com/lend/loanList.action"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        # the loan list is embedded in the page as a JSON <script> block
        loans_script = htm_obj.xpath("//script[@id='loan-list-rsp']/text()")[0].encode("utf-8")
        loans_json = loads(loans_script, encoding="UTF-8")
        loan_size = len(loans_json["data"]["loans"])
        if loan_size > 0:
            for i in range(0, loan_size):
                # skip loans that have already closed
                if loans_json["data"]["loans"][i]["status"] != "OPEN":
                    continue
                original_id = str(int(loans_json["data"]["loans"][i]["loanId"]))
                href = "http://www.renrendai.com/lend/detailPage.action?loanId=%s" % original_id
                if original_id:
                    online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(int(loans_json["data"]["loans"][i]["finishedRatio"])).split(".")[0]
                        loan_obj.cast = str(float(loans_json["data"]["loans"][i]["amount"]) - float(loans_json["data"]["loans"][i]["surplusAmount"]))
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = href
                        loan_obj.title = str(loans_json["data"]["loans"][i]["title"].encode("utf-8"))
                        loan_obj.borrow_amount = str(loans_json["data"]["loans"][i]["amount"])
                        loan_obj.period = str(int(loans_json["data"]["loans"][i]["months"]))
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.rate = str(loans_json["data"]["loans"][i]["interest"])
                        loan_obj.cast = str(float(loans_json["data"]["loans"][i]["amount"]) - float(loans_json["data"]["loans"][i]["surplusAmount"]))
                        loan_obj.schedule = str(int(loans_json["data"]["loans"][i]["finishedRatio"])).split(".")[0]
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 22
    url = "https://www.weidai.com.cn/?m=Biao&t=today&pageIndex=1&pageSize=8&sortField=b.verify_time&sortOrder=desc&data=null"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["data"]
        if len(loans) > 0:
            for loan in loans:
                # skip loans that have already been fully funded
                if str(loan["borrow_account_scale"]) == "100.00":
                    continue
                original_id = loan["uid"]
                href = "https://www.weidai.com.cn/page/borinfo.html?uid=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["borrow_account_scale"])
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = href
                        loan_obj.title = loan["name"]
                        loan_obj.borrow_amount = loan["account"]
                        loan_obj.rate = loan["borrow_apr"]
                        loan_obj.period = loan["borrow_period"]
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan["borrow_account_scale"])
                        loan_obj.cast = loan["borrow_account_yes"]
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    request_headers = {'Referee': "http://www.jimubox.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/@href")[0])
                if not href.find("Index") > 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        # schedule is rendered as the width of the progress bar
                        loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0]) \
                            .replace("width:", "").strip().replace("%", "")
                        loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8")) \
                            .strip().replace("/", "").replace(",", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.jimubox.com" + href
                        loan_obj.title = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/text()")[0].encode("utf-8"))
                        loan_obj.description = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-detail']/text()")[0].encode("utf-8")).strip()
                        # the sum is quoted in 万; appending four zeros converts it to yuan
                        loan_obj.borrow_amount = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-sum-money']/text()")[0].encode("utf-8")) \
                            .strip() + "0000"
                        loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8")) \
                            .strip().replace("/", "").replace(",", "")
                        rate = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-left']/span/text()")[0].encode("utf-8")).strip()
                        # rates quoted as "base+bonus" are summed into a single figure
                        if rate.find("+") > 0:
                            rate_list = rate.split("+")
                            loan_obj.rate = str(float(rate_list[0]) + float(rate_list[1]))
                        else:
                            loan_obj.rate = rate
                        loan_obj.repayment = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h6/span/text()")[0].encode("utf-8"))
                        loan_obj.period = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-right']/span/text()")[0].encode("utf-8")).strip()
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0]) \
                            .replace("width:", "").strip().replace("%", "")
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 19
    url = "https://www.qian360.com/bq/queryProductList.html?currentPage=1&pernum=12&type=0"
    request_headers = {'Referee': "https://www.qian360.com/tl/select.html", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        if loans_json["list"]:
            for i in range(0, len(loans_json["list"])):
                # skip loans whose status is not 1
                if int(loans_json["list"][i]["status"]) != 1:
                    continue
                original_id = str(loans_json["list"][i]["borrowId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.qian360.com/ti/detail.html?borrowId=%s" % original_id
                    loan_obj.title = loans_json["list"][i]["name"]
                    loan_obj.rate = str(loans_json["list"][i]["apr"])
                    loan_obj.period = str(loans_json["list"][i]["totalPeriod"])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.borrow_amount = str(int(loans_json["list"][i]["account"]))
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 24
    url = "http://www.he-pai.cn/investmentDetail/investmentDetails/ajaxInvmentList.do?pageNo=1"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["list"]
        if len(loans) > 0:
            for loan in loans:
                # skip loans that have already been fully funded
                if str(loan["bID_SCHD"]) == "100":
                    continue
                original_id = loan["lN_NO"]
                href = "http://www.he-pai.cn/investmentDetail/memberCenter/transferView.do?ln_no=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["bID_SCHD"])
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = href
                        loan_obj.title = loan["lN_NM"]
                        loan_obj.borrow_amount = loan["lN_AMT"]
                        loan_obj.rate = loan["lN_RATE"]
                        loan_obj.period = loan["lN_TERM"]
                        loan_obj.period_unit = loan["lN_TERM_UNIT_DESC"]
                        loan_obj.schedule = str(loan["bID_SCHD"])
                        loan_obj.repayment = loan["pAY_METH_DESC"]
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                # skip loans that have already closed
                if loan.xpath("td[last()]/a/@href")[0].encode("utf-8") == "javascript:;":
                    continue
                href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
                original_id = href.split("id%3D")[1].encode("utf-8").strip()
                if original_id:
                    online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        if loan.xpath("td[7]/div/a"):
                            loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        else:
                            loan_obj.schedule = "0"
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "https://www.xinhehui.com" + href
                        title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
                        if loan.xpath("td[1]/p[1]/a/em"):
                            title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
                        else:
                            title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
                        loan_obj.title = title_1 + title_2
                        # amounts may be quoted in 万 (10,000s of yuan) or plain 元
                        borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "")
                        if borrow_amount.find("万") > 0:
                            loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
                        else:
                            loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", ""))
                        if loan.xpath("td[4]/span"):
                            period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
                        else:
                            period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
                        loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                        if loan.xpath("td[7]/div/a"):
                            loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        else:
                            loan_obj.schedule = "0"
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
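# Several crawlers here convert amounts quoted in 万 (units of 10,000 yuan)
# into plain yuan; a hypothetical shared helper for that pattern:
def to_yuan(text):
    text = text.replace(",", "").replace(" ", "")
    if text.find("万") >= 0:
        return float(text.replace("万", "")) * 10000
    return float(text.replace("元", ""))


assert to_yuan("12万") == 120000.0
assert to_yuan("3,500元") == 3500.0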
def crawl():
    company_id = 15
    url = "https://www.iqianbang.com/invest"
    request_headers = {'Referee': "https://www.iqianbang.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='item']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                # only keep loans still raising funds ("融资中")
                if str(loan.xpath("td[7]/text()")[0].encode("utf-8")).strip() != "融资中":
                    continue
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("-")[3].replace(".shtml", "")
                if original_id:
                    online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "https://www.iqianbang.com" + href
                        loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                        loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace(",", "").replace("元", "")
                        if loan_obj.borrow_amount.find("万") > 0:
                            loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                        loan_obj.rate = str(loan.xpath("td[2]/span/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                        period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")

                        # the repayment method only appears on the detail page
                        loan_info_htm = download_page(loan_obj.href, headers={'Referee': url, 'User-Agent': DEFAULT_UA})
                        loan_info_htm_parse = parse_html(loan_info_htm, encoding="UTF-8")
                        loan_obj.repayment = str(loan_info_htm_parse.xpath("//div[@class='inright']/table[@class='idetable']")[0]
                                                 .xpath("tr[2]/td[2]/span/text()")[0].encode("utf-8")).strip()
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 13
    url = "http://www.yirendai.com/loan/list/1?period=12&currRate=&amt=&progress="
    request_headers = {'Referee': "http://www.yirendai.com/loan/list/1", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='bidList']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath("div[2]/div[2]/div/div[@class='bid_empty_errortip']"):
                    continue
                href = str(loan.xpath("div[2]/div/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8")) \
                            .strip().replace(",", "")
                        # the cast text looks like "已投xxx元"; keep the amount after "已投"
                        loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip() \
                            .replace("元", "").split("已投")[1]
                        loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.yirendai.com" + href
                        loan_obj.title = str(loan.xpath("div[2]/div/h3/a/text()")[0].encode("utf-8")).strip()
                        loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8")) \
                            .strip().replace(",", "")
                        loan_obj.rate = str(loan.xpath("div[2]/div/div[3]/h4/span/text()")[0].encode("utf-8")).strip()
                        loan_obj.period = str(loan.xpath("div[2]/div/div[4]/h4/span/text()")[0].encode("utf-8")).strip()
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip() \
                            .replace("元", "").split("已投")[1]
                        loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 26
    url = "http://www.longlongweb.com/invests"
    request_headers = {'Referee': "http://www.longlongweb.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="utf-8")
        loans = loan_htm_parse.xpath("//div[@class='main01']/span/dl")
        if len(loans) > 0:
            for loan in loans:
                if not lxml.html.tostring(loan.xpath("dd/p[1]")[0]).find("href") > 0:
                    continue
                href = str(loan.xpath("dd/table/tr[1]/td[1]/a/@href")[0])
                original_id = href.split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8")) \
                            .replace(",", "").replace("¥", "").strip()
                        loan_obj.href = "http://www.longlongweb.com" + href
                        loan_info_htm = download_page(loan_obj.href, request_headers)
                        loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                        loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0].encode("utf-8")) \
                            .replace(",", "").replace("¥", "").strip()
                        loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.longlongweb.com" + href
                        loan_obj.title = str(loan.xpath("dd/table/tr[1]/td[1]/a/@title")[0].encode("utf-8"))
                        loan_obj.rate = str(loan.xpath("dd/table/tr[2]/td[2]/span/text()")[0].encode("utf-8")).replace("%", "")
                        loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8")) \
                            .replace(",", "").replace("¥", "").strip()
                        loan_obj.period = str(loan.xpath("dd/table/tr[2]/td[3]/span/text()")[0])
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_info_htm = download_page(loan_obj.href, request_headers)
                        loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                        loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0].encode("utf-8")) \
                            .replace(",", "").replace("¥", "").strip()
                        loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                        loan_obj.repayment = str(loan_info_htm_parse.xpath("//div[@class='enterprise-botton']/span[2]/text()")[0].encode("utf-8")) \
                            .strip().replace("还款方式:", "")
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 20
    url = "http://www.xiaomabank.com/finance.do"
    request_headers = {'Referee': "http://www.xiaomabank.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='pil_main']/table[@class='pil_table']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        # schedule is rendered as the width of a progress bar, e.g. "width:42%;"
                        loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8")).replace("width:", "").strip().replace("%;", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.xiaomabank.com/" + href
                        loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                        loan_obj.borrow_amount = str(loan.xpath("td[5]/strong/text()")[0].encode("utf-8")).strip().replace(",", "")
                        loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).replace("个月", "").strip()
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8")).replace("width:", "").strip().replace("%;", "")

                        # the detail page comes back gzip-compressed and must be decompressed
                        resp = urllib2.urlopen(loan_obj.href)
                        respInfo = resp.info()
                        if ("Content-Encoding" in respInfo) and (respInfo['Content-Encoding'] == "gzip"):
                            respHtml = zlib.decompress(resp.read(), 16 + zlib.MAX_WBITS)
                            info_htm_parse = parse_html(respHtml, encoding="utf-8")
                            loan_obj.repayment = str(info_htm_parse.xpath("//div[@id='pi_lt_bottom']/div[1]/div[1]/a/text()")[0].encode("utf-8"))
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
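# Why 16 + zlib.MAX_WBITS above: a wbits value of MAX_WBITS + 16 tells zlib
# to expect a gzip wrapper rather than a raw zlib stream. Round-trip check:
import gzip, io, zlib

_buf = io.BytesIO()
_gz = gzip.GzipFile(fileobj=_buf, mode="wb")
_gz.write(b"hello")
_gz.close()
assert zlib.decompress(_buf.getvalue(), 16 + zlib.MAX_WBITS) == b"hello"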
def crawl():
    company_id = 21
    url = "http://www.id68.cn/invest/index/borrow_status/9/p/1.html"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//ul[@class='ideal_con']/li")
        if len(loans) > 0:
            for loan in loans:
                # skip loans that have already been fully funded
                if str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]) == "100.00%":
                    continue
                href = str(loan.xpath("table/tr[1]/td[1]/a/@href")[0].encode("utf-8"))
                original_id = href.replace(".html", "").split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = REFEREE + href
                        loan_obj.title = str(loan.xpath("table/tr[1]/td[1]/a/@title")[0].encode("utf-8")).strip()
                        loan_obj.borrow_amount = str(loan.xpath("table/tr[2]/td[last()]/span/text()")[0].encode("utf-8")) \
                            .strip().replace(" ", "").replace(",", "")
                        loan_obj.repayment = str(loan.xpath("table/tr[2]/td[4]/span/text()")[0].encode("utf-8")).strip()
                        loan_obj.rate = str(loan.xpath("table/tr[2]/td[2]/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.period = str(loan.xpath("table/tr[2]/td[3]/span/text()")[0].encode("utf-8")).strip() \
                            .replace(" ", "").replace("个月", "")
                        # the period cell is quoted in 个月 (months), so the unit is months
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 14
    url = "http://www.licaifan.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='main-list tab-con2']/li[1]/table/tr")
        if len(loans) > 0:
            # the first row is the table header, so start from 1
            for i in range(1, len(loans)):
                # skip fully invested loans ("投资满额")
                if str(loans[i].xpath("td[last()]/a/text()")[0].encode("utf-8")) == "投资满额":
                    continue
                href = str(loans[i].xpath("td[1]/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.licaifan.com" + href
                        loan_obj.title = str(loans[i].xpath("td[1]/h3/a/text()")[0].encode("utf-8")).strip()
                        loan_obj.borrow_amount = str(loans[i].xpath("td[3]/text()")[0].encode("utf-8")).strip().replace(",", "")
                        if loan_obj.borrow_amount.find("万") > 0:
                            loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                        loan_obj.rate = str(loans[i].xpath("td[2]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        period = str(loans[i].xpath("td[4]/text()")[0].encode("utf-8")).strip()
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 26
    url = "http://www.longlongweb.com/invests"
    request_headers = {'Referee': "http://www.longlongweb.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="utf-8")
        loans = loan_htm_parse.xpath("//div[@class='main01']/span/dl")
        if len(loans) > 0:
            for loan in loans:
                # skip entries whose first paragraph carries no link
                if not lxml.html.tostring(loan.xpath("dd/p[1]")[0]).find("href") > 0:
                    continue
                href = str(loan.xpath("dd/table/tr[1]/td[1]/a/@href")[0])
                original_id = href.split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))\
                        .replace(",", "").replace("¥", "").strip()
                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0]
                                        .encode("utf-8")).replace(",", "").replace("¥", "").strip()
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_obj.title = str(loan.xpath("dd/table/tr[1]/td[1]/a/@title")[0].encode("utf-8"))
                    loan_obj.rate = str(loan.xpath("dd/table/tr[2]/td[2]/span/text()")[0].encode("utf-8"))\
                        .replace("%", "")
                    loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))\
                        .replace(",", "").replace("¥", "").strip()
                    loan_obj.period = str(loan.xpath("dd/table/tr[2]/td[3]/span/text()")[0])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0]
                                        .encode("utf-8")).replace(",", "").replace("¥", "").strip()
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.repayment = str(loan_info_htm_parse.xpath("//div[@class='enterprise-botton']/span[2]/text()")[0]
                                             .encode("utf-8")).strip().replace("还款方式:", "")
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 20
    url = "http://www.xiaomabank.com/finance.do"
    request_headers = {'Referee': "http://www.xiaomabank.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='pil_main']/table[@class='pil_table']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))\
                        .replace("width:", "").strip().replace("%;", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.xiaomabank.com/" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[5]/strong/text()")[0].encode("utf-8")).strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).replace("个月", "").strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))\
                        .replace("width:", "").strip().replace("%;", "")

                    # note: the detail page comes back gzip-compressed and must be decompressed
                    resp = urllib2.urlopen(loan_obj.href)
                    respInfo = resp.info()
                    if ("Content-Encoding" in respInfo) and (respInfo['Content-Encoding'] == "gzip"):
                        respHtml = zlib.decompress(resp.read(), 16 + zlib.MAX_WBITS)
                        info_htm_parse = parse_html(respHtml, encoding="utf-8")
                        loan_obj.repayment = str(info_htm_parse.xpath("//div[@id='pi_lt_bottom']/div[1]/div[1]/a/text()")[0].encode("utf-8"))

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 15
    url = "https://www.iqianbang.com/invest"
    request_headers = {'Referee': "https://www.iqianbang.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='item']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                # keep only loans that are still raising funds ("融资中")
                if str(loan.xpath("td[7]/text()")[0].encode("utf-8")).strip() != "融资中":
                    continue
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("-")[3].replace(".shtml", "")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.iqianbang.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "").replace("元", "")
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = str(loan.xpath("td[2]/span/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")

                    # need to visit the detail page for the repayment method
                    loan_info_htm = download_page(loan_obj.href,
                                                  headers={'Referee': url, 'User-Agent': DEFAULT_UA})
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="UTF-8")
                    loan_obj.repayment = str(loan_info_htm_parse.xpath("//div[@class='inright']/table[@class='idetable']")[0]
                                             .xpath("tr[2]/td[2]/span/text()")[0].encode("utf-8")).strip()

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 25
    url = "http://www.tuandai.com/pages/ajax/invest_list.ashx?Cmd=GetInvest_List" \
          "&RepaymentTypeId=0&pagesize=5&pageindex=%s&type=3&status=1&DeadLine=0" \
          "&beginDeadLine=0&endDeadLine=0&rate=0&beginRate=0&endRate=0&strkey=&orderby=0"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page((url % 1), request_headers)
        obj = loads(htm, encoding="utf-8")
        total = int(obj["total"])
        if total > 0:
            # 5 loans per page; round the page count up
            page = total / 5
            if total % 5 > 0:
                page += 1
            for p in range(1, page + 1):
                htm = download_page((url % p), request_headers)
                htm_obj = loads(htm, encoding="utf-8")
                loans = htm_obj["projectList"]
                for loan in loans:
                    original_id = loan["ID"]
                    href = "http://www.tuandai.com/pages/invest/jing_detail.aspx?id=" + original_id
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = href
                        loan_obj.title = loan["Title"]
                        loan_obj.borrow_amount = loan["TotalAmount"]
                        loan_obj.rate = loan["YearRate"]
                        loan_obj.period = loan["Deadline"]
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.repayment = loan["RepaymentTypeDesc"]
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 4
    url = "https://www.yinhu.com/loan/loan_list.bl"
    request_headers = {'Referee': "https://www.yinhu.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()
    # ids to take offline
    off_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@id='loan_list']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/p/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")
                try:
                    loan_status = str(loan.xpath("td[last()]/em/span/text()")[0].encode("utf-8")).strip()
                except:
                    loan_status = str(loan.xpath("td[last()]/a/span/text()")[0].encode("utf-8")).strip()

                if original_id and loan_status != "还款中":
                    online_ids_set.add(original_id)

                if loan_status == "还款中" or loan_status == "满标":
                    if original_id in db_ids_set:
                        off_ids_set.add(original_id)
                    continue

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/div[@class='bar_bg']/div/span/span/text()")[0].encode("utf-8"))\
                        .strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.yinhu.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/p/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip().replace(",", "")\
                        .replace("元", "")
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip()
                    period = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/div[@class='bar_bg']/div/span/span/text()")[0].encode("utf-8"))\
                        .strip().replace("%", "")
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline; merge with the
        # repaying/full loans collected above instead of overwriting them
        off_ids_set |= db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 19
    url = "https://www.qian360.com/bq/queryProductList.html?currentPage=1&pernum=12&type=0"
    request_headers = {'Referee': "https://www.qian360.com/tl/select.html", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")

        if loans_json["list"]:
            for i in range(0, len(loans_json["list"])):
                if int(loans_json["list"][i]["status"]) != 1:
                    continue
                original_id = str(loans_json["list"][i]["borrowId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.qian360.com/ti/detail.html?borrowId=%s" % original_id
                    loan_obj.title = loans_json["list"][i]["name"]
                    loan_obj.rate = str(loans_json["list"][i]["apr"])
                    loan_obj.period = str(loans_json["list"][i]["totalPeriod"])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.borrow_amount = str(int(loans_json["list"][i]["account"]))
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 13
    url = "http://www.yirendai.com/loan/list/1?period=12&currRate=&amt=&progress="
    request_headers = {'Referee': "http://www.yirendai.com/loan/list/1", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='bidList']/li")
        if len(loans) > 0:
            for loan in loans:
                # skip entries without the bid_empty_errortip node
                if not loan.xpath("div[2]/div[2]/div/div[@class='bid_empty_errortip']"):
                    continue
                href = str(loan.xpath("div[2]/div/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.yirendai.com" + href
                    loan_obj.title = str(loan.xpath("div[2]/div/h3/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("div[2]/div/div[3]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period = str(loan.xpath("div[2]/div/div[4]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 2
    url = "http://www.ppdai.com/lend/12_s1_p1"
    request_headers = {'Referee': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='fen_ye_nav']/table/tr/td[last()]/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")
        for p in range(1, int(page) + 1):
            url = "http://www.ppdai.com/lend/12_s1_p" + str(p)
            logger.info("page url: %s", url)
            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath("//div[@class='lend_nav']/table/tr")
            if len(loans) > 0:
                for loan in loans:
                    # skip the header row
                    if lxml.html.tostring(loan).find("tit_nav") > 0:
                        continue
                    href = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@href")[0])
                    original_id = href.split("/")[2].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("td[last()]/p[1]/text()")[0].encode("UTF-8")).strip()\
                            .replace(" ", "").replace("%", "").split("完成")[1]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id)
                        loan_obj.original_id = original_id
                        loan_obj.href = "http://www.ppdai.com" + href
                        loan_obj.title = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@title")[0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(loan.xpath("td[4]/text()")[0]).strip().replace("%", "")
                        period = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = float(str(loan.xpath("td[last()]/p[1]/text()")[0].encode("UTF-8")).strip()
                                                  .replace(" ", "").replace("%", "").split("完成")[1])
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 11
    url = "https://www.tzydb.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//div[@id='proList']/ul[@class='item_li']")
        if len(loans) > 0:
            for loan in loans:
                schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip()
                if schedule == "100%" or schedule == "100.0%":
                    # skip loans that have already finished
                    continue
                # link format: https://www.tzydb.com/boot/lookup/971,1017
                a_script = str(loan.xpath("li/div[1]/div[1]/div/a/@href")[0].encode("utf-8"))
                o_id = ID_RE.findall(a_script)[0]
                original_id = o_id.replace(",", "-")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8"))\
                        .strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.tzydb.com/boot/lookup/" + o_id
                    loan_obj.title = str(loan.xpath("li/div[1]/div[1]/div/a/text()")[0].encode("utf-8"))
                    loan_obj.borrow_amount = str(loan.xpath("li/div[2]/div[1]/span/text()")[0].encode("utf-8")).strip()\
                        .replace(" ", "").replace(",", "")
                    loan_obj.period = str(loan.xpath("li/div[2]/div[3]/span/text()")[0].encode("UTF-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.rate = str(loan.xpath("li/div[2]/div[2]/span/text()")[0]).strip().replace("%", "")
                    loan_obj.schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8"))\
                        .strip().replace("%", "")
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 18
    url = "https://www.my089.com/Loan/default.aspx?pid=1"
    request_headers = {'Referee': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='yema rt']/span[@class='z_page']/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")
        for p in range(1, int(page) + 1):
            url = "https://www.my089.com/Loan/default.aspx?pid=" + str(p)
            logger.info("page url: %s", url)
            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath("//div[@class='Loan_box']/dl[@class='LoanList']")
            if len(loans) > 0:
                for loan in loans:
                    if str(loan.xpath("dd[last()]/p/span/text()")[0]) == "100%":
                        continue
                    href = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@href")[0])
                    original_id = href.split("=")[1].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "https://www.my089.com/Loan/" + href
                        loan_obj.title = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@title")[0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("dd[4]/span/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(loan.xpath("dd[3]/span/text()")[0].encode("UTF-8")).strip().replace("%/年", "")
                        loan_obj.period = str(loan.xpath("dd[5]/span/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                        # the trailing text holds "<period unit>/<repayment method>"
                        s = str(loan.xpath("dd[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("个", "")
                        loan_obj.period_unit = s.split("/")[0].strip()
                        loan_obj.repayment = s.split("/")[1].strip()
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 16
    # url = "http://www.itouzi.com/dinvest/invest/index"
    url = "http://www.itouzi.com/dinvest/debt/index"
    request_headers = {'Referee': "http://www.itouzi.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        # note: the ul class attribute ends with a trailing space
        loans = loan_htm_parse.xpath("//ul[@class='invest-product-case-list mtn btn clearfix ']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath("div[@class='i-p-c-subscription']/ul[@class='i-p-c-s-detail']"):
                    continue
                href = str(loan.xpath("h2/a[@class='fl']/@href")[0])
                original_id = href.split("id=")[1]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)
                    # loan_obj = Loan(company_id, original_id)
                    # loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    # loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.itouzi.com" + href
                    loan_obj.title = str(loan.xpath("h2/a[@class='fl']/text()")[0].encode("utf-8")).strip()
                    loan_obj.repayment = str(loan.xpath("p/span[2]/text()")[0].encode("utf-8")).strip().replace("还款方式:", "")
                    loan_obj.borrow_amount = int(loan.xpath("p/span[3]/strong/text()")[0]) * 10000
                    loan_obj.rate = str(loan.xpath("p/span[5]/em[1]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    period = str(loan.xpath("p/span[4]/strong/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    # the schedule parsing here is unverified; re-check once a live loan is listed
                    if loan.xpath("div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']"):
                        loan_obj.schedule = str(loan.xpath("div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']"
                                                           "/span[1]/span[last()]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        print loan_obj.schedule
                    # loan_obj.db_create(db)

        # logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        # off_ids_set = db_ids_set - online_ids_set
        # if off_ids_set:
        #     loan_obj = Loan(company_id)
        #     loan_obj.db_offline(db, off_ids_set)
        #     logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 3
    url = "http://www.91wangcai.com/invest/index.html"
    request_headers = {'Referee': "http://www.91wangcai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="gb2312")
        loans = loan_htm_parse.xpath("//div[@class='proBoxNew']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='hd']/a/@href")[0])
                original_id = href.split(".")[0].split("/")[2].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312")))\
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.91wangcai.com" + href
                    loan_obj.title = autodecode(str(loan.xpath("div[@class='hd']/a/text()")[0].encode("gb2312"))).encode("utf-8")
                    loan_obj.borrow_amount = autodecode(str(loan.xpath("div[@class='bd']/table/tr[1]/td[1]/em/text()")[0].encode("gb2312")))\
                        .encode("utf-8").replace("¥", "")
                    loan_obj.rate = str(loan.xpath("div[@class='bd']/table/tr[1]/td[2]/em/text()")[0]).strip().replace("%", "")
                    # the period cell mixes markup; strip the <em> tags and unescape entities
                    loan_period_text = lxml.html.tostring(loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0])\
                        .replace("<em>", "").replace("</em>", "")
                    html_parser = HTMLParser.HTMLParser()
                    period = html_parser.unescape(loan_period_text).encode("utf-8").strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[1]/text()")[0].encode("gb2312")))\
                        .encode("utf-8").replace("还款方式:", "")
                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312")))\
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    request_headers = {'Referee': "http://www.jimubox.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/@href")[0])
                if not href.find("Index") > 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']"
                                                       "/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']"
                                                   "/p[@class='project-info']/span[@class='project-current-money']/text()")[0]
                                        .encode("utf-8")).strip().replace("/", "").replace(",", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']"
                                                    "/h4/a/text()")[0].encode("utf-8"))
                    loan_obj.description = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']"
                                                          "/p[@class='project-detail']/text()")[0].encode("utf-8")).strip()
                    # the listed sum is in units of 万 (10,000 yuan), hence the appended zeros
                    loan_obj.borrow_amount = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']"
                                                            "/p[@class='project-info']/span[@class='project-sum-money']/text()")[0]
                                                 .encode("utf-8")).strip() + "0000"
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']"
                                                   "/p[@class='project-info']/span[@class='project-current-money']/text()")[0]
                                        .encode("utf-8")).strip().replace("/", "").replace(",", "")
                    rate = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']"
                                          "/div[@class='project-other']/div[@class='project-other-left']/span/text()")[0]
                               .encode("utf-8")).strip()
                    # rates like "10+2" are given in two parts; sum them
                    if rate.find("+") > 0:
                        rate_list = rate.split("+")
                        loan_obj.rate = str(float(rate_list[0]) + float(rate_list[1]))
                    else:
                        loan_obj.rate = rate
                    loan_obj.repayment = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']"
                                                        "/h6/span/text()")[0].encode("utf-8"))
                    loan_obj.period = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']"
                                                     "/div[@class='project-other']/div[@class='project-other-right']/span/text()")[0]
                                          .encode("utf-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']"
                                                       "/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if loan.xpath("td[last()]/a/@href")[0].encode("utf-8") == "javascript:;":
                    # skip loans that have already finished
                    continue
                href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
                original_id = href.split("id%3D")[1].encode("utf-8").strip()
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.xinhehui.com" + href
                    title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
                    if loan.xpath("td[1]/p[1]/a/em"):
                        title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
                    else:
                        title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.title = title_1 + title_2
                    borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "")
                    if borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
                    else:
                        loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", ""))
                    if loan.xpath("td[4]/span"):
                        period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
                    else:
                        period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
                    loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 24
    url = "http://www.he-pai.cn/investmentDetail/investmentDetails/ajaxInvmentList.do?pageNo=1"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["list"]
        if len(loans) > 0:
            for loan in loans:
                if str(loan["bID_SCHD"]) == "100":
                    # skip loans that have already finished
                    continue
                original_id = loan["lN_NO"]
                href = "http://www.he-pai.cn/investmentDetail/memberCenter/transferView.do?ln_no=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = loan["lN_NM"]
                    loan_obj.borrow_amount = loan["lN_AMT"]
                    loan_obj.rate = loan["lN_RATE"]
                    loan_obj.period = loan["lN_TERM"]
                    loan_obj.period_unit = loan["lN_TERM_UNIT_DESC"]
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.repayment = loan["pAY_METH_DESC"]
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())