def main(args):
    db = Database()
    if args['initialize']:
        samples = get_input()
        for sample in samples:
            loan = Loan(sample)
            loan.generatePaymentSchedule(months=3)
        print("Finished populating db!")
    elif args['debits']:
        debits = db.queryLoanDebits(args['debits'])
        if len(debits) == 0:
            print(f"No debits found for loan {args['debits']}")
        else:
            for debit in debits:
                debObj = Debit(*list(debit))
                pprint(vars(debObj))
    elif args['payments']:
        payments = db.queryLoanPayments(args['payments'])
        if len(payments) == 0:
            print(f"No payments found for loan {args['payments']}")
        else:
            for payment in payments:
                payObj = Payment(*list(payment))
                pprint(vars(payObj))
    elif args['payment_debits']:
        debits = db.queryPaymentDebits(args['payment_debits'])
        if len(debits) == 0:
            print(f"No debits found for payment {args['payment_debits']}")
        else:
            for debit in debits:
                debObj = Debit(*list(debit))
                pprint(vars(debObj))
    else:
        print("No action specified!")
def test_new_loan(self):
    tempLoan = Loan(
        1000000,
        'Network 1',
        'Mar',
        'Loan Product 1',
        1000.50,
    )
    loanProcessor = LoanProcessor(tempLoan)
    newLoan = Loan(
        1000001,
        'Network 1',
        'Mar',
        'Loan Product 1',
        2000.50,
    )
    loanProcessor.processNewLoan(newLoan)
    self.assertEqual(3001, loanProcessor.getAggregateAmount())
    newLoan = Loan(
        1000002,
        'Network 3',
        'Mar',
        'Loan Product 1',
        2000.50,
    )
    loanProcessor.processNewLoan(newLoan)
    self.assertEqual(2000.50, loanProcessor.getAggregateAmount())
def test_reject_loan():
    loan = Loan(amount=100_000)
    assert not reject_loan(loan).rejected()
    loan = Loan(amount=250_001)
    assert reject_loan(loan).rejected()
    loan = Loan(amount=250_000)
    assert not reject_loan(loan).rejected()
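# A minimal sketch consistent with the assertions in test_reject_loan above:
# amounts strictly above 250,000 are rejected, while 250,000 itself passes.
# The threshold constant and the attribute names are assumptions for
# illustration, not the original project's API.
REJECTION_THRESHOLD = 250_000


class Loan:
    def __init__(self, amount):
        self.amount = amount
        self._rejected = False

    def rejected(self):
        return self._rejected


def reject_loan(loan):
    # Reject only when the amount strictly exceeds the threshold.
    loan._rejected = loan.amount > REJECTION_THRESHOLD
    return loan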
class After(tk.Frame):
    def __init__(self, master):
        # pass master through, otherwise the frame attaches to the default root
        super(After, self).__init__(master)
        self.logout()
        self.create_acc()
        self.acc_detail()
        self.acc_balance()
        self.loan()
        self.report()
        self.service()
        self.withdraw()
        self.calculator()

    def logout(self):
        self.log = Out(self)
        self.log.place(x=1000, y=100)

    def create_acc(self):
        self.create = Create(self)
        self.create.place(x=400, y=200)

    def acc_detail(self):
        self.detail = Account_Detail(self)
        self.detail.place(x=600, y=200)

    def acc_balance(self):
        self.balance = Account_Balance(self)
        self.balance.place(x=800, y=200)

    def loan(self):
        self.info_loan = Loan(self)
        self.info_loan.place(x=1000, y=200)

    def report(self):
        self.rep = Report(self)
        self.rep.place(x=400, y=400)

    def service(self):
        self.serv = Service(self)
        self.serv.place(x=600, y=400)

    def withdraw(self):
        self.withd = Withdrawal(self)
        self.withd.place(x=800, y=400)

    def calculator(self):
        self.cal = Calculator(self)
        self.cal.place(x=1000, y=400)
def test_capital_revolver(self):
    start = datetime(2003, 11, 20)
    expiry = datetime(2006, 11, 20)
    revolver = Loan.new_revolver(self.LOAN_AMOUNT, start, expiry, self.HIGH_RISK_RATING)
    self.assertEqual(3, revolver.duration())
    self.assertEqual(315, revolver.capital())
def test_capital_advised_line(self):
    start = datetime(2003, 11, 20)
    expiry = datetime(2005, 11, 20)
    advised_line = Loan.new_advised_line(self.LOAN_AMOUNT, start, expiry, self.HIGH_RISK_RATING)
    self.assertEqual(2, advised_line.duration())
    self.assertEqual(21, advised_line.capital())
def processInput(line):
    network, product, month_year, amount = line.split(',')
    global loanProcessor
    newLoan = Loan('', network, month_year, product, amount)
    if loanProcessor is not None:
        loanProcessor.processNewLoan(newLoan, reducer=True)
    else:
        loanProcessor = LoanProcessor(newLoan)
def retrieve_loan():
    ssn = construct_random_ssn()
    borrowerName = random.choice(firstNames) + " " + random.choice(lastNames)
    salary = random.randrange(4000, 10000)
    generatedLoan = Loan(borrowerName, ssn, salary)
    return generatedLoan
def test_loan_aggregation(self):
    tempLoan = Loan(
        1000000,
        'Network 1',
        'Mar',
        'Loan Product 1',
        1000.50,
    )
    loanProcessor = LoanProcessor(tempLoan)
    newLoan = Loan(
        1000000,
        'Network 1',
        'Mar',
        'Loan Product 1',
        2000.50,
    )
    loanProcessor.aggregate(newLoan)
    self.assertEqual(3001, loanProcessor.getAggregateAmount())
def test_if_loan_can_be_aggregated(self):
    tempLoan = Loan(
        1000000,
        'Network 1',
        'Mar',
        'Loan Product 1',
        1000.50,
    )
    loanProcessor = LoanProcessor(tempLoan)
    tempLoan = Loan(
        1000000,
        'Network 1',
        'Mar',
        'Loan Product 1',
        1000.50,
    )
    self.assertTrue(loanProcessor.canBeAggregated(tempLoan))
    tempLoanWithDifferentNetwork = Loan(
        1000000,
        'Network 2',
        'Mar',
        'Loan Product 1',
        1000.50,
    )
    self.assertFalse(
        loanProcessor.canBeAggregated(tempLoanWithDifferentNetwork))
    tempLoanWithDifferentProduct = Loan(
        1000000,
        'Network 1',
        'Mar',
        'Loan Product 2',
        1000.50,
    )
    self.assertFalse(
        loanProcessor.canBeAggregated(tempLoanWithDifferentProduct))
    tempLoanWithDifferentMonth = Loan(
        1000000,
        'Network 1',
        'Apr',
        'Loan Product 1',
        1000.50,
    )
    self.assertFalse(
        loanProcessor.canBeAggregated(tempLoanWithDifferentMonth))
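# The assertions in test_if_loan_can_be_aggregated above imply the aggregation
# key: two loans can be aggregated only when network, month, and product all
# match, while the id and amount may differ. A sketch of that check under
# assumed attribute names (the Loan constructor here is positional, so the
# real names may differ):
class LoanProcessor:
    def __init__(self, loan):
        self.loan = loan
        self.aggregate_amount = loan.amount

    def canBeAggregated(self, other):
        # Compare the three grouping fields; ignore msisdn/id and amount.
        return (self.loan.network == other.network
                and self.loan.month_year == other.month_year
                and self.loan.product == other.product)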
def getLoan(self, principal, interestRate, annualPayments, duration,
            startDate, loanPaymentExtra):
    """ Allocates a loan to the profile. """
    self.hasLoan = True
    self.noLoans += 1
    # print(f'Number of loans: {self.noLoans}')
    loanID = str(self.noLoans)
    self.loanDict[loanID] = Loan(loanID, principal, interestRate,
                                 annualPayments, duration, startDate,
                                 loanPaymentExtra)
class LoanServer(loan_pb2_grpc.LoanServicer):
    """ docstring """

    def __init__(self):
        self.loan = None

    def save_loan(self, request, context):
        email = request.email
        interest_rate = request.interest_rate
        repayment_terms = request.repayment_terms
        loan_amount = request.loan_amount
        self.loan = Loan(email, interest_rate, repayment_terms, loan_amount)
        self.loan.save_loan()
        response = loan_pb2.empty()
        return response

    def show_installment(self, request, context):
        email = request.email
        response = loan_pb2.installement_response()
        response.installement_message = self.loan.show_installment(email)
        return response

    def repayment(self, request, context):
        email = request.email
        repayment_amount = request.repayment_amount
        self.loan.repayment(email, repayment_amount)
        response = loan_pb2.empty()
        return response
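# A hypothetical client call against the LoanServer above. The .proto file is
# not shown, so the stub name `LoanStub` and the request message
# `save_loan_request` are assumptions inferred from the fields the servicer
# reads (email, interest_rate, repayment_terms, loan_amount).
import grpc

import loan_pb2
import loan_pb2_grpc

channel = grpc.insecure_channel("localhost:50051")
stub = loan_pb2_grpc.LoanStub(channel)
stub.save_loan(loan_pb2.save_loan_request(
    email="user@example.com",
    interest_rate=5.0,
    repayment_terms=12,
    loan_amount=10000.0,
))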
def test_capital_term_loan(self):
    start = datetime(2003, 11, 20)
    maturity = datetime(2006, 11, 20)
    term_loan = Loan.new_term_loan(self.LOAN_AMOUNT, start, maturity, self.HIGH_RISK_RATING)
    term_loan.payment(1000.00, datetime(2004, 11, 20))
    term_loan.payment(1000.00, datetime(2005, 11, 20))
    term_loan.payment(1000.00, datetime(2006, 11, 20))
    self.assertEqual(2, term_loan.duration())
    self.assertEqual(210, term_loan.capital())
def map_loan_row(self, row):
    return Loan(loan_id=row['id'],
                grade=row['grade'],
                int_rate=row['int_rate'],
                term=row['term'],
                amount=row['funded_amnt'],
                issue_date=row['issue_d'],
                last_date=row['last_pymnt_d'],
                investment=self.buy_size,
                defaults=row['defaulted'],
                total_payment=row['total_pymnt'],
                total_principle=row['total_rec_prncp'],
                recoveries=row['recoveries'])
def disponibilizar_livro(usuario):
    id_livro = int(input("ENTER THE ID OF THE BOOK TO BE MADE AVAILABLE: "))
    local = input("ENTER THE LOCATION WHERE THE BOOK WILL BE MADE AVAILABLE: ")
    livro = lista_book(id_livro, usuario)
    if len(livro) > 0:
        novo_loan = Loan(0, usuario.id, livro[0][0], local, 'DISPONIVEL')
        sql_loan = (f"INSERT INTO tb_loan (id_user, id_book, begin_date, "
                    f"coletion_location, status) values('{usuario.id}', "
                    f"'{novo_loan.id_book}', null, "
                    f"'{novo_loan.collect_location}', '{novo_loan.status}')")
        try:
            exec_command(sql_loan)
            print("BOOK MADE AVAILABLE SUCCESSFULLY!")
        except Exception:
            print("FAILED TO MAKE BOOK AVAILABLE.")
    else:
        print("THE CHOSEN ID IS NOT VALID!")
def processInput(line):
    msisdn, network, loan_date, product, amount = line.split(',')
    msisdn = msisdn.strip("'")
    network = network.strip("'").strip().lower()
    loan_date = loan_date.strip("'")
    product = product.strip("'").strip().lower()
    amount = amount.strip("'")
    if Loan.isAmountValid(amount) and Loan.isDateValid(loan_date):
        month_year = Loan.extractMonthYearFromDate(loan_date)
        tempLoan = Loan(msisdn, network, month_year, product, amount)
        sys.stdout.write(tempLoan.display() + '\n')
def main(lenders_info, amount):
    """
    Takes in data from user and feeds classes.
    Captures errors and prints to user
    :param lenders_info: csv file from user
    :param amount: amount to borrow input from user
    :return: Prints Request Amount, Rate, Total & Monthly Repayments to screen
    """
    try:
        lenders = Lenders(lenders_info)
        selected_lenders = lenders.get_lenders(amount)
        loan_offer = Loan(selected_lenders, amount)
        loan_offer.input_validation()
        loan_offer.calc_offer()
        loan_offer.print_output()
    except IOError:
        print("No market data found for file specified")
    except Exception as error:
        print(error)
    else:
        return loan_offer
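# An illustrative driver for main() above; "market.csv" and the amount 1000
# are made-up inputs, and this CLI wiring is an assumption rather than the
# original project's entry point.
if __name__ == "__main__":
    offer = main("market.csv", 1000)
    if offer is not None:
        print("Loan offer calculated successfully")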
def main():
    a = Asset(1000)
    normal_loan = Loan(12, 0.3, 1000, a)
    memoizable_loan = MemoizedLoan(12, 0.3, 1000, a)
    print("First normal loan time cost:")
    normal_loan.interestDue(10)
    print("Second normal loan time cost:")
    normal_loan.interestDue(10)
    print("First memoizable loan time cost:")
    memoizable_loan.interestDue(10)
    print("Second memoizable loan time cost:")
    memoizable_loan.interestDue(10)
def init_loan(self):
    """
    Open a loan with third party lending platform. We use Poloniex for
    this demo. Future versions can include integrations with multiple
    platforms, and the ability to automatically select the optimal
    (most stable and profitable) asset and lending provider.

    Loan class has a LoanAgent which cancels old loan offers, turns
    auto-renew off on active loans, and creates new loan offers at fair
    price (fair = average of the lowest three loan offers).
    """
    # Loan(logger, api, asset, deposit, duration, agent)
    return Loan(
        self.logger,
        self.LENDER_API,
        self.LOAN_ASSET,
        self.initial_deposit,
        self.duration,
        LoanAgent(
            self.logger,
            self.LENDER_API,
            {self.LOAN_ASSET: self.MIN_LOAN},  # e.g. {"USDC": 50}
            self.initial_deposit))  # e.g. 50
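# The docstring above defines the fair price as the average of the three
# lowest open loan offers. A self-contained sketch of that rule; the helper
# name and the list-of-rates input shape are assumptions for illustration.
def fair_price(offer_rates):
    lowest_three = sorted(offer_rates)[:3]
    return sum(lowest_three) / len(lowest_three)


# Offers at 0.9%, 1.1%, 1.0% and 2.5% give a fair price of 1.0%.
assert abs(fair_price([0.009, 0.011, 0.010, 0.025]) - 0.010) < 1e-12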
def crawl():
    company_id = 22
    url = "https://www.weidai.com.cn/?m=Biao&t=today&pageIndex=1&pageSize=8&sortField=b.verify_time&sortOrder=desc&data=null"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["data"]
        if len(loans) > 0:
            for loan in loans:
                if str(loan["borrow_account_scale"]) == "100.00":
                    # skip loans that have already finished
                    continue
                original_id = loan["uid"]
                href = "https://www.weidai.com.cn/page/borinfo.html?uid=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan["borrow_account_scale"])
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = loan["name"]
                    loan_obj.borrow_amount = loan["account"]
                    loan_obj.rate = loan["borrow_apr"]
                    loan_obj.period = loan["borrow_period"]
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan["borrow_account_scale"])
                    loan_obj.cast = loan["borrow_account_yes"]
                    loan_obj.db_create(db)
            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 4
    url = "https://www.yinhu.com/loan/loan_list.bl"
    request_headers = {'Referee': "https://www.yinhu.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()
    # ids to take offline
    off_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@id='loan_list']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/p/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")
                try:
                    loan_status = str(loan.xpath("td[last()]/em/span/text()")[0].encode("utf-8")).strip()
                except:
                    loan_status = str(loan.xpath("td[last()]/a/span/text()")[0].encode("utf-8")).strip()

                if original_id and loan_status != "还款中":
                    online_ids_set.add(original_id)

                if loan_status == "还款中" or loan_status == "满标":
                    if original_id in db_ids_set:
                        off_ids_set.add(original_id)
                    continue

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/div[@class='bar_bg']/div/span/span/text()")[0].encode("utf-8"))\
                        .strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.yinhu.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/p/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip().replace(",", "")\
                        .replace("元", "")
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip()
                    period = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/div[@class='bar_bg']/div/span/span/text()")[0].encode("utf-8"))\
                        .strip().replace("%", "")
                    loan_obj.db_create(db)
            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def main():
    print('========== Exercise 2.1.3 ==========\n')
    myLoan = Loan(360, .025, 100000)
    print("Monthly Payment: {}".format(myLoan.monthlyPayment()))
    t = timer()
    t.start()
    print("Balance after 60 periods: {}".format(myLoan.balance(60)))
    t.end()
    t.start()
    print('Balance in period 60 computed recursively {}'.format(myLoan.balanceRecursive(60, myLoan.face)))
    t.end()
    t.start()
    print("Interest due on period 60: {}".format(myLoan.interestDue(60)))
    t.end()
    t.start()
    print('Interest in period 60 computed recursively {}'.format(myLoan.interestDueRecursive(60, myLoan.face)))
    t.end()
    t.start()
    print("Principal due on period 60: {}".format(myLoan.principlaDue(60)))
    t.end()
    t.start()
    print('Principal in period 60 computed recursively {}'.format(myLoan.principalDueRecursive(60, myLoan.face)))
    t.end()
    """
    On my system, in both instances the direct and recursive versions of the
    function run too fast to come up with a time besides 0. However, I know
    that the recursive function is likely much slower.
    """
    print("The total payment should equal interest plus principal which is {}".format(myLoan.interestDue(5) + myLoan.principlaDue(5)))
    print("Total Interest paid is {}".format(myLoan.totalInterest()))
    print("Total Payment is {}".format(myLoan.totalPayments()))
    print("Total Interest paid is {}".format(myLoan.totalInterest()))
    print("Old rate {}".format(myLoan.rate))
    print("Old term {}".format(myLoan.term))
    print("Old face {}".format(myLoan.face))
    myLoan.rate = .035
    myLoan.term = 60
    myLoan.face = 20000
    print("New rate {}".format(myLoan.rate))
    print("New term {}".format(myLoan.term))
    print("New face {}".format(myLoan.face))
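# The note inside main() above observes that the custom timer reports 0
# because both variants finish too quickly to measure in a single call. A
# standard-library alternative that repeats the call; using timeit here is an
# assumption, not part of the original exercise.
import timeit

loan = Loan(360, .025, 100000)
direct = timeit.timeit(lambda: loan.interestDue(60), number=1000)
recursive = timeit.timeit(lambda: loan.interestDueRecursive(60, loan.face), number=1000)
print("direct: {:.6f}s, recursive: {:.6f}s per 1000 calls".format(direct, recursive))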
def crawl():
    company_id = 19
    url = "https://www.qian360.com/bq/queryProductList.html?currentPage=1&pernum=12&type=0"
    request_headers = {'Referee': "https://www.qian360.com/tl/select.html", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")

        if loans_json["list"]:
            for i in range(0, len(loans_json["list"])):
                if int(loans_json["list"][i]["status"]) != 1:
                    continue
                original_id = str(loans_json["list"][i]["borrowId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.qian360.com/ti/detail.html?borrowId=%s" % original_id
                    loan_obj.title = loans_json["list"][i]["name"]
                    loan_obj.rate = str(loans_json["list"][i]["apr"])
                    loan_obj.period = str(loans_json["list"][i]["totalPeriod"])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.borrow_amount = str(int(loans_json["list"][i]["account"]))
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_create(db)
            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 9
    url = "https://list.lufax.com/list/service/product/fuying-product-list/listing/1"
    request_headers = {'Referee': "https://list.lufax.com/list/listing/fuying", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        loan_num = loans_json["totalCount"]
        if loans_json and loan_num:
            for i in range(0, loan_num):
                original_id = str(loans_json["data"][i]["productId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://list.lufax.com/list/productDetail?productId=%s" % original_id
                    loan_obj.title = loans_json["data"][i]["productNameDisplay"]
                    loan_obj.rate = str(float(loans_json["data"][i]["interestRate"]) * 100)
                    period = str(loans_json["data"][i]["investPeriodDisplay"].encode("utf-8"))
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = loans_json["data"][i]["collectionModeDisplay"]
                    loan_obj.borrow_amount = str(int(loans_json["data"][i]["price"]))
                    loan_obj.schedule = str(float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_create(db)
            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 21
    url = "http://www.id68.cn/invest/index/borrow_status/9/p/1.html"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//ul[@class='ideal_con']/li")
        if len(loans) > 0:
            for loan in loans:
                if str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]) == "100.00%":
                    # skip loans that have already finished
                    continue
                href = str(loan.xpath("table/tr[1]/td[1]/a/@href")[0].encode("utf-8"))
                original_id = href.replace(".html", "").split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = REFEREE + href
                    loan_obj.title = str(loan.xpath("table/tr[1]/td[1]/a/@title")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("table/tr[2]/td[last()]/span/text()")[0].encode("utf-8"))\
                        .strip().replace(" ", "").replace(",", "")
                    loan_obj.repayment = str(loan.xpath("table/tr[2]/td[4]/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.rate = str(loan.xpath("table/tr[2]/td[2]/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("table/tr[2]/td[3]/span/text()")[0].encode("utf-8")).strip()\
                        .replace(" ", "").replace("个月", "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")
                    loan_obj.db_create(db)
            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    request_headers = {'Referee': "http://www.jimubox.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/@href")[0])
                if not href.find("Index") > 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/text()")[0].encode("utf-8"))
                    loan_obj.description = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-detail']/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-sum-money']/text()")[0].encode("utf-8"))\
                        .strip() + "0000"
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    rate = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-left']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    if rate.find("+") > 0:
                        rate_list = rate.split("+")
                        loan_obj.rate = str(float(rate_list[0]) + float(rate_list[1]))
                    else:
                        loan_obj.rate = rate
                    loan_obj.repayment = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h6/span/text()")[0].encode("utf-8"))
                    loan_obj.period = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-right']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.db_create(db)
            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 18
    url = "https://www.my089.com/Loan/default.aspx?pid=1"
    request_headers = {'Referee': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='yema rt']/span[@class='z_page']/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")
        for p in range(1, int(page) + 1):
            url = "https://www.my089.com/Loan/default.aspx?pid=" + str(p)
            logger.info("page url: %s", url)
            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath("//div[@class='Loan_box']/dl[@class='LoanList']")
            if len(loans) > 0:
                for loan in loans:
                    if str(loan.xpath("dd[last()]/p/span/text()")[0]) == "100%":
                        continue
                    href = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@href")[0])
                    original_id = href.split("=")[1].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "https://www.my089.com/Loan/" + href
                        loan_obj.title = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@title")[0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("dd[4]/span/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(loan.xpath("dd[3]/span/text()")[0].encode("UTF-8")).strip().replace("%/年", "")
                        loan_obj.period = str(loan.xpath("dd[5]/span/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                        s = str(loan.xpath("dd[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("个", "")
                        loan_obj.period_unit = s.split("/")[0].strip()
                        loan_obj.repayment = s.split("/")[1].strip()
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 23
    url = "https://member.niwodai.com/xiangmu/"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm, encoding="utf-8")
        loan_size = int(str(htm_obj.xpath("//div[@class='biaoList']/table/tbody/tr[1]/th[last()]/text()")[0].encode("utf-8"))\
            .replace("共", "").replace("个标", "").strip())
        if loan_size > 0:
            page = loan_size // 10
            if loan_size % 10 > 0:
                page += 1
            for p in range(1, page + 1):
                page_url = "https://member.niwodai.com/loan/loan.do?pageNo=%d&totalCount=%d" % (p, loan_size)
                page_html = download_page(page_url, request_headers)
                page_obj = parse_html(page_html, encoding="utf-8")
                loans = page_obj.xpath("//div[@class='biaoList']/table/tbody/tr")
                for loan in loans:
                    if lxml.html.tostring(loan).find("<th>") > 0:
                        continue
                    href = str(loan.xpath("td[1]/a/@href")[0])
                    original_id = href.replace(".html", "").split("/")[2]
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = REFEREE + href
                        loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                        loan_obj.borrow_amount = str(loan.xpath("td[4]/em/text()")[0].encode("utf-8")).strip().replace(",", "")
                        loan_obj.rate = str(loan.xpath("td[2]/em/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.period = str(loan.xpath("td[3]/em/text()")[0].encode("utf-8")).strip()
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.db_create(db)
            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 20
    url = "http://www.xiaomabank.com/finance.do"
    request_headers = {"Referee": "http://www.xiaomabank.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='pil_main']/table[@class='pil_table']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))\
                        .replace("width:", "").strip().replace("%;", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.xiaomabank.com/" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[5]/strong/text()")[0].encode("utf-8")).strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).replace("个月", "").strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))\
                        .replace("width:", "").strip().replace("%;", "")

                    # note: the detail page comes back gzip-compressed and must be decompressed
                    resp = urllib2.urlopen(loan_obj.href)
                    respInfo = resp.info()
                    if ("Content-Encoding" in respInfo) and (respInfo["Content-Encoding"] == "gzip"):
                        respHtml = zlib.decompress(resp.read(), 16 + zlib.MAX_WBITS)
                        info_htm_parse = parse_html(respHtml, encoding="utf-8")
                        loan_obj.repayment = str(info_htm_parse.xpath("//div[@id='pi_lt_bottom']/div[1]/div[1]/a/text()")[0].encode("utf-8"))
                    loan_obj.db_create(db)
            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 13
    url = "http://www.yirendai.com/loan/list/1?period=12&currRate=&amt=&progress="
    request_headers = {'Referee': "http://www.yirendai.com/loan/list/1", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='bidList']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath("div[2]/div[2]/div/div[@class='bid_empty_errortip']"):
                    continue
                href = str(loan.xpath("div[2]/div/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.yirendai.com" + href
                    loan_obj.title = str(loan.xpath("div[2]/div/h3/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("div[2]/div/div[3]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period = str(loan.xpath("div[2]/div/div[4]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_create(db)
            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
from borrower import Borrower
from investor import Investor
from loan import Loan

# create Borrower
borrower1 = Borrower(balance=1000)
print('borrower1 is created!')

# List borrower1 loan Requests
print('List borrower1 loan Requests')
borrower1.checkMyLoans()

# create Loan instance
print('create loan')
loan1 = Loan()
print('loan1 created')

# test submitting request validation
print('test submitting request validation "invalid borrower instance"')
loan1.submitLoanRequest('borrower', amount=5000, installment_period=6)
print('test submitting request validation invalid amount')
loan1.submitLoanRequest(borrower1, amount=000, installment_period=6)
print('test submitting request validation invalid installment_period')
loan1.submitLoanRequest(borrower1, amount=5000, installment_period=-6)

# Submit valid request
print('Submit valid request')
loan1.submitLoanRequest(borrower1, amount=5000, installment_period=6)

# List borrower1 loan Requests
print('List borrower1 loan Requests')
def crawl():
    company_id = 8
    url = "http://www.eloancn.com/new/loadAllTender.action?page=3&sidx=progress&sord=desc"
    request_headers = {'Referee': "http://www.eloancn.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        for p in range(1, 4):
            url = "http://www.eloancn.com/new/loadAllTender.action?page=%s" % p
            logger.info("page url:%s", url)
            # this page is awkward: one loan's attributes are not inside a single div
            loan_htm = download_page(url, request_headers)
            loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
            htm_1 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd300 pdl10 fl']")
            htm_2 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd140 fl']")
            htm_3 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd130 fl pdl10']")
            htm_4 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd130 fl']")

            loan_list = []
            for h1 in htm_1:
                loan_obj = Loan(company_id)
                loan_obj.title = str(h1.xpath("h3/a[@class='fl']/text()")[0].encode("utf-8"))
                loan_obj.href = str(h1.xpath("h3/a[@class='fl']/@href")[0]).replace(":80", "")
                loan_obj.original_id = loan_obj.href.split("=")[1]
                loan_list.append(loan_obj)
            for index, h2 in enumerate(htm_2):
                loan_list[index].borrow_amount = str(h2.xpath("p[@class='colorCb mt10']/text()")[0].encode("utf-8")).replace("¥", "").replace(",", "")
                loan_list[index].rate = str(h2.xpath("p[@class='colorE6']/span/text()")[0]).replace("%", "")
            for index, h3 in enumerate(htm_3):
                loan_list[index].period = str(h3.xpath("p/span/text()")[0].encode("utf-8"))
                loan_list[index].period_unit = loan_obj.PERIOD_UNIT_MONTH
                loan_list[index].repayment = str(h3.xpath("p[@class='']/text()")[0].encode("utf-8"))
            for index, h4 in enumerate(htm_4):
                loan_list[index].schedule = str(h4.xpath("p/span/em/text()")[0]).strip().replace("%", "")

            # drop loans that are already fully funded
            new_list = [i for i in loan_list if i.schedule != "100"]
            for loan in new_list:
                online_ids_set.add(loan.original_id)
                if loan.original_id in db_ids_set:
                    update_ids_set.add(loan.original_id)
                    loan.db_update(db)
                else:
                    new_ids_set.add(loan.original_id)
                    loan.db_create(db)
            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))
            time.sleep(5)

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj_off = Loan(company_id)
            loan_obj_off.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 15
    url = "https://www.iqianbang.com/invest"
    request_headers = {"Referee": "https://www.iqianbang.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='item']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if str(loan.xpath("td[7]/text()")[0].encode("utf-8")).strip() != "融资中":
                    continue
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("-")[3].replace(".shtml", "")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.iqianbang.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip()\
                        .replace(",", "").replace("元", "")
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = str(loan.xpath("td[2]/span/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")

                    # need to fetch the detail page here
                    loan_info_htm = download_page(loan_obj.href, headers={"Referee": url, "User-Agent": DEFAULT_UA})
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="UTF-8")
                    loan_obj.repayment = str(loan_info_htm_parse.xpath("//div[@class='inright']/table[@class='idetable']")[0]
                                             .xpath("tr[2]/td[2]/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.db_create(db)
            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
class Cleaner():
    account = Account()
    card = Card()
    client = Client()
    disp = Disp()
    district = District()
    loan = Loan()
    order = Order()
    trans = Trans()

    def cleanData(self):
        print("Error in district:")
        self.district.clean()
        self.district.output("cleaned_district.csv")
        print("Error in account:")
        self.account.clean(self.district.cur)
        print("Error in client:")
        self.client.clean(self.district.cur)
        print("Error in disp:")
        self.disp.clean(self.client.cur, self.account.cur)

        dat = []
        i = 0
        j = 0
        k = 0
        a_len = len(self.account.cleaned_data)
        d_len = len(self.disp.cleaned_data)
        c_len = len(self.client.cleaned_data)
        while True:
            a = self.account.cleaned_data[i]
            d = self.disp.cleaned_data[j]
            c = self.client.cleaned_data[k]
            a_a_id = a[0]
            c_c_id = c[0]
            d_a_id = d[2]
            d_c_id = d[1]
            if a_a_id != d_a_id:
                while a_a_id < d_a_id:
                    account_id, district_id, frequency, date, time = a
                    g = 'unknown'
                    t = 'UNKNOWN'
                    dat.append([account_id, district_id, frequency, date, time, g, t])
                    i = i + 1
                    a = self.account.cleaned_data[i]
                    a_a_id = a[0]
                if a_a_id > d_a_id:
                    print("Sort error")
                    break
            if (c_c_id != d_c_id):
                print("Sort error")
                break
            g = c[2]
            t = d[3]
            if (j != d_len - 1) and (k != c_len - 1):
                _d = self.disp.cleaned_data[j + 1]
                _c = self.client.cleaned_data[k + 1]
                _d_a_id = _d[2]
                if d_a_id == _d_a_id:
                    _d_c_id = _d[1]
                    _c_c_id = _c[0]
                    if _c_c_id != _d_c_id:
                        print("Sort error")
                        break
                    _g = _c[2]
                    _t = _d[3]
                    if g != _g:
                        g = 'couple'
                    if t != _t:
                        t = 'DOUBLE'
                    j = j + 1
                    k = k + 1
            account_id, district_id, frequency, date, time = a
            dat.append([account_id, district_id, frequency, date, time, g, t])
            i = i + 1
            j = j + 1
            k = k + 1
            if (i == a_len) or (j == d_len) or (k == c_len):
                break
        self.account.cleaned_data = dat

        self.account.output("cleaned_account.csv")
        self.client.output("cleaned_client.csv")
        self.disp.output("cleaned_disp.csv")
        print("Error in card:")
        self.card.clean(self.disp.cur)
        self.card.output("cleaned_card.csv")
        print("Error in loan:")
        self.loan.clean(self.account.cur)
        self.loan.output("cleaned_loan.csv")
        print("Error in order:")
        self.order.clean(self.account.cur)
        self.order.output("cleaned_order.csv")
        print("Error in trans:")
        self.trans.clean(self.account.cur)
        self.trans.output("cleaned_trans.csv")
def crawl():
    company_id = 2
    url = "http://www.ppdai.com/lend/12_s1_p1"
    request_headers = {'Referee': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='fen_ye_nav']/table/tr/td[last()]/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")
        for p in range(1, int(page) + 1):
            url = "http://www.ppdai.com/lend/12_s1_p" + str(p)
            logger.info("page url: %s", url)
            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath("//div[@class='lend_nav']/table/tr")
            if len(loans) > 0:
                for loan in loans:
                    if lxml.html.tostring(loan).find("tit_nav") > 0:
                        continue
                    href = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@href")[0])
                    original_id = href.split("/")[2].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("td[last()]/p[1]/text()")[0].encode("UTF-8")).strip()\
                            .replace(" ", "").replace("%", "").split("完成")[1]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id)
                        loan_obj.original_id = original_id
                        loan_obj.href = "http://www.ppdai.com" + href
                        loan_obj.title = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@title")[0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(loan.xpath("td[4]/text()")[0]).strip().replace("%", "")
                        period = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = float(str(loan.xpath("td[last()]/p[1]/text()")[0].encode("UTF-8")).strip()
                                                  .replace(" ", "").replace("%", "").split("完成")[1])
                        loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def process_input(avalue, lface, lrate, lterm):
    a = Asset(float(avalue))
    l = Loan(int(lterm), float(lrate), float(lface), a)
    return l
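A hypothetical usage note, assuming the constructor signatures implied above, i.e. Asset(value) and Loan(term, rate, face_value, asset):

# Hypothetical usage of process_input (argument values are made up):
loan = process_input("25.0", "100.0", "0.05", "10")
# -> a 10-period loan with face value 100.0 at a 5% rate, backed by a 25.0 asset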
def crawl():
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if loan.xpath("td[last()]/a/@href")[0].encode("utf-8") == "javascript:;":
                    # skip loans that have already closed
                    continue
                href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
                original_id = href.split("id%3D")[1].encode("utf-8").strip()
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.xinhehui.com" + href
                    title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
                    if loan.xpath("td[1]/p[1]/a/em"):
                        title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
                    else:
                        title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.title = title_1 + title_2
                    borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "")
                    if borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
                    else:
                        loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", ""))
                    if loan.xpath("td[4]/span"):
                        period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
                    else:
                        period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
                    loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 14
    url = "http://www.licaifan.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='main-list tab-con2']/li[1]/table/tr")
        if len(loans) > 0:
            # note: the first row is the table header, which we don't need, so start from 1
            for i in range(1, len(loans)):
                # skip fully-invested loans
                if str(loans[i].xpath("td[last()]/a/text()")[0].encode("utf-8")) == "投资满额":
                    continue
                href = str(loans[i].xpath("td[1]/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip() \
                        .replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.licaifan.com" + href
                    loan_obj.title = str(loans[i].xpath("td[1]/h3/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loans[i].xpath("td[3]/text()")[0].encode("utf-8")) \
                        .strip().replace(",", "")
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = str(loans[i].xpath("td[2]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    period = str(loans[i].xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip() \
                        .replace("%", "")
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 17
    # url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=1&size=10&subtime=1411910662511&_=1411910662512"
    s = int(time.time() * 1000)
    e = s + 1
    url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=%d&size=10&subtime=%d&_=%d"
    url_1 = url % (0, s, e)
    request_headers = {"Referee": "http://www.touna.cn/invest-list.html", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url_1, request_headers)
        htm_json = loads(htm, encoding="UTF-8")
        page_count = htm_json["result"]["pages"]["count"]
        # 10 items per page, rounded up
        page = page_count / 10
        if page_count % 10 > 0:
            page += 1
        if page > 0:
            for p in range(0, page):
                # recompute the current timestamp for each request
                s = int(time.time() * 1000)
                e = s + 1
                page_url = url % (p, s, e)
                loan_htm = download_page(page_url, request_headers)
                loans_json = loads(loan_htm, encoding="UTF-8")
                loans = loans_json["result"]["list"]
                for loan in loans:
                    original_id = str(loan["id"])
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.touna.cn/invest-page.html?id=%d" % int(original_id)
                        loan_obj.title = loan["name"]
                        loan_obj.borrow_amount = loan["account"]
                        loan_obj.rate = loan["apr"]
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.repayment = loan["style_name"]
                        period = str(loan["time_limit_name"].encode("utf-8"))
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
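The paging arithmetic above (integer-divide, then bump by one when there is a remainder) is ceiling division; an equivalent one-liner, shown only as a sketch:

# Ceiling-division equivalent of the page computation above:
page = (page_count + 9) // 10
# page_count=25 -> 3, page_count=30 -> 3, page_count=0 -> 0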
def crawl():
    company_id = 11
    url = "https://www.tzydb.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//div[@id='proList']/ul[@class='item_li']")
        if len(loans) > 0:
            for loan in loans:
                schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip()
                if schedule == "100%" or schedule == "100.0%":
                    # skip loans that have already closed
                    continue
                # link looks like: https://www.tzydb.com/boot/lookup/971,1017
                a_script = str(loan.xpath("li/div[1]/div[1]/div/a/@href")[0].encode("utf-8"))
                o_id = ID_RE.findall(a_script)[0]
                original_id = o_id.replace(",", "-")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.tzydb.com/boot/lookup/" + o_id
                    loan_obj.title = str(loan.xpath("li/div[1]/div[1]/div/a/text()")[0].encode("utf-8"))
                    loan_obj.borrow_amount = str(loan.xpath("li/div[2]/div[1]/span/text()")[0].encode("utf-8")).strip() \
                        .replace(" ", "").replace(",", "")
                    loan_obj.period = str(loan.xpath("li/div[2]/div[3]/span/text()")[0].encode("UTF-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.rate = str(loan.xpath("li/div[2]/div[2]/span/text()")[0]).strip().replace("%", "")
                    loan_obj.schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 25
    url = "http://www.tuandai.com/pages/ajax/invest_list.ashx?Cmd=GetInvest_List" \
          "&RepaymentTypeId=0&pagesize=5&pageindex=%s&type=3&status=1&DeadLine=0" \
          "&beginDeadLine=0&endDeadLine=0&rate=0&beginRate=0&endRate=0&strkey=&orderby=0"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page((url % 1), request_headers)
        obj = loads(htm, encoding="utf-8")
        total = int(obj["total"])
        if total > 0:
            # 5 items per page, rounded up
            page = total / 5
            if total % 5 > 0:
                page += 1
            for p in range(1, page + 1):
                htm = download_page((url % p), request_headers)
                htm_obj = loads(htm, encoding="utf-8")
                loans = htm_obj["projectList"]
                for loan in loans:
                    original_id = loan["ID"]
                    href = "http://www.tuandai.com/pages/invest/jing_detail.aspx?id=" + original_id
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = href
                        loan_obj.title = loan["Title"]
                        loan_obj.borrow_amount = loan["TotalAmount"]
                        loan_obj.rate = loan["YearRate"]
                        loan_obj.period = loan["Deadline"]
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.repayment = loan["RepaymentTypeDesc"]
                        loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 16
    # url = "http://www.itouzi.com/dinvest/invest/index"
    url = "http://www.itouzi.com/dinvest/debt/index"
    request_headers = {"Referee": "http://www.itouzi.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        # note: the ul class ends with a trailing space
        loans = loan_htm_parse.xpath("//ul[@class='invest-product-case-list mtn btn clearfix ']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath("div[@class='i-p-c-subscription']/ul[@class='i-p-c-s-detail']"):
                    continue
                href = str(loan.xpath("h2/a[@class='fl']/@href")[0])
                original_id = href.split("id=")[1]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)
                    # loan_obj = Loan(company_id, original_id)
                    # loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    # loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.itouzi.com" + href
                    loan_obj.title = str(loan.xpath("h2/a[@class='fl']/text()")[0].encode("utf-8")).strip()
                    loan_obj.repayment = str(loan.xpath("p/span[2]/text()")[0].encode("utf-8")).strip().replace("还款方式:", "")
                    loan_obj.borrow_amount = int(loan.xpath("p/span[3]/strong/text()")[0]) * 10000
                    loan_obj.rate = str(loan.xpath("p/span[5]/em[1]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    period = str(loan.xpath("p/span[4]/strong/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    # the schedule parsing here is still unverified; re-check once a live listing is available
                    if loan.xpath("div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']"):
                        loan_obj.schedule = str(loan.xpath("div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']/span[1]/span[last()]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    print loan_obj.schedule
                    # loan_obj.db_create(db)

        # logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = loans to take offline
        # off_ids_set = db_ids_set - online_ids_set
        # if off_ids_set:
        #     loan_obj = Loan(company_id)
        #     loan_obj.db_offline(db, off_ids_set)
        #     logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 24
    url = "http://www.he-pai.cn/investmentDetail/investmentDetails/ajaxInvmentList.do?pageNo=1"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["list"]
        if len(loans) > 0:
            for loan in loans:
                if str(loan["bID_SCHD"]) == "100":
                    # skip loans that have already closed
                    continue
                original_id = loan["lN_NO"]
                href = "http://www.he-pai.cn/investmentDetail/memberCenter/transferView.do?ln_no=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = loan["lN_NM"]
                    loan_obj.borrow_amount = loan["lN_AMT"]
                    loan_obj.rate = loan["lN_RATE"]
                    loan_obj.period = loan["lN_TERM"]
                    loan_obj.period_unit = loan["lN_TERM_UNIT_DESC"]
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.repayment = loan["pAY_METH_DESC"]
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 3
    url = "http://www.91wangcai.com/invest/index.html"
    request_headers = {'Referee': "http://www.91wangcai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="gb2312")
        loans = loan_htm_parse.xpath("//div[@class='proBoxNew']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='hd']/a/@href")[0])
                original_id = href.split(".")[0].split("/")[2].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.91wangcai.com" + href
                    loan_obj.title = autodecode(str(loan.xpath("div[@class='hd']/a/text()")[0].encode("gb2312"))).encode("utf-8")
                    loan_obj.borrow_amount = autodecode(str(loan.xpath("div[@class='bd']/table/tr[1]/td[1]/em/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("¥", "")
                    loan_obj.rate = str(loan.xpath("div[@class='bd']/table/tr[1]/td[2]/em/text()")[0]).strip().replace("%", "")
                    # the period cell mixes text and <em> tags; strip the tags and unescape entities
                    loan_period_text = lxml.html.tostring(loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0]) \
                        .replace("<em>", "").replace("</em>", "")
                    html_parser = HTMLParser.HTMLParser()
                    period = html_parser.unescape(loan_period_text).encode("utf-8").strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[1]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("还款方式:", "")
                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawl():
    company_id = 26
    url = "http://www.longlongweb.com/invests"
    request_headers = {"Referee": "http://www.longlongweb.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids in db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="utf-8")
        loans = loan_htm_parse.xpath("//div[@class='main01']/span/dl")
        if len(loans) > 0:
            for loan in loans:
                if not lxml.html.tostring(loan.xpath("dd/p[1]")[0]).find("href") > 0:
                    continue
                href = str(loan.xpath("dd/table/tr[1]/td[1]/a/@href")[0])
                original_id = href.split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8")) \
                        .replace(",", "").replace("¥", "").strip()
                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0].encode("utf-8")) \
                        .replace(",", "").replace("¥", "").strip()
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_obj.title = str(loan.xpath("dd/table/tr[1]/td[1]/a/@title")[0].encode("utf-8"))
                    loan_obj.rate = str(loan.xpath("dd/table/tr[2]/td[2]/span/text()")[0].encode("utf-8")).replace("%", "")
                    loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8")) \
                        .replace(",", "").replace("¥", "").strip()
                    loan_obj.period = str(loan.xpath("dd/table/tr[2]/td[3]/span/text()")[0])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0].encode("utf-8")) \
                        .replace(",", "").replace("¥", "").strip()
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.repayment = str(loan_info_htm_parse.xpath("//div[@class='enterprise-botton']/span[2]/text()")[0].encode("utf-8")) \
                        .strip().replace("还款方式:", "")
                    loan_obj.db_create(db)
        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids just crawled = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
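The schedule above divides by a borrow_amount scraped from the page; if the site ever serves "0", the float division raises ZeroDivisionError. A hedged sketch of a zero-safe variant (schedule_percent is a hypothetical helper, not part of the original code):

# Hypothetical zero-safe variant of the schedule computation above:
def schedule_percent(cast, borrow_amount):
    amount = float(borrow_amount)
    if amount == 0:
        return "0"
    return str(float(cast) / amount * 100)

# schedule_percent("5000", "10000") -> "50.0"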