Beispiel #1
0
def main(args):
    """Run the single action selected by ``args`` against the loan database.

    ``args`` is indexed by option name. The options are checked in the order
    ``initialize``, ``debits``, ``payments``, ``payment_debits``; the first
    truthy one wins and the rest are ignored. When none is set a notice is
    printed. Nothing is returned.
    """
    db = Database()

    if args['initialize']:
        # Build a three-month payment schedule for every input sample.
        for sample in get_input():
            Loan(sample).generatePaymentSchedule(months=3)
        print("Finished populating db!")
        return

    if args['debits']:
        rows = db.queryLoanDebits(args['debits'])
        if len(rows) == 0:
            print(f"No debits found for loan {args['debits']}")
            return
        for row in rows:
            pprint(vars(Debit(*row)))
        return

    if args['payments']:
        rows = db.queryLoanPayments(args['payments'])
        if len(rows) == 0:
            print(f"No payments found for loan {args['payments']}")
            return
        for row in rows:
            pprint(vars(Payment(*row)))
        return

    if args['payment_debits']:
        rows = db.queryPaymentDebits(args['payment_debits'])
        if len(rows) == 0:
            print(f"No debits found for payment {args['payment_debits']}")
            return
        for row in rows:
            pprint(vars(Debit(*row)))
        return

    print("No action specified!")
 def test_new_loan(self):
     """Check processNewLoan() through the aggregate amount it leaves behind.

     The assertions expect a loan with the same network, month and product
     to be added to the running total, and a loan from a different network
     to leave the aggregate equal to that new loan's own amount.
     """
     processor = LoanProcessor(
         Loan(1000000, 'Network 1', 'Mar', 'Loan Product 1', 1000.50))

     # Same network / month / product as the seed loan.
     matching = Loan(1000001, 'Network 1', 'Mar', 'Loan Product 1', 2000.50)
     processor.processNewLoan(matching)
     self.assertEqual(3001, processor.getAggregateAmount())

     # Different network from the seed loan.
     other_network = Loan(
         1000002, 'Network 3', 'Mar', 'Loan Product 1', 2000.50)
     processor.processNewLoan(other_network)
     self.assertEqual(2000.50, processor.getAggregateAmount())
def test_reject_loan():
    """Check reject_loan() below, just above and exactly at 250,000."""
    cases = (
        (100_000, False),
        (250_001, True),
        (250_000, False),
    )
    for amount, should_reject in cases:
        outcome = reject_loan(Loan(amount=amount)).rejected()
        assert bool(outcome) is should_reject
class After(tk.Frame):
    """Frame that creates and positions the application's action widgets.

    On construction every helper below is called once; each one instantiates
    a single child widget with this frame as its parent, stores it on an
    attribute and places it at a fixed pixel position.
    """

    def __init__(self, master):
        """Create the frame inside ``master`` and build all child widgets.

        :param master: parent widget that will contain this frame.
        """
        # Forward ``master`` to tk.Frame: without it the frame is attached to
        # Tk's default root and the caller-supplied parent is silently ignored.
        super(After, self).__init__(master)
        self.logout()
        self.create_acc()
        self.acc_detail()
        self.acc_balance()
        self.loan()
        self.report()
        self.service()
        self.withdraw()
        self.calculator()

    def logout(self):
        """Create the ``Out`` widget and place it at (1000, 100)."""
        self.log = Out(self)
        self.log.place(x=1000, y=100)

    def create_acc(self):
        """Create the ``Create`` widget and place it at (400, 200)."""
        self.create = Create(self)
        self.create.place(x=400, y=200)

    def acc_detail(self):
        """Create the ``Account_Detail`` widget and place it at (600, 200)."""
        self.detail = Account_Detail(self)
        self.detail.place(x=600, y=200)

    def acc_balance(self):
        """Create the ``Account_Balance`` widget and place it at (800, 200)."""
        self.balance = Account_Balance(self)
        self.balance.place(x=800, y=200)

    def loan(self):
        """Create the ``Loan`` widget and place it at (1000, 200)."""
        self.info_loan = Loan(self)
        self.info_loan.place(x=1000, y=200)

    def report(self):
        """Create the ``Report`` widget and place it at (400, 400)."""
        self.rep = Report(self)
        self.rep.place(x=400, y=400)

    def service(self):
        """Create the ``Service`` widget and place it at (600, 400)."""
        self.serv = Service(self)
        self.serv.place(x=600, y=400)

    def withdraw(self):
        """Create the ``Withdrawal`` widget and place it at (800, 400)."""
        self.withd = Withdrawal(self)
        self.withd.place(x=800, y=400)

    def calculator(self):
        """Create the ``Calculator`` widget and place it at (1000, 400)."""
        self.cal = Calculator(self)
        self.cal.place(x=1000, y=400)
Beispiel #5
0
    def test_capital_revolver(self):
        """Expect duration 3 and capital 315 for a 2003-11-20 to 2006-11-20 revolver."""
        opened = datetime(2003, 11, 20)
        expires = datetime(2006, 11, 20)
        loan = Loan.new_revolver(
            self.LOAN_AMOUNT,
            opened,
            expires,
            self.HIGH_RISK_RATING,
        )

        self.assertEqual(3, loan.duration())
        self.assertEqual(315, loan.capital())
Beispiel #6
0
    def test_capital_advised_line(self):
        """Expect duration 2 and capital 21 for a 2003-11-20 to 2005-11-20 advised line."""
        opened = datetime(2003, 11, 20)
        expires = datetime(2005, 11, 20)
        loan = Loan.new_advised_line(
            self.LOAN_AMOUNT,
            opened,
            expires,
            self.HIGH_RISK_RATING,
        )

        self.assertEqual(2, loan.duration())
        self.assertEqual(21, loan.capital())
Beispiel #7
0
    def save_loan(self, request, context):
        """Build a Loan from the request, call its save_loan() and reply empty.

        The loan is created from the request's ``email``, ``interest_rate``,
        ``repayment_terms`` and ``loan_amount`` fields and kept on
        ``self.loan``. ``context`` is not used.
        """
        self.loan = Loan(
            request.email,
            request.interest_rate,
            request.repayment_terms,
            request.loan_amount,
        )
        self.loan.save_loan()
        return loan_pb2.empty()
Beispiel #8
0
def processInput(line):
    """Turn one ``network,product,month_year,amount`` line into a Loan and feed it on.

    The first loan seen creates the module-level ``loanProcessor``; every
    later loan is handed to its ``processNewLoan(..., reducer=True)``.
    """
    global loanProcessor

    network, product, month_year, amount = line.split(',')
    loan = Loan('', network, month_year, product, amount)

    if loanProcessor is None:
        loanProcessor = LoanProcessor(loan)
        return
    loanProcessor.processNewLoan(loan, reducer=True)
Beispiel #9
0
def retrieve_loan():
    """Return a Loan for a randomly generated borrower.

    The borrower gets an SSN from construct_random_ssn(), a name built from
    one random entry of ``firstNames`` and one of ``lastNames``, and a salary
    drawn from ``random.randrange(4000, 10000)``.
    """
    ssn = construct_random_ssn()
    first = random.choice(firstNames)
    last = random.choice(lastNames)
    salary = random.randrange(4000, 10000)
    return Loan(first + " " + last, ssn, salary)
 def test_loan_aggregation(self):
     """Expect aggregate() to add the second loan's amount to the first (3001)."""
     processor = LoanProcessor(
         Loan(1000000, 'Network 1', 'Mar', 'Loan Product 1', 1000.50))

     addition = Loan(1000000, 'Network 1', 'Mar', 'Loan Product 1', 2000.50)
     processor.aggregate(addition)

     self.assertEqual(3001, processor.getAggregateAmount())
 def test_if_loan_can_be_aggregated(self):
     """Check canBeAggregated() for a matching loan and three mismatches.

     A loan identical to the seed is expected to be aggregatable; changing
     only the network, only the product or only the month is expected to
     make it non-aggregatable.
     """
     def build(network='Network 1', month='Mar', product='Loan Product 1'):
         # Every loan in this test shares the same id and amount.
         return Loan(1000000, network, month, product, 1000.50)

     processor = LoanProcessor(build())

     self.assertTrue(processor.canBeAggregated(build()))

     mismatches = (
         {'network': 'Network 2'},
         {'product': 'Loan Product 2'},
         {'month': 'Apr'},
     )
     for override in mismatches:
         self.assertFalse(processor.canBeAggregated(build(**override)))
    def getLoan(self, principal, interestRate, annualPayments, duration, startDate, loanPaymentExtra):
        """
        Allocate a new loan to the profile.

        Marks the profile as having a loan, increments the loan counter and
        stores a new Loan in ``self.loanDict`` under the counter value
        rendered as a string (which is also passed to Loan as its id).
        """
        self.hasLoan = True
        self.noLoans += 1

        key = str(self.noLoans)
        self.loanDict[key] = Loan(
            key,
            principal,
            interestRate,
            annualPayments,
            duration,
            startDate,
            loanPaymentExtra,
        )
Beispiel #13
0
class LoanServer(loan_pb2_grpc.LoanServicer):
    """
    Servicer that forwards each request to a single Loan held on ``self.loan``.

    NOTE(review): ``self.loan`` is None until save_loan() has been called, so
    show_installment() and repayment() fail with AttributeError before that.
    """
    def __init__(self):
        self.loan = None

    def save_loan(self, request, context):
        """Create a Loan from the request fields, call its save_loan(), reply empty."""
        self.loan = Loan(
            request.email,
            request.interest_rate,
            request.repayment_terms,
            request.loan_amount,
        )
        self.loan.save_loan()
        return loan_pb2.empty()

    def show_installment(self, request, context):
        """Reply with the current loan's show_installment() result for the request's email."""
        borrower_email = request.email
        reply = loan_pb2.installement_response()
        reply.installement_message = self.loan.show_installment(borrower_email)
        return reply

    def repayment(self, request, context):
        """Pass the request's email and repayment amount to the current loan, reply empty."""
        borrower_email = request.email
        amount_paid = request.repayment_amount
        self.loan.repayment(borrower_email, amount_paid)
        return loan_pb2.empty()
Beispiel #14
0
    def test_capital_term_loan(self):
        """Expect duration 2 and capital 210 after three yearly payments of 1000.00."""
        opened = datetime(2003, 11, 20)
        matures = datetime(2006, 11, 20)
        loan = Loan.new_term_loan(
            self.LOAN_AMOUNT,
            opened,
            matures,
            self.HIGH_RISK_RATING,
        )

        # One payment on each anniversary of the start date.
        for year in (2004, 2005, 2006):
            loan.payment(1000.00, datetime(year, 11, 20))

        self.assertEqual(2, loan.duration())
        self.assertEqual(210, loan.capital())
Beispiel #15
0
 def map_loan_row(self, row):
     """Build a Loan from one data row.

     Every Loan keyword is read from a column of ``row`` except
     ``investment``, which comes from ``self.buy_size``.
     """
     fields = dict(
         loan_id=row['id'],
         grade=row['grade'],
         int_rate=row['int_rate'],
         term=row['term'],
         amount=row['funded_amnt'],
         issue_date=row['issue_d'],
         last_date=row['last_pymnt_d'],
         investment=self.buy_size,
         defaults=row['defaulted'],
         total_payment=row['total_pymnt'],
         total_principle=row['total_rec_prncp'],
         recoveries=row['recoveries'],
     )
     return Loan(**fields)
def disponibilizar_livro(usuario):
    """Prompt for a book and a pick-up location and register the book as available.

    Reads a book ID and a location from standard input, looks the book up
    with ``lista_book(id_livro, usuario)`` and, when a row is returned,
    inserts a ``tb_loan`` record with status ``'DISPONIVEL'``. The outcome is
    reported with a printed message; nothing is returned.

    :param usuario: user object; its ``id`` is stored as the loan's ``id_user``.
    """
    try:
        id_livro = int(input("INFORME O ID DO LIVRO QUE SERÁ DISPONIBILIZADO: "))
    except ValueError:
        # A non-numeric ID cannot match any book: report it like any other
        # invalid ID instead of crashing with a traceback.
        print("ID ESCOLHIDO NÃO É VÁLIDO!")
        return
    local = input("INFORME O LOCAL ONDE O LIVRO SERÁ DISPONIBILIZADO: ")
    livro = lista_book(id_livro, usuario)

    if (len(livro) > 0):
        novo_loan = Loan(0, usuario.id, livro[0][0], local, 'DISPONIVEL')
        # NOTE(review): this statement is assembled by string interpolation
        # and ``local`` is raw user input, so it is open to SQL injection.
        # Switch to a parameterised query once exec_command's API is confirmed.
        sql_loan = f"INSERT INTO tb_loan (id_user, id_book, begin_date, coletion_location, status) values('{usuario.id}', '{novo_loan.id_book}', null, '{novo_loan.collect_location}', '{novo_loan.status}')"
        try:
            exec_command(sql_loan)
            print("LIVRO DISPONIBILIZADO COM SUCESSO!")
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
            # interpreter shutdown are not reported as a database failure.
            print("FALHA AO DISPONIBILIZAR LIVRO.")
    else:
        print("ID ESCOLHIDO NÃO É VÁLIDO!")
Beispiel #17
0
def processInput(line):
    """Parse one ``msisdn,network,date,product,amount`` line and emit it as a Loan.

    Single quotes are stripped from both ends of every field; network and
    product are additionally whitespace-stripped and lower-cased. Lines whose
    amount or date fail ``Loan.isAmountValid`` / ``Loan.isDateValid`` produce
    no output; otherwise the loan's ``display()`` text plus a newline is
    written to stdout.
    """
    msisdn, network, loan_date, product, amount = (
        field.strip("'") for field in line.split(',')
    )
    network = network.strip().lower()
    product = product.strip().lower()

    if not (Loan.isAmountValid(amount) and Loan.isDateValid(loan_date)):
        return

    month_year = Loan.extractMonthYearFromDate(loan_date)
    record = Loan(msisdn, network, month_year, product, amount)
    sys.stdout.write(record.display() + '\n')
Beispiel #18
0
def main(lenders_info, amount):
    """
    Build a loan offer from the lender data and print it, reporting errors as text.
    :param lenders_info: csv file from user
    :param amount: amount to borrow input from user
    :return: the computed Loan offer on success; None when an error was
        caught (in which case a message has been printed instead)
    """
    try:
        market = Lenders(lenders_info)
        offer = Loan(market.get_lenders(amount), amount)
        offer.input_validation()
        offer.calc_offer()
        offer.print_output()
        return offer
    except IOError:
        print("No market data found for file specified")
    except Exception as error:
        print(error)
Beispiel #19
0
def main():
    """Call interestDue(10) twice on a plain Loan and twice on a MemoizedLoan.

    Each call is preceded by a printed label. The labels mention a time
    cost, but nothing is timed here -- presumably interestDue() reports its
    own timing (TODO confirm against the Loan classes).
    """
    a = Asset(1000)
    normal_loan = Loan(12, 0.3, 1000, a)
    memoizable_loan = MemoizedLoan(12, 0.3, 1000, a)

    # print(...) with one string argument prints the same text on Python 2
    # and Python 3; the statement form is a SyntaxError on Python 3.
    print("First normal loan time cost:")
    normal_loan.interestDue(10)
    print("Second normal loan time cost:")
    normal_loan.interestDue(10)

    print("First memoizable loan time cost:")
    memoizable_loan.interestDue(10)
    print("Second memoizable loan time cost:")
    memoizable_loan.interestDue(10)
Beispiel #20
0
 def init_loan(self):
     """ Open a loan with a third-party lending platform (Poloniex in this
         demo). Future versions can include integrations with multiple
         platforms, and the ability to automatically select the optimal
         (most stable and profitable) asset and lending provider.
         The Loan is given a LoanAgent which cancels old loan offers, turns
         auto-renew off on active loans, and creates new loan offers at
         fair price (fair = average of the lowest three loan offers).
         """
     logger = self.logger
     api = self.LENDER_API
     asset = self.LOAN_ASSET
     deposit = self.initial_deposit
     duration = self.duration

     # The agent receives the minimum loan size per asset, e.g. {"USDC": 50},
     # and the initial deposit, e.g. 50.
     agent = LoanAgent(logger, api, {asset: self.MIN_LOAN}, deposit)

     # Loan(logger, api, asset, deposit, duration, agent)
     return Loan(logger, api, asset, deposit, duration, agent)
Beispiel #21
0
def crawl():
    """Crawl the weidai.com.cn loan listing and mirror it into the ``loan`` table.

    Listed loans whose id is already stored for company 22 get their
    schedule refreshed through ``db_update``; unseen ids are filled in and
    stored through ``db_create``; stored ids that are no longer listed are
    handed to ``db_offline``. Loans whose ``borrow_account_scale`` is
    "100.00" are skipped. Any error during download or parsing is logged
    and swallowed, so the function always returns None.
    """
    company_id = 22
    url = "https://www.weidai.com.cn/?m=Biao&t=today&pageIndex=1&pageSize=8&sortField=b.verify_time&sortOrder=desc&data=null"
    # NOTE(review): the header key is spelled 'Referee'; the standard HTTP
    # header is 'Referer' -- confirm what download_page expects before changing.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids already stored in the db
    db_ids_set = set()
    # every id seen online during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids updated during this crawl
    update_ids_set = set()

    # ``row`` instead of ``id`` so the builtin is not shadowed.
    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["data"]
        if len(loans) > 0:
            for loan in loans:
                if str(loan["borrow_account_scale"]) == "100.00":
                    # skip loans that have already finished
                    continue
                original_id = loan["uid"]
                href = "https://www.weidai.com.cn/page/borinfo.html?uid=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan["borrow_account_scale"])
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = loan["name"]
                    loan_obj.borrow_amount = loan["account"]
                    loan_obj.rate = loan["borrow_apr"]
                    loan_obj.period = loan["borrow_period"]
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan["borrow_account_scale"])
                    loan_obj.cast = loan["borrow_account_yes"]
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Best-effort crawl: log and carry on. ``Exception`` rather than a
        # bare ``except`` so Ctrl-C and SystemExit still stop the process.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #22
0
def crawl():
    """Crawl the yinhu.com loan list page and mirror it into the ``loan`` table.

    Each table row yields an id (from its link) and a status text. Rows with
    status "还款中" or "满标" are not created or updated. For the remaining
    rows, ids already stored for company 4 get their schedule refreshed
    through ``db_update`` and unseen ids are filled in and stored through
    ``db_create``. Stored ids that were not collected as online are handed to
    ``db_offline``. Any error during download or parsing is logged and
    swallowed, so the function always returns None.
    """
    company_id = 4
    url = "https://www.yinhu.com/loan/loan_list.bl"
    # NOTE(review): the header key is spelled 'Referee'; the standard HTTP
    # header is 'Referer' -- confirm what download_page expects before changing.
    request_headers = {'Referee': "https://www.yinhu.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids already stored in the db
    db_ids_set = set()
    # every id seen online during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids updated during this crawl
    update_ids_set = set()
    # ids to take offline
    off_ids_set = set()

    # ``row`` instead of ``id`` so the builtin is not shadowed.
    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@id='loan_list']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/p/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")
                # The status text sits under <em> for some rows and under <a>
                # for others; fall back to the second layout when the first
                # lookup fails.
                try:
                    loan_status = str(loan.xpath("td[last()]/em/span/text()")[0].encode("utf-8")).strip()
                except Exception:
                    loan_status = str(loan.xpath("td[last()]/a/span/text()")[0].encode("utf-8")).strip()

                if original_id and loan_status != "还款中":
                    online_ids_set.add(original_id)

                if loan_status == "还款中" or loan_status == "满标":
                    if original_id in db_ids_set:
                        off_ids_set.add(original_id)
                    continue

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/div[@class='bar_bg']/div/span/span/text()")[0].encode("utf-8"))\
                        .strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.yinhu.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/p/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip().replace(",", "")\
                        .replace("元", "")

                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip()
                    period = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.schedule = str(loan.xpath("td[6]/div[@class='bar_bg']/div/span/span/text()")[0].encode("utf-8"))\
                        .strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        # NOTE(review): this assignment discards the ids collected into
        # off_ids_set inside the loop, so stored "满标" loans (which are also
        # counted as online above) are never taken offline here -- confirm
        # whether the two sets were meant to be merged.
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Best-effort crawl: log and carry on. ``Exception`` rather than a
        # bare ``except`` so Ctrl-C and SystemExit still stop the process.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #23
0
def main():
    """Print the exercise 2.1.3 report for a 360-period loan.

    Shows the monthly payment, then six period-60 figures (balance, interest
    and principal, each via the direct and the recursive method) wrapped in
    timer start()/end() calls, then the totals, and finally the rate, term
    and face before and after they are reassigned.
    """
    print('========== Exersize 2.1.3 ==========\n')
    loan = Loan(360, .025, 100000)
    print("Monthly Payment: {}".format(loan.monthlyPayment()))

    stopwatch = timer()
    timed_reports = (
        ("Balance after 60 periods: {}",
         lambda: loan.balance(60)),
        ('Balance in period 60 computed recursivly {}',
         lambda: loan.balanceRecursive(60, loan.face)),
        ("Interest due on period 60: {}",
         lambda: loan.interestDue(60)),
        ('Interest in period 60 computed recursivly {}',
         lambda: loan.interestDueRecursive(60, loan.face)),
        ("Principal due on period 60: {}",
         lambda: loan.principlaDue(60)),
        ('Principal in period 60 computed recursivly {}',
         lambda: loan.principalDueRecursive(60, loan.face)),
    )
    for template, compute in timed_reports:
        stopwatch.start()
        print(template.format(compute()))
        stopwatch.end()

    # Author's note: on their system both the direct and the recursive
    # versions ran too fast to register a time other than 0, although the
    # recursive versions are likely much slower.

    payment_5 = loan.interestDue(5) + loan.principlaDue(5)
    print("The total payment should equal interest plus principal which is {}".format(payment_5))
    print("Total Interest paid is {}".format(loan.totalInterest()))
    print("Total Payment is {}".format(loan.totalPayments()))
    print("Total Interest paid is {}".format(loan.totalInterest()))

    fields = ("rate", "term", "face")
    for field in fields:
        print("Old {} {}".format(field, getattr(loan, field)))

    loan.rate = .035
    loan.term = 60
    loan.face = 20000

    for field in fields:
        print("New {} {}".format(field, getattr(loan, field)))
Beispiel #24
0
def crawl():
    """Crawl the qian360.com product list and mirror it into the ``loan`` table.

    Only entries whose ``status`` is 1 are considered. Ids already stored
    for company 19 get schedule and cast refreshed through ``db_update``;
    unseen ids are filled in and stored through ``db_create``; stored ids
    that are no longer listed are handed to ``db_offline``. Any error during
    download or parsing is logged and swallowed, so the function always
    returns None.
    """
    company_id = 19
    url = "https://www.qian360.com/bq/queryProductList.html?currentPage=1&pernum=12&type=0"
    # NOTE(review): the header key is spelled 'Referee'; the standard HTTP
    # header is 'Referer' -- confirm what download_page expects before changing.
    request_headers = {'Referee': "https://www.qian360.com/tl/select.html", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids already stored in the db
    db_ids_set = set()
    # every id seen online during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids updated during this crawl
    update_ids_set = set()

    # ``row`` instead of ``id`` so the builtin is not shadowed.
    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")

        if loans_json["list"]:
            for item in loans_json["list"]:
                if int(item["status"]) != 1:
                    continue
                original_id = str(item["borrowId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(item["percent"])
                    loan_obj.cast = str(int(item["accountYes"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.qian360.com/ti/detail.html?borrowId=%s" % original_id
                    loan_obj.title = item["name"]
                    loan_obj.rate = str(item["apr"])
                    loan_obj.period = str(item["totalPeriod"])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.borrow_amount = str(int(item["account"]))
                    loan_obj.schedule = str(item["percent"])
                    loan_obj.cast = str(int(item["accountYes"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Best-effort crawl: log and carry on. ``Exception`` rather than a
        # bare ``except`` so Ctrl-C and SystemExit still stop the process.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #25
0
def crawl():
    """Crawl the qian360.com product list and mirror it into the ``loan`` table.

    Only entries whose ``status`` is 1 are considered. Ids already stored
    for company 19 get schedule and cast refreshed through ``db_update``;
    unseen ids are filled in and stored through ``db_create``; stored ids
    that are no longer listed are handed to ``db_offline``. Any error during
    download or parsing is logged and swallowed, so the function always
    returns None.
    """
    company_id = 19
    url = "https://www.qian360.com/bq/queryProductList.html?currentPage=1&pernum=12&type=0"
    # NOTE(review): the header key is spelled 'Referee'; the standard HTTP
    # header is 'Referer' -- confirm what download_page expects before changing.
    request_headers = {
        'Referee': "https://www.qian360.com/tl/select.html",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # ids already stored in the db
    db_ids_set = set()
    # every id seen online during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids updated during this crawl
    update_ids_set = set()

    # ``row`` instead of ``id`` so the builtin is not shadowed.
    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")

        if loans_json["list"]:
            for item in loans_json["list"]:
                if int(item["status"]) != 1:
                    continue
                original_id = str(item["borrowId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(item["percent"])
                    loan_obj.cast = str(int(item["accountYes"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.qian360.com/ti/detail.html?borrowId=%s" % original_id
                    loan_obj.title = item["name"]
                    loan_obj.rate = str(item["apr"])
                    loan_obj.period = str(item["totalPeriod"])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.borrow_amount = str(int(item["account"]))
                    loan_obj.schedule = str(item["percent"])
                    loan_obj.cast = str(int(item["accountYes"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # Best-effort crawl: log and carry on. ``Exception`` rather than a
        # bare ``except`` so Ctrl-C and SystemExit still stop the process.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #26
0
def crawl():
    """Crawl the lufax.com "fuying" product listing and mirror it into the ``loan`` table.

    Ids already stored for company 9 get schedule and cast refreshed
    through ``db_update``; unseen ids are filled in and stored through
    ``db_create``; stored ids that are no longer listed are handed to
    ``db_offline``. Any error during download or parsing is logged and
    swallowed, so the function always returns None.
    """
    company_id = 9
    url = "https://list.lufax.com/list/service/product/fuying-product-list/listing/1"
    # NOTE(review): the header key is spelled 'Referee'; the standard HTTP
    # header is 'Referer' -- confirm what download_page expects before changing.
    request_headers = {
        'Referee': "https://list.lufax.com/list/listing/fuying",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # ids already stored in the db
    db_ids_set = set()
    # every id seen online during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids updated during this crawl
    update_ids_set = set()

    # ``row`` instead of ``id`` so the builtin is not shadowed.
    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        loan_num = loans_json["totalCount"]
        if loans_json and loan_num:
            # NOTE(review): the loop is driven by ``totalCount`` while only
            # one listing page is downloaded; if totalCount exceeds
            # len(data) the lookup below raises IndexError, which aborts the
            # crawl (and skips the offline step) via the handler at the
            # bottom -- confirm the API's paging before changing this.
            for i in range(0, loan_num):
                entry = loans_json["data"][i]
                original_id = str(entry["productId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(float(entry["progress"]) * 100)
                    loan_obj.cast = str(int(entry["raisedAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://list.lufax.com/list/productDetail?productId=%s" % original_id
                    loan_obj.title = entry["productNameDisplay"]
                    loan_obj.rate = str(float(entry["interestRate"]) * 100)
                    period = str(entry["investPeriodDisplay"].encode("utf-8"))
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = entry["collectionModeDisplay"]
                    loan_obj.borrow_amount = str(int(entry["price"]))
                    loan_obj.schedule = str(float(entry["progress"]) * 100)
                    loan_obj.cast = str(int(entry["raisedAmount"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # Best-effort crawl: log and carry on. ``Exception`` rather than a
        # bare ``except`` so Ctrl-C and SystemExit still stop the process.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #27
0
def crawl():
    """Crawl the id68.cn investment listing and mirror it into the ``loan`` table.

    Listing items whose progress text is "100.00%" are skipped. Ids already
    stored for company 21 get their schedule refreshed through
    ``db_update``; unseen ids are filled in and stored through
    ``db_create``; stored ids that are no longer listed are handed to
    ``db_offline``. Any error during download or parsing is logged and
    swallowed, so the function always returns None.
    """
    company_id = 21
    url = "http://www.id68.cn/invest/index/borrow_status/9/p/1.html"
    # NOTE(review): the header key is spelled 'Referee'; the standard HTTP
    # header is 'Referer' -- confirm what download_page expects before changing.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids already stored in the db
    db_ids_set = set()
    # every id seen online during this crawl
    online_ids_set = set()
    # ids created during this crawl
    new_ids_set = set()
    # ids updated during this crawl
    update_ids_set = set()

    # ``row`` instead of ``id`` so the builtin is not shadowed.
    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//ul[@class='ideal_con']/li")
        if len(loans) > 0:
            for loan in loans:
                if str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]) == "100.00%":
                    # skip loans that have already finished
                    continue
                href = str(loan.xpath("table/tr[1]/td[1]/a/@href")[0].encode("utf-8"))
                original_id = href.replace(".html", "").split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = REFEREE + href
                    loan_obj.title = str(loan.xpath("table/tr[1]/td[1]/a/@title")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("table/tr[2]/td[last()]/span/text()")[0].encode("utf-8"))\
                        .strip().replace(" ", "").replace(",", "")
                    loan_obj.repayment = str(loan.xpath("table/tr[2]/td[4]/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.rate = str(loan.xpath("table/tr[2]/td[2]/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("table/tr[2]/td[3]/span/text()")[0].encode("utf-8")).strip()\
                        .replace(" ", "").replace("个月", "")
                    # NOTE(review): the "个月" (months) suffix is stripped from
                    # the period above, yet the unit is recorded as
                    # PERIOD_UNIT_DAY -- confirm which unit is intended.
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Best-effort crawl: log and carry on. ``Exception`` rather than a
        # bare ``except`` so Ctrl-C and SystemExit still stop the process.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #28
0
def crawl():
    """Sync the open projects listed on jimubox.com into the ``loan`` table.

    Loans whose id is already stored for this company (status=0) get their
    ``schedule`` and ``cast`` refreshed through ``Loan.db_update``; ids not
    yet stored are created through ``Loan.db_create``; stored ids that were
    not seen on the page are handed to ``Loan.db_offline``.

    Any ``Exception`` raised while downloading or parsing is logged together
    with its traceback and swallowed, so the function never raises for those.
    """
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    request_headers = {'Referee': "http://www.jimubox.com", 'User-Agent': DEFAULT_UA}
    # Every field of a project card sits below this common xpath prefix.
    card = "div[@class='project-item']/div[@class='project-item-content']"

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids currently stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id found online during this run
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids refreshed during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath(card + "/h4/a/@href")[0])
                # Skip cards whose link has no "Index" after its first character.
                if href.find("Index") <= 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    # The progress bar width ("width: NN%") carries the funding progress.
                    loan_obj.schedule = str(loan.xpath(card + "/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.cast = str(loan.xpath(card + "/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = str(loan.xpath(card + "/h4/a/text()")[0].encode("utf-8"))
                    loan_obj.description = str(loan.xpath(card + "/p[@class='project-detail']/text()")[0].encode("utf-8")).strip()
                    # NOTE(review): "0000" is appended to the scraped figure, presumably
                    # because the page shows the amount in units of 10,000 - confirm.
                    loan_obj.borrow_amount = str(loan.xpath(card + "/p[@class='project-info']/span[@class='project-sum-money']/text()")[0].encode("utf-8"))\
                        .strip() + "0000"
                    loan_obj.cast = str(loan.xpath(card + "/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")

                    rate = str(loan.xpath(card + "/div[@class='project-other']/div[@class='project-other-left']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    if rate.find("+") > 0:
                        # A rate written as "a+b" is stored as the sum of both parts.
                        rate_parts = rate.split("+")
                        loan_obj.rate = str(float(rate_parts[0]) + float(rate_parts[1]))
                    else:
                        loan_obj.rate = rate
                    loan_obj.repayment = str(loan.xpath(card + "/h6/span/text()")[0].encode("utf-8"))
                    loan_obj.period = str(loan.xpath(card + "/div[@class='project-other']/div[@class='project-other-right']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath(card + "/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #29
0
def crawl():
    """Sync the paginated loan list of my089.com into the ``loan`` table.

    The page count is read from the first listing page, then every page is
    downloaded in turn. Fully funded entries ("100%") are skipped. Ids already
    stored for this company (status=0) get their ``schedule`` refreshed via
    ``Loan.db_update``, new ids are created via ``Loan.db_create``, and stored
    ids that were not seen on any page are handed to ``Loan.db_offline``.

    Any ``Exception`` raised while downloading or parsing is logged together
    with its traceback and swallowed.
    """
    company_id = 18
    url = "https://www.my089.com/Loan/default.aspx?pid=1"
    request_headers = {'Referee': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids currently stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id found online during this run
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids refreshed during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        # The pager text has the characters around the number stripped away.
        page_count = str(htm_obj.xpath("//div[@class='yema rt']/span[@class='z_page']/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")
        for p in range(1, int(page_count) + 1):
            # ``url`` is rebound so the error log below names the failing page.
            url = "https://www.my089.com/Loan/default.aspx?pid=" + str(p)
            logger.info("page url: %s", url)

            loan_htm = download_page(url, request_headers)
            # Kept under its own name; ``loan_obj`` is reserved for Loan records.
            page_obj = parse_html(loan_htm)
            loans = page_obj.xpath("//div[@class='Loan_box']/dl[@class='LoanList']")
            if len(loans) > 0:
                for loan in loans:
                    # Entries that are already fully funded are skipped.
                    if str(loan.xpath("dd[last()]/p/span/text()")[0]) == "100%":
                        continue
                    href = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@href")[0])
                    original_id = href.split("=")[1].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "https://www.my089.com/Loan/" + href
                        loan_obj.title = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@title")[0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("dd[4]/span/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(loan.xpath("dd[3]/span/text()")[0].encode("UTF-8")).strip().replace("%/年", "")
                        loan_obj.period = str(loan.xpath("dd[5]/span/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                        # The same cell holds "<unit>/<repayment>" as plain text.
                        s = str(loan.xpath("dd[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("个", "")
                        unit_and_repayment = s.split("/")
                        loan_obj.period_unit = unit_and_repayment[0].strip()
                        loan_obj.repayment = unit_and_repayment[1].strip()
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #30
0
def crawl():
    """Sync the paginated loan list of niwodai.com into the ``loan`` table.

    The total number of loans is read from the table header of the listing
    page; the loans are then fetched ten per page. Ids already stored for
    this company (status=0) get their ``schedule`` refreshed via
    ``Loan.db_update``, new ids are created via ``Loan.db_create``, and stored
    ids that were not seen on any page are handed to ``Loan.db_offline``.

    Any ``Exception`` raised while downloading or parsing is logged together
    with its traceback and swallowed.
    """
    company_id = 23
    url = "https://member.niwodai.com/xiangmu/"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids currently stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id found online during this run
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids refreshed during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm, encoding="utf-8")
        loan_size = int(str(htm_obj.xpath("//div[@class='biaoList']/table/tbody/tr[1]/th[last()]/text()")[0].encode("utf-8"))\
            .replace("共", "").replace("个标", "").strip())
        if loan_size > 0:
            # Ceiling division with ``//`` keeps the page count an int even when
            # ``/`` is true division; a float here would make range() fail.
            page = (loan_size + 9) // 10
            for p in range(1, page + 1):
                # ``url`` is rebound so the error log below names the failing page.
                url = "https://member.niwodai.com/loan/loan.do?pageNo=%d&totalCount=%d" % (p, loan_size)
                page_html = download_page(url, request_headers)
                page_obj = parse_html(page_html, encoding="utf-8")
                loans = page_obj.xpath("//div[@class='biaoList']/table/tbody/tr")
                for loan in loans:
                    # Header rows contain <th> cells and carry no loan data.
                    if lxml.html.tostring(loan).find("<th>") > 0:
                        continue
                    href = str(loan.xpath("td[1]/a/@href")[0])
                    original_id = href.replace(".html", "").split("/")[2]
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = REFEREE + href
                        loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                        loan_obj.borrow_amount = str(loan.xpath("td[4]/em/text()")[0].encode("utf-8")).strip().replace(",", "")
                        loan_obj.rate = str(loan.xpath("td[2]/em/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.period = str(loan.xpath("td[3]/em/text()")[0].encode("utf-8")).strip()
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #31
0
def crawl():
    """Sync the paginated loan list of my089.com into the ``loan`` table.

    Reads the number of pages from the first listing page and walks all of
    them. Entries shown as "100%" funded are ignored. For each remaining
    entry: a stored id (company 18, status=0) gets its ``schedule`` updated
    via ``Loan.db_update``; an unknown id is populated and saved via
    ``Loan.db_create``. Stored ids never seen online are finally passed to
    ``Loan.db_offline``.

    Any ``Exception`` raised while downloading or parsing is logged together
    with its traceback and swallowed.
    """
    company_id = 18
    url = "https://www.my089.com/Loan/default.aspx?pid=1"
    request_headers = {
        'Referee': "http://www.ppdai.com",
        'User-Agent': DEFAULT_UA
    }
    # xpath of the funding-progress text, used for filtering and for schedule
    progress_path = "dd[last()]/p/span/text()"

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # ids currently stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id found online during this run
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids refreshed during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        pager_text = str(
            htm_obj.xpath(
                "//div[@class='yema rt']/span[@class='z_page']/text()")
            [0].encode("UTF-8"))
        page_count = int(pager_text.replace("共", "").replace("页", ""))
        for p in range(1, page_count + 1):
            # ``url`` is rebound so the error log below names the failing page.
            url = "https://www.my089.com/Loan/default.aspx?pid=" + str(p)
            logger.info("page url: %s", url)

            loan_htm = download_page(url, request_headers)
            # Kept under its own name; ``loan_obj`` is reserved for Loan records.
            listing = parse_html(loan_htm)
            loans = listing.xpath(
                "//div[@class='Loan_box']/dl[@class='LoanList']")
            if len(loans) > 0:
                for loan in loans:
                    # Fully funded entries are not tracked.
                    if str(loan.xpath(progress_path)[0]) == "100%":
                        continue
                    href = str(
                        loan.xpath("dd[2]/div[@class='txt_tou']/a/@href")[0])
                    original_id = href.split("=")[1].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(
                            loan.xpath(progress_path)[0].encode(
                                "UTF-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "https://www.my089.com/Loan/" + href
                        loan_obj.title = str(
                            loan.xpath("dd[2]/div[@class='txt_tou']/a/@title")
                            [0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(
                            loan.xpath("dd[4]/span/text()")[0].encode(
                                "UTF-8")).strip().replace("¥", "").replace(
                                    ",", "")
                        loan_obj.rate = str(
                            loan.xpath("dd[3]/span/text()")[0].encode(
                                "UTF-8")).strip().replace("%/年", "")
                        loan_obj.period = str(
                            loan.xpath("dd[5]/span/text()")[0].encode(
                                "UTF-8")).strip().replace(" ", "")
                        # The cell's own text reads "<unit>/<repayment>".
                        s = str(loan.xpath("dd[5]/text()")[0].encode(
                            "UTF-8")).strip().replace(" ",
                                                      "").replace("个", "")
                        pieces = s.split("/")
                        loan_obj.period_unit = pieces[0].strip()
                        loan_obj.repayment = pieces[1].strip()
                        loan_obj.schedule = str(
                            loan.xpath(progress_path)[0].encode(
                                "UTF-8")).strip().replace("%", "")
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #32
0
def crawl():
    """Sync the loan table shown on xiaomabank.com into the ``loan`` table.

    Ids already stored for this company (status=0) get their ``schedule``
    refreshed via ``Loan.db_update``. For new ids the list row is scraped and,
    in addition, the detail page is fetched to read the repayment mode before
    ``Loan.db_create`` is called. Stored ids that were not seen on the page
    are handed to ``Loan.db_offline``.

    Any ``Exception`` raised while downloading or parsing is logged together
    with its traceback and swallowed.
    """
    company_id = 20
    url = "http://www.xiaomabank.com/finance.do"
    request_headers = {"Referee": "http://www.xiaomabank.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids currently stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id found online during this run
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids refreshed during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='pil_main']/table[@class='pil_table']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    # The progress bar style ("width: NN%;") carries the funding progress.
                    loan_obj.schedule = (
                        str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))
                        .replace("width:", "")
                        .strip()
                        .replace("%;", "")
                    )
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.xiaomabank.com/" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = (
                        str(loan.xpath("td[5]/strong/text()")[0].encode("utf-8")).strip().replace(",", "")
                    )
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).replace("个月", "").strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = (
                        str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))
                        .replace("width:", "")
                        .strip()
                        .replace("%;", "")
                    )

                    # The detail page comes back gzip-compressed and has to be
                    # inflated by hand before it can be parsed.
                    resp = urllib2.urlopen(loan_obj.href)
                    try:
                        respInfo = resp.info()
                        if ("Content-Encoding" in respInfo) and (respInfo["Content-Encoding"] == "gzip"):
                            # 16 + MAX_WBITS tells zlib to expect a gzip header.
                            respHtml = zlib.decompress(resp.read(), 16 + zlib.MAX_WBITS)
                            info_htm_parse = parse_html(respHtml, encoding="utf-8")
                            loan_obj.repayment = str(
                                info_htm_parse.xpath("//div[@id='pi_lt_bottom']/div[1]/div[1]/a/text()")[0].encode("utf-8")
                            )
                    finally:
                        # Release the connection even when parsing fails; the
                        # response used to be left open.
                        resp.close()

                    loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #33
0
def crawl():
    """Sync the 12-month loan list of yirendai.com into the ``loan`` table.

    Ids already stored for this company (status=0) get ``borrow_amount``,
    ``cast`` and the derived ``schedule`` refreshed via ``Loan.db_update``;
    new ids are populated and saved via ``Loan.db_create``; stored ids that
    were not seen on the page are handed to ``Loan.db_offline``.

    Any ``Exception`` raised while downloading or parsing is logged together
    with its traceback and swallowed.
    """
    company_id = 13
    url = "http://www.yirendai.com/loan/list/1?period=12&currRate=&amt=&progress="
    request_headers = {'Referee': "http://www.yirendai.com/loan/list/1", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids currently stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id found online during this run
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids refreshed during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='bidList']/li")
        if len(loans) > 0:
            for loan in loans:
                # Only entries that contain a 'bid_empty_errortip' element are processed.
                if not loan.xpath("div[2]/div[2]/div/div[@class='bid_empty_errortip']"):
                    continue
                href = str(loan.xpath("div[2]/div/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    # The raised amount is the text that follows "已投".
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    # Progress in percent = raised / requested * 100.
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.yirendai.com" + href
                    loan_obj.title = str(loan.xpath("div[2]/div/h3/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("div[2]/div/div[3]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period = str(loan.xpath("div[2]/div/div[4]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    # The raised amount is the text that follows "已投".
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    # Progress in percent = raised / requested * 100.
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #34
0
from borrower import Borrower
from investor import Investor
from loan import Loan

# Demo script: walks through creating a borrower and submitting loan requests,
# printing a progress message before each step.

# Create a borrower with a starting balance of 1000.
borrower1 = Borrower(balance=1000)
print('borrower1 is created!')

# List borrower1's loan requests (before anything has been submitted).
print('List borrower1 loan Requests')
borrower1.checkMyLoans()

# Create a Loan instance.
print('create loan')
loan1 = Loan()
print('loan1 created')

# Exercise the request validation with three bad inputs:
# a plain string instead of a Borrower instance ...
print('test submitting request validation "invalid borrower instance"')
loan1.submitLoanRequest('borrower', amount=5000, installment_period=6)
# ... an amount of zero ...
print('test submitting request validation invalid amount')
loan1.submitLoanRequest(borrower1, amount=000, installment_period=6)
# ... and a negative installment period.
print('test submitting request validation invalid installment_period')
loan1.submitLoanRequest(borrower1, amount=5000, installment_period=-6)

# Submit a request with a real borrower, positive amount and positive period.
print('Submit valid request')
loan1.submitLoanRequest(borrower1, amount=5000, installment_period=6)

# Announce a second listing of borrower1's loan requests.
print('List borrower1 loan Requests')
Beispiel #35
0
def crawl():
    """Sync the first lufax.com "fuying" product listing page into the ``loan`` table.

    The listing endpoint returns JSON. Product ids already stored for this
    company (status=0) get ``schedule`` and ``cast`` refreshed via
    ``Loan.db_update``; new ids are populated and saved via
    ``Loan.db_create``; stored ids that were not seen in the response are
    handed to ``Loan.db_offline``.

    Any ``Exception`` raised while downloading or decoding is logged together
    with its traceback and swallowed.
    """
    company_id = 9
    url = "https://list.lufax.com/list/service/product/fuying-product-list/listing/1"
    request_headers = {'Referee': "https://list.lufax.com/list/listing/fuying", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids currently stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id found online during this run
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids refreshed during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        loan_num = loans_json["totalCount"]
        if loans_json and loan_num:
            for i in range(0, loan_num):
                # Look the record up once instead of re-indexing for every field.
                item = loans_json["data"][i]
                original_id = str(item["productId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    # "progress" is multiplied by 100 before being stored.
                    loan_obj.schedule = str(float(item["progress"]) * 100)
                    loan_obj.cast = str(int(item["raisedAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://list.lufax.com/list/productDetail?productId=%s" % original_id
                    loan_obj.title = item["productNameDisplay"]
                    loan_obj.rate = str(float(item["interestRate"]) * 100)
                    period = str(item["investPeriodDisplay"].encode("utf-8"))
                    # The display text embeds the unit; strip it and record it separately.
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = item["collectionModeDisplay"]
                    loan_obj.borrow_amount = str(int(item["price"]))
                    loan_obj.schedule = str(float(item["progress"]) * 100)
                    loan_obj.cast = str(int(item["raisedAmount"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #36
0
def crawl():
    """Sync the first three tender pages of eloancn.com into the ``loan`` table.

    On this site the attributes of one loan are spread over several sibling
    ``<dd>`` columns, so each column is scraped separately and the values are
    joined by position. Loans whose schedule is "100" are dropped. Ids already
    stored for this company (status=0) are passed to ``Loan.db_update``, new
    ids to ``Loan.db_create``, and stored ids that were not seen on any page
    are handed to ``Loan.db_offline``.

    Any ``Exception`` raised while downloading or parsing is logged together
    with its traceback and swallowed.
    """
    company_id = 8
    url = "http://www.eloancn.com/new/loadAllTender.action?page=3&sidx=progress&sord=desc"
    request_headers = {'Referee': "http://www.eloancn.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids currently stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id found online during this run
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids refreshed during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        for p in range(1, 4):
            url = "http://www.eloancn.com/new/loadAllTender.action?page=%s" % p
            logger.info("page url:%s", url)
            # Awkward page layout: the attributes of a single loan do not live
            # inside one common div, so each column is collected on its own.
            loan_htm = download_page(url, request_headers)
            loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
            htm_1 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd300 pdl10 fl']")
            htm_2 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd140 fl']")
            htm_3 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd130 fl pdl10']")
            htm_4 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd130 fl']")

            # First column: one Loan per title cell, carrying title, link and id.
            loan_list = []
            for h1 in htm_1:
                loan_obj = Loan(company_id)
                loan_obj.title = str(h1.xpath("h3/a[@class='fl']/text()")[0].encode("utf-8"))
                loan_obj.href = str(h1.xpath("h3/a[@class='fl']/@href")[0]).replace(":80", "")
                loan_obj.original_id = loan_obj.href.split("=")[1]
                loan_list.append(loan_obj)
            # Remaining columns are matched to the loans by position.
            for index, h2 in enumerate(htm_2):
                entry = loan_list[index]
                entry.borrow_amount = str(h2.xpath("p[@class='colorCb mt10']/text()")[0].encode("utf-8")).replace("¥","").replace(",","")
                entry.rate = str(h2.xpath("p[@class='colorE6']/span/text()")[0]).replace("%", "")
            for index, h3 in enumerate(htm_3):
                entry = loan_list[index]
                entry.period = str(h3.xpath("p/span/text()")[0].encode("utf-8"))
                # Read the constant from the entry itself rather than from the
                # ``loan_obj`` variable leaked out of the title loop above.
                entry.period_unit = entry.PERIOD_UNIT_MONTH
                entry.repayment = str(h3.xpath("p[@class='']/text()")[0].encode("utf-8"))
            for index, h4 in enumerate(htm_4):
                loan_list[index].schedule = str(h4.xpath("p/span/em/text()")[0]).strip().replace("%", "")

            # Drop loans that are already fully funded.
            open_loans = [entry for entry in loan_list if entry.schedule != "100"]

            for entry in open_loans:
                online_ids_set.add(entry.original_id)
                if entry.original_id in db_ids_set:
                    update_ids_set.add(entry.original_id)

                    entry.db_update(db)
                else:
                    new_ids_set.add(entry.original_id)

                    entry.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

            # Pause between page requests.
            time.sleep(5)

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj_off = Loan(company_id)
            loan_obj_off.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))


    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #37
0
def crawl():
    """Sync the loan table with the iqianbang.com invest list (company 15).

    Listing rows whose seventh cell is not the expected status text are
    skipped.  For the remaining rows, an id that is already stored with
    status=0 only has its schedule refreshed through ``db_update``; any other
    id is parsed in full, its detail page is downloaded for the repayment
    text, and it is stored through ``db_create``.  Stored ids that were not
    seen in this run are handed to ``db_offline``.

    Exceptions raised while downloading or parsing are logged and swallowed;
    KeyboardInterrupt and SystemExit are allowed to propagate.
    """
    company_id = 15
    url = "https://www.iqianbang.com/invest"
    # NOTE(review): the standard HTTP header is spelled "Referer"; the key is
    # left untouched because changing it alters the request that is sent.
    request_headers = {"Referee": "https://www.iqianbang.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_rows = db.execute("select original_id from loan where company_id=%s and status=0", company_id)
    # ids currently stored in the db for this company
    db_ids_set = {row[0].encode("utf-8") for row in db_rows}
    # all ids currently online
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids updated during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='item']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if str(loan.xpath("td[7]/text()")[0].encode("utf-8")).strip() != "融资中":
                    continue
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("-")[3].replace(".shtml", "")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    # known loan: only the schedule is refreshed
                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.iqianbang.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = (
                        str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace(",", "").replace("元", "")
                    )
                    # amounts carrying the "万" suffix are multiplied by 10000
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = (
                        str(loan.xpath("td[2]/span/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    )
                    period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")

                    # the repayment text is only available on the detail page
                    loan_info_htm = download_page(loan_obj.href, headers={"Referee": url, "User-Agent": DEFAULT_UA})
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="UTF-8")
                    loan_obj.repayment = str(
                        loan_info_htm_parse.xpath("//div[@class='inright']/table[@class='idetable']")[0]
                        .xpath("tr[2]/td[2]/span/text()")[0]
                        .encode("utf-8")
                    ).strip()

                    loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

        # ids in the db that were not seen online must be taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the crawler
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
class Cleaner():
    """Run the clean() step of every table wrapper and write the cleaned CSV files.

    NOTE(review): the wrappers below are class attributes, so they are created
    once when the class body executes and are shared by every Cleaner instance.
    """

    account = Account()
    card = Card()
    client = Client()
    disp = Disp()
    district = District()
    loan = Loan()
    order = Order()
    trans = Trans()

    def cleanData(self):
        """Clean all tables in dependency order and write one CSV per table.

        District is cleaned first; account, client and disp are cleaned using
        the ``cur`` attributes of the tables cleaned before them.  The cleaned
        account rows are then extended with two extra columns derived from the
        disp and client rows (see the merge loop below) before account, client
        and disp are written out.  Card, loan, order and trans are cleaned and
        written last.

        If the merge loop detects inconsistent ordering it prints "Sort error"
        and stops merging; the rows collected so far are still written.
        """
        print("Error in district:")
        self.district.clean()
        self.district.output("cleaned_district.csv")
        print("Error in account:")
        self.account.clean(self.district.cur)
        print("Error in client:")
        self.client.clean(self.district.cur)
        print("Error in disp:")
        self.disp.clean(self.client.cur, self.account.cur)
        # Merged account rows: the five account fields plus g and t.
        dat = []
        # i, j and k index the account, disp and client rows respectively.
        i = 0
        j = 0
        k = 0
        a_len = len(self.account.cleaned_data)
        d_len = len(self.disp.cleaned_data)
        c_len = len(self.client.cleaned_data)
        # NOTE(review): the three lists are walked in lockstep, which presumably
        # relies on them being sorted consistently by clean() - confirm.
        while True:
            a = self.account.cleaned_data[i]
            d = self.disp.cleaned_data[j]
            c = self.client.cleaned_data[k]
            # Judging by the names: account id of the account row, client id of
            # the client row, and the account/client ids stored in the disp row.
            a_a_id = a[0]
            c_c_id = c[0]
            d_a_id = d[2]
            d_c_id = d[1]
            if a_a_id != d_a_id:
                # Accounts that sort before the current disp row have no disp
                # entry: emit them with placeholder values and move on.
                while a_a_id < d_a_id:
                    account_id, district_id, frequency, date, time = a
                    g = 'unknown'
                    t = 'UNKNOWN'
                    dat.append(
                        [account_id, district_id, frequency, date, time, g, t])
                    i = i + 1
                    a = self.account.cleaned_data[i]
                    a_a_id = a[0]
                # The disp row refers to an account id that was already passed.
                if a_a_id > d_a_id:
                    print("Sort error")
                    break
            # The current client row must be the one the disp row refers to.
            if (c_c_id != d_c_id):
                print("Sort error")
                break
            g = c[2]
            t = d[3]
            # Look one row ahead: a second disp row for the same account is
            # folded into the current output row.
            if (j != d_len - 1) and (k != c_len - 1):
                _d = self.disp.cleaned_data[j + 1]
                _c = self.client.cleaned_data[k + 1]
                _d_a_id = _d[2]
                if d_a_id == _d_a_id:
                    _d_c_id = _d[1]
                    _c_c_id = _c[0]
                    if _c_c_id != _d_c_id:
                        print("Sort error")
                        break
                    _g = _c[2]
                    _t = _d[3]
                    # Differing values across the two rows collapse to a marker.
                    if g != _g:
                        g = 'couple'
                    if t != _t:
                        t = 'DOUBLE'
                    # Consume the extra disp/client pair.
                    j = j + 1
                    k = k + 1
            account_id, district_id, frequency, date, time = a
            dat.append([account_id, district_id, frequency, date, time, g, t])
            i = i + 1
            j = j + 1
            k = k + 1
            # Stop as soon as any of the three lists is exhausted.
            if (i == a_len) or (j == d_len) or (k == c_len):
                break
        # The merged rows replace the account data before it is written.
        self.account.cleaned_data = dat
        self.account.output("cleaned_account.csv")
        self.client.output("cleaned_client.csv")
        self.disp.output("cleaned_disp.csv")
        print("Error in card:")
        self.card.clean(self.disp.cur)
        self.card.output("cleaned_card.csv")
        print("Error in loan:")
        self.loan.clean(self.account.cur)
        self.loan.output("cleaned_loan.csv")
        print("Error in order:")
        self.order.clean(self.account.cur)
        self.order.output("cleaned_order.csv")
        print("Error in trans:")
        self.trans.clean(self.account.cur)
        self.trans.output("cleaned_trans.csv")
Beispiel #39
0
def crawl():
    """Sync the loan table with every page of the ppdai.com lend list (company 2).

    The page count is read from the pager of the first page, then each page is
    downloaded in turn.  Rows whose markup contains "tit_nav" are skipped.  An
    id already stored with status=0 only has its schedule refreshed through
    ``db_update``; any other id is parsed in full and stored through
    ``db_create``.  Stored ids that were not seen in this run are handed to
    ``db_offline``.

    Exceptions raised while downloading or parsing are logged and swallowed;
    KeyboardInterrupt and SystemExit are allowed to propagate.
    """
    company_id = 2
    url = "http://www.ppdai.com/lend/12_s1_p1"
    # NOTE(review): the standard HTTP header is spelled "Referer"; the key is
    # left untouched because changing it alters the request that is sent.
    request_headers = {'Referee': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_rows = db.execute("select original_id from loan where company_id=%s and status=0", company_id)
    # ids currently stored in the db for this company
    db_ids_set = {row[0].encode("utf-8") for row in db_rows}
    # all ids currently online
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids updated during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        # the last pager cell holds the page count wrapped in "共" ... "页"
        page = str(htm_obj.xpath("//div[@class='fen_ye_nav']/table/tr/td[last()]/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")

        for p in range(1, int(page) + 1):
            url = "http://www.ppdai.com/lend/12_s1_p" + str(p)
            logger.info("page url: %s", url)

            loan_htm = download_page(url, request_headers)
            page_obj = parse_html(loan_htm)
            loans = page_obj.xpath("//div[@class='lend_nav']/table/tr")
            if len(loans) > 0:
                for loan in loans:
                    if lxml.html.tostring(loan).find("tit_nav") > 0:
                        continue
                    href = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@href")[0])
                    original_id = href.split("/")[2].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        # known loan: only the schedule is refreshed
                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("td[last()]/p[1]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("%", "").split("完成")[1]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id)
                        loan_obj.original_id = original_id
                        loan_obj.href = "http://www.ppdai.com" + href
                        loan_obj.title = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@title")[0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(loan.xpath("td[4]/text()")[0]).strip().replace("%", "")
                        period = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                        # NOTE(review): stored as float here but as str in the update branch
                        loan_obj.schedule = float(str(loan.xpath("td[last()]/p[1]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("%", "").split("完成")[1])

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db that were not seen online must be taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the crawler
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def process_input(avalue, lface, lrate, lterm):
    """Build a Loan from raw input values.

    ``avalue``, ``lface`` and ``lrate`` are converted with ``float`` and
    ``lterm`` with ``int``.  The Loan is constructed as
    ``Loan(term, rate, face, asset)`` where the asset wraps ``avalue``.
    """
    asset = Asset(float(avalue))
    term = int(lterm)
    rate = float(lrate)
    face = float(lface)
    return Loan(term, rate, face, asset)
Beispiel #41
0
def crawl():
    """Sync the loan table with the xinhehui.com invest list (company 10).

    Rows whose last-cell link is "javascript:;" are skipped.  An id already
    stored with status=0 only has its schedule refreshed through
    ``db_update``; any other id is parsed in full and stored through
    ``db_create``.  Stored ids that were not seen in this run are handed to
    ``db_offline``.

    Exceptions raised while downloading or parsing are logged and swallowed;
    KeyboardInterrupt and SystemExit are allowed to propagate.
    """
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    # NOTE(review): the standard HTTP header is spelled "Referer"; the key is
    # left untouched because changing it alters the request that is sent.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_rows = db.execute("select original_id from loan where company_id=%s and status=0", company_id)
    # ids currently stored in the db for this company
    db_ids_set = {row[0].encode("utf-8") for row in db_rows}
    # all ids currently online
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids updated during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if loan.xpath("td[last()]/a/@href")[0].encode("utf-8") == "javascript:;":
                    # skip loans that have already finished
                    continue
                href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
                original_id = href.split("id%3D")[1].encode("utf-8").strip()
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    # known loan: only the schedule is refreshed
                    loan_obj = Loan(company_id, original_id)
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.xinhehui.com" + href
                    # the title is the link text plus the text of its <em> or <span> child
                    title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
                    if loan.xpath("td[1]/p[1]/a/em"):
                        title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
                    else:
                        title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.title = title_1 + title_2
                    borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "")
                    # amounts carrying the "万" suffix are multiplied by 10000
                    if borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
                    else:
                        loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", ""))

                    if loan.xpath("td[4]/span"):
                        period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
                    else:
                        period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
                    loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db that were not seen online must be taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the crawler
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #42
0
def crawl():
    """Sync the loan table with the licaifan.com home-page loan table (company 14).

    The first table row is the header and is skipped, as are rows whose
    last-cell link text marks the loan as fully funded.  An id already stored
    with status=0 only has its schedule refreshed through ``db_update``; any
    other id is parsed in full and stored through ``db_create``.  Stored ids
    that were not seen in this run are handed to ``db_offline``.

    Exceptions raised while downloading or parsing are logged and swallowed;
    KeyboardInterrupt and SystemExit are allowed to propagate.
    """
    company_id = 14
    url = "http://www.licaifan.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_rows = db.execute("select original_id from loan where company_id=%s and status=0", company_id)
    # ids currently stored in the db for this company
    db_ids_set = {row[0].encode("utf-8") for row in db_rows}
    # all ids currently online
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids updated during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='main-list tab-con2']/li[1]/table/tr")
        if len(loans) > 0:
            # the first row is the table header, so start from the second one
            for row in loans[1:]:
                if str(row.xpath("td[last()]/a/text()")[0].encode("utf-8")) == "投资满额":
                    continue
                href = str(row.xpath("td[1]/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    # known loan: only the schedule is refreshed
                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(row.xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip()\
                        .replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.licaifan.com" + href
                    loan_obj.title = str(row.xpath("td[1]/h3/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(row.xpath("td[3]/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    # amounts carrying the "万" suffix are multiplied by 10000
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = str(row.xpath("td[2]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    period = str(row.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(row.xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip()\
                        .replace("%", "")

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db that were not seen online must be taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the crawler
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #43
0
def crawl():
    """Sync the loan table with the touna.cn JSON borrow list (company 17).

    The first request (page 0) is used to read the total item count, from
    which the number of 10-item pages is derived.  Every page is then fetched
    with freshly computed timestamp parameters.  An id already stored with
    status=0 only has its schedule refreshed through ``db_update``; any other
    id is stored through ``db_create``.  Stored ids that were not seen in this
    run are handed to ``db_offline``; this step only runs when at least one
    page was reported.

    Exceptions raised while downloading or parsing are logged and swallowed;
    KeyboardInterrupt and SystemExit are allowed to propagate.
    """
    company_id = 17
    # example: "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=1&size=10&subtime=1411910662511&_=1411910662512"
    s = int(time.time() * 1000)
    e = s + 1
    url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=%d&size=10&subtime=%d&_=%d"
    url_1 = url % (0, s, e)
    # NOTE(review): the standard HTTP header is spelled "Referer"; the key is
    # left untouched because changing it alters the request that is sent.
    request_headers = {"Referee": "http://www.touna.cn/invest-list.html", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_rows = db.execute("select original_id from loan where company_id=%s and status=0", company_id)
    # ids currently stored in the db for this company
    db_ids_set = {row[0].encode("utf-8") for row in db_rows}
    # all ids currently online
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids updated during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        htm = download_page(url_1, request_headers)
        htm_json = loads(htm, encoding="UTF-8")
        page_count = htm_json["result"]["pages"]["count"]
        # floor division keeps the page number an int on every Python version
        page = page_count // 10
        if page_count % 10 > 0:
            page += 1
        if page > 0:
            for p in range(0, page):
                # recompute the timestamp parameters for every request
                s = int(time.time() * 1000)
                e = s + 1
                page_url = url % (p, s, e)
                loan_htm = download_page(page_url, request_headers)
                loans_json = loads(loan_htm, encoding="UTF-8")
                loans = loans_json["result"]["list"]
                for loan in loans:
                    original_id = str(loan["id"])
                    if original_id:
                        online_ids_set.add(original_id)
                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        # known loan: only the schedule is refreshed
                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.touna.cn/invest-page.html?id=%d" % int(original_id)
                        loan_obj.title = loan["name"]
                        loan_obj.borrow_amount = loan["account"]
                        loan_obj.rate = loan["apr"]
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.repayment = loan["style_name"]
                        period = str(loan["time_limit_name"].encode("utf-8"))
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

            # ids in the db that were not seen online must be taken offline
            off_ids_set = db_ids_set - online_ids_set
            if off_ids_set:
                loan_obj = Loan(company_id)
                loan_obj.db_offline(db, off_ids_set)
                logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the crawler
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #44
0
def crawl():
    """Sync the loan table with the tzydb.com home-page project list (company 11).

    Entries whose schedule text is "100%" or "100.0%" are skipped.  The id is
    taken from the entry link with ``ID_RE`` and stored with "," replaced by
    "-".  An id already stored with status=0 only has its schedule refreshed
    through ``db_update``; any other id is parsed in full and stored through
    ``db_create``.  Stored ids that were not seen in this run are handed to
    ``db_offline``.

    Exceptions raised while downloading or parsing are logged and swallowed;
    KeyboardInterrupt and SystemExit are allowed to propagate.
    """
    company_id = 11
    url = "https://www.tzydb.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_rows = db.execute("select original_id from loan where company_id=%s and status=0", company_id)
    # ids currently stored in the db for this company
    db_ids_set = {row[0].encode("utf-8") for row in db_rows}
    # all ids currently online
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids updated during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//div[@id='proList']/ul[@class='item_li']")
        if len(loans) > 0:
            for loan in loans:
                # read once; reused for the skip test and the stored value
                schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip()
                if schedule == "100%" or schedule == "100.0%":
                    # skip loans that have already finished
                    continue
                # link looks like https://www.tzydb.com/boot/lookup/971,1017
                a_script = str(loan.xpath("li/div[1]/div[1]/div/a/@href")[0].encode("utf-8"))
                o_id = ID_RE.findall(a_script)[0]
                original_id = o_id.replace(",", "-")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    # known loan: only the schedule is refreshed
                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = schedule.replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)
                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.tzydb.com/boot/lookup/" + o_id
                    loan_obj.title = str(loan.xpath("li/div[1]/div[1]/div/a/text()")[0].encode("utf-8"))
                    loan_obj.borrow_amount = str(loan.xpath("li/div[2]/div[1]/span/text()")[0].encode("utf-8")).strip()\
                        .replace(" ", "").replace(",", "")
                    loan_obj.period = str(loan.xpath("li/div[2]/div[3]/span/text()")[0].encode("UTF-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.rate = str(loan.xpath("li/div[2]/div[2]/span/text()")[0]).strip().replace("%", "")
                    loan_obj.schedule = schedule.replace("%", "")

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db that were not seen online must be taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the crawler
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #45
0
def crawl():
    """Sync the loan table with the tuandai.com JSON invest list (company 25).

    The first page is requested to read ``total``, from which the number of
    5-item pages is derived; pages 1..N are then fetched.  An id already
    stored with status=0 has its schedule and cast refreshed through
    ``db_update``; any other id is stored through ``db_create``.  Stored ids
    that were not seen in this run are handed to ``db_offline``.

    Exceptions raised while downloading or parsing are logged and swallowed;
    KeyboardInterrupt and SystemExit are allowed to propagate.
    """
    company_id = 25
    url = "http://www.tuandai.com/pages/ajax/invest_list.ashx?Cmd=GetInvest_List" \
          "&RepaymentTypeId=0&pagesize=5&pageindex=%s&type=3&status=1&DeadLine=0" \
          "&beginDeadLine=0&endDeadLine=0&rate=0&beginRate=0&endRate=0&strkey=&orderby=0"
    # NOTE(review): the standard HTTP header is spelled "Referer"; the key is
    # left untouched because changing it alters the request that is sent.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_rows = db.execute("select original_id from loan where company_id=%s and status=0", company_id)
    # ids currently stored in the db for this company
    db_ids_set = {row[0].encode("utf-8") for row in db_rows}
    # all ids currently online
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids updated during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page((url % 1), request_headers)
        obj = loads(htm, encoding="utf-8")
        total = int(obj["total"])
        if total > 0:
            # floor division keeps the page number an int on every Python version
            page = total // 5
            if total % 5 > 0:
                page += 1
            for p in range(1, page+1):
                htm = download_page((url % p), request_headers)
                htm_obj = loads(htm, encoding="utf-8")
                loans = htm_obj["projectList"]
                for loan in loans:
                    original_id = loan["ID"]
                    href = "http://www.tuandai.com/pages/invest/jing_detail.aspx?id=" + original_id
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        # known loan: only the progress fields are refreshed
                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = href
                        loan_obj.title = loan["Title"]
                        loan_obj.borrow_amount = loan["TotalAmount"]
                        loan_obj.rate = loan["YearRate"]
                        loan_obj.period = loan["Deadline"]
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.repayment = loan["RepaymentTypeDesc"]

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db that were not seen online must be taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the crawler
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #46
0
def crawl():
    """Parse the itouzi.com debt list (company 16); database writes are disabled.

    This crawler is unfinished: it downloads the list, skips entries without
    a subscription detail block, builds a ``Loan`` for every id that is not
    yet stored with status=0 and prints the parsed schedule.  The
    ``db_update`` / ``db_create`` / ``db_offline`` steps are commented out,
    so nothing is written to the database.

    Exceptions raised while downloading or parsing are logged and swallowed;
    KeyboardInterrupt and SystemExit are allowed to propagate.
    """
    company_id = 16
    # url = "http://www.itouzi.com/dinvest/invest/index"
    url = "http://www.itouzi.com/dinvest/debt/index"
    # NOTE(review): the standard HTTP header is spelled "Referer"; the key is
    # left untouched because changing it alters the request that is sent.
    request_headers = {"Referee": "http://www.itouzi.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_rows = db.execute("select original_id from loan where company_id=%s and status=0", company_id)
    # ids currently stored in the db for this company
    db_ids_set = {row[0].encode("utf-8") for row in db_rows}
    # all ids currently online
    online_ids_set = set()
    # ids that would be created
    new_ids_set = set()
    # ids that would be updated
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        # note the trailing space inside the ul's class attribute
        loans = loan_htm_parse.xpath("//ul[@class='invest-product-case-list mtn btn clearfix ']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath("div[@class='i-p-c-subscription']/ul[@class='i-p-c-s-detail']"):
                    continue
                href = str(loan.xpath("h2/a[@class='fl']/@href")[0])
                original_id = href.split("id=")[1]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    # loan_obj = Loan(company_id, original_id)
                    # loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    # loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.itouzi.com" + href
                    loan_obj.title = str(loan.xpath("h2/a[@class='fl']/text()")[0].encode("utf-8")).strip()
                    loan_obj.repayment = (
                        str(loan.xpath("p/span[2]/text()")[0].encode("utf-8")).strip().replace("还款方式:", "")
                    )
                    loan_obj.borrow_amount = int(loan.xpath("p/span[3]/strong/text()")[0]) * 10000

                    loan_obj.rate = (
                        str(loan.xpath("p/span[5]/em[1]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    )
                    period = str(loan.xpath("p/span[4]/strong/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    # the schedule parsing is unverified; re-check it once a live loan is listed
                    if loan.xpath("div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']"):
                        loan_obj.schedule = (
                            str(
                                loan.xpath(
                                    "div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']/span[1]/span[last()]/text()"
                                )[0].encode("utf-8")
                            )
                            .strip()
                            .replace("%", "")
                        )
                        # parenthesised form prints the same under Python 2 and parses under Python 3
                        print(loan_obj.schedule)
                    # loan_obj.db_create(db)
        #
        #    logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))
        #
        ## ids in the db that were not seen online must be taken offline
        # off_ids_set = db_ids_set - online_ids_set
        # if off_ids_set:
        #    loan_obj = Loan(company_id)
        #    loan_obj.db_offline(db, off_ids_set)
        #    logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop the crawler
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #47
0
def crawl():
    """Synchronise the loan listing of company 24 (he-pai.cn) with the ``loan`` table.

    The JSON listing at ``url`` is downloaded and every entry whose
    ``bID_SCHD`` is not ``100`` is processed:

    * if its ``lN_NO`` is already stored with ``status=0``, only the schedule
      is refreshed through ``Loan.db_update``;
    * otherwise a new ``Loan`` is filled in and passed to ``Loan.db_create``.

    Ids stored with ``status=0`` that are missing from the listing are handed
    to ``Loan.db_offline``.  Any ``Exception`` raised inside the
    download/parse/store step is logged with its traceback and the function
    returns normally.
    """
    company_id = 24
    url = "http://www.he-pai.cn/investmentDetail/investmentDetails/ajaxInvmentList.do?pageNo=1"
    # NOTE(review): 'Referee' looks like a misspelling of the HTTP 'Referer'
    # header -- confirm against the other crawlers before changing it.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids stored in the db with status=0
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id present in the current listing
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids updated during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        for loan in htm_obj["list"]:
            schedule = str(loan["bID_SCHD"])
            if schedule == "100":
                # skip loans that have already finished
                continue

            original_id = loan["lN_NO"]
            if not original_id:
                # Without an id the entry cannot be matched against the db;
                # storing it would create a row with an empty original_id.
                continue
            online_ids_set.add(original_id)

            loan_obj = Loan(company_id, original_id)
            loan_obj.schedule = schedule
            if original_id in db_ids_set:
                update_ids_set.add(original_id)
                loan_obj.db_update(db)
            else:
                new_ids_set.add(original_id)
                loan_obj.href = "http://www.he-pai.cn/investmentDetail/memberCenter/transferView.do?ln_no=" + original_id
                loan_obj.title = loan["lN_NM"]
                loan_obj.borrow_amount = loan["lN_AMT"]
                loan_obj.rate = loan["lN_RATE"]
                loan_obj.period = loan["lN_TERM"]
                loan_obj.period_unit = loan["lN_TERM_UNIT_DESC"]
                loan_obj.repayment = loan["pAY_METH_DESC"]
                loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db minus the ids just crawled = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit still
        # stop the process instead of being logged as a parse failure.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #48
0
def crawl():
    """Synchronise the loan listing of company 3 (91wangcai.com) with the ``loan`` table.

    The listing page is downloaded and parsed as gb2312; each
    ``div.proBoxNew`` element is treated as one loan:

    * if its id (taken from the link href) is already stored with
      ``status=0``, only the schedule is refreshed through ``Loan.db_update``;
    * otherwise title, amount, rate, period, repayment and schedule are
      scraped and the loan is passed to ``Loan.db_create``.

    Ids stored with ``status=0`` that are missing from the page are handed to
    ``Loan.db_offline``.  Any ``Exception`` raised inside the
    download/parse/store step is logged with its traceback and the function
    returns normally.
    """
    company_id = 3
    url = "http://www.91wangcai.com/invest/index.html"
    # NOTE(review): 'Referee' looks like a misspelling of the HTTP 'Referer'
    # header -- confirm against the other crawlers before changing it.
    request_headers = {
        'Referee': "http://www.91wangcai.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # ids stored in the db with status=0
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id present in the current listing
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids updated during this run
    update_ids_set = set()

    def _gb_text(node, path):
        # First xpath hit, pushed through the gb2312 -> autodecode -> utf-8
        # chain that every text field of this site needs.
        return autodecode(str(node.xpath(path)[0].encode("gb2312"))).encode("utf-8")

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="gb2312")
        loans = loan_htm_parse.xpath("//div[@class='proBoxNew']")

        if loans:
            # one parser is enough for the whole page
            html_parser = HTMLParser.HTMLParser()

            for loan in loans:
                href = str(loan.xpath("div[@class='hd']/a/@href")[0])
                original_id = href.split(".")[0].split("/")[2].encode("utf-8")
                if not original_id:
                    # Without an id the entry cannot be matched against the
                    # db; storing it would create a row with an empty id.
                    continue
                online_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                # the schedule is needed for both the update and the create path
                loan_obj.schedule = _gb_text(loan, "div[@class='bd']/table/tr[2]/td[2]/text()") \
                    .replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)
                    loan_obj.db_update(db)
                    continue

                new_ids_set.add(original_id)
                loan_obj.href = "http://www.91wangcai.com" + href
                loan_obj.title = _gb_text(loan, "div[@class='hd']/a/text()")
                loan_obj.borrow_amount = _gb_text(loan, "div[@class='bd']/table/tr[1]/td[1]/em/text()") \
                    .replace("¥", "")
                loan_obj.rate = str(
                    loan.xpath("div[@class='bd']/table/tr[1]/td[2]/em/text()")[0]
                ).strip().replace("%", "")

                # The period cell is serialised back to markup, stripped of
                # its <em> tags and then unescaped to get the plain text.
                loan_period_text = lxml.html.tostring(loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0]) \
                    .replace("<em>", "").replace("</em>", "")
                period = html_parser.unescape(loan_period_text).encode("utf-8").strip()
                if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                    loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                else:
                    loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                loan_obj.repayment = _gb_text(loan, "div[@class='bd']/table/tr[2]/td[1]/text()") \
                    .replace("还款方式:", "")

                loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db minus the ids just crawled = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit still
        # stop the process instead of being logged as a parse failure.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #49
0
def crawl():
    """Synchronise the loan listing of company 3 (91wangcai.com) with the ``loan`` table.

    The listing page is downloaded and parsed as gb2312; each
    ``div.proBoxNew`` element is treated as one loan:

    * if its id (taken from the link href) is already stored with
      ``status=0``, only the schedule is refreshed through ``Loan.db_update``;
    * otherwise title, amount, rate, period, repayment and schedule are
      scraped and the loan is passed to ``Loan.db_create``.

    Ids stored with ``status=0`` that are missing from the page are handed to
    ``Loan.db_offline``.  Any ``Exception`` raised inside the
    download/parse/store step is logged with its traceback and the function
    returns normally.
    """
    company_id = 3
    url = "http://www.91wangcai.com/invest/index.html"
    # NOTE(review): 'Referee' looks like a misspelling of the HTTP 'Referer'
    # header -- confirm against the other crawlers before changing it.
    request_headers = {'Referee': "http://www.91wangcai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids stored in the db with status=0
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id present in the current listing
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids updated during this run
    update_ids_set = set()

    def _gb_text(node, path):
        # First xpath hit, pushed through the gb2312 -> autodecode -> utf-8
        # chain that every text field of this site needs.
        return autodecode(str(node.xpath(path)[0].encode("gb2312"))).encode("utf-8")

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="gb2312")
        loans = loan_htm_parse.xpath("//div[@class='proBoxNew']")

        if loans:
            # one parser is enough for the whole page
            html_parser = HTMLParser.HTMLParser()

            for loan in loans:
                href = str(loan.xpath("div[@class='hd']/a/@href")[0])
                original_id = href.split(".")[0].split("/")[2].encode("utf-8")
                if not original_id:
                    # Without an id the entry cannot be matched against the
                    # db; storing it would create a row with an empty id.
                    continue
                online_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                # the schedule is needed for both the update and the create path
                loan_obj.schedule = _gb_text(loan, "div[@class='bd']/table/tr[2]/td[2]/text()") \
                    .replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)
                    loan_obj.db_update(db)
                    continue

                new_ids_set.add(original_id)
                loan_obj.href = "http://www.91wangcai.com" + href
                loan_obj.title = _gb_text(loan, "div[@class='hd']/a/text()")
                loan_obj.borrow_amount = _gb_text(loan, "div[@class='bd']/table/tr[1]/td[1]/em/text()") \
                    .replace("¥", "")
                loan_obj.rate = str(loan.xpath("div[@class='bd']/table/tr[1]/td[2]/em/text()")[0]).strip().replace("%", "")

                # The period cell is serialised back to markup, stripped of
                # its <em> tags and then unescaped to get the plain text.
                loan_period_text = lxml.html.tostring(loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0]) \
                    .replace("<em>", "").replace("</em>", "")
                period = html_parser.unescape(loan_period_text).encode("utf-8").strip()
                if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                    loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                else:
                    loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                loan_obj.repayment = _gb_text(loan, "div[@class='bd']/table/tr[2]/td[1]/text()") \
                    .replace("还款方式:", "")

                loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db minus the ids just crawled = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit still
        # stop the process instead of being logged as a parse failure.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #50
0
def crawl():
    """Synchronise the loan listing of company 10 (xinhehui.com) with the ``loan`` table.

    The listing page is downloaded and every row of the
    ``ui-record-table percentTable mt10`` table is treated as one loan,
    except rows whose last-column link is ``javascript:;``:

    * if the id (the part of the href after ``id%3D``) is already stored with
      ``status=0``, only the schedule is refreshed through ``Loan.db_update``;
    * otherwise title, amount, period, rate, repayment and schedule are
      scraped and the loan is passed to ``Loan.db_create``.

    Ids stored with ``status=0`` that are missing from the page are handed to
    ``Loan.db_offline``.  Any ``Exception`` raised inside the
    download/parse/store step is logged with its traceback and the function
    returns normally.
    """
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    # NOTE(review): 'Referee' looks like a misspelling of the HTTP 'Referer'
    # header -- confirm against the other crawlers before changing it.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # ids stored in the db with status=0
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id present in the current listing
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids updated during this run
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath(
            "//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        for loan in loans:
            if loan.xpath("td[last()]/a/@href")[0].encode("utf-8") == "javascript:;":
                # skip loans that have already finished
                continue

            href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
            original_id = href.split("id%3D")[1].encode("utf-8").strip()
            if not original_id:
                # Without an id the row cannot be matched against the db;
                # storing it would create a row with an empty original_id.
                continue
            online_ids_set.add(original_id)

            loan_obj = Loan(company_id, original_id)
            # The schedule is needed for both the update and the create path;
            # rows without a progress link are recorded as "0".
            if loan.xpath("td[7]/div/a"):
                loan_obj.schedule = str(
                    loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")
                ).strip().replace("%", "")
            else:
                loan_obj.schedule = "0"

            if original_id in db_ids_set:
                update_ids_set.add(original_id)
                loan_obj.db_update(db)
                continue

            new_ids_set.add(original_id)
            loan_obj.href = "https://www.xinhehui.com" + href

            # The title is the link text followed by its <em> child or,
            # when there is no <em>, its <span> child.
            title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
            if loan.xpath("td[1]/p[1]/a/em"):
                title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
            else:
                title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
            loan_obj.title = title_1 + title_2

            # Amounts carrying the "万" suffix are multiplied by 10000;
            # anything else is read after dropping "元" and thousands commas.
            borrow_amount = str(
                loan.xpath("td[2]/span/text()")[0].encode("utf-8")
            ).strip().replace(" ", "")
            if borrow_amount.find("万") > 0:
                loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
            else:
                loan_obj.borrow_amount = float(
                    borrow_amount.replace("元", "").replace(",", ""))

            if loan.xpath("td[4]/span"):
                period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
            else:
                period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
            if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
            else:
                loan_obj.period = period.replace("个", "").replace(
                    loan_obj.PERIOD_UNIT_MONTH, "")
                loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

            loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
            loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()

            loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db minus the ids just crawled = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit still
        # stop the process instead of being logged as a parse failure.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #51
0
def crawl():
    """Synchronise the loan listing of company 26 (longlongweb.com) with the ``loan`` table.

    The listing page is downloaded and every ``div.main01/span/dl`` element
    whose first ``<p>`` contains ``href`` is treated as one loan.  For each of
    them the detail page is fetched as well, because the schedule is derived
    from the amount found there (``cast``) divided by the borrow amount:

    * if the id (third ``/``-separated part of the href) is already stored
      with ``status=0``, the loan goes through ``Loan.db_update``;
    * otherwise title, rate, period and repayment are scraped too and the
      loan is passed to ``Loan.db_create``.

    Ids stored with ``status=0`` that are missing from the page are handed to
    ``Loan.db_offline``.  Any ``Exception`` raised inside the
    download/parse/store step is logged with its traceback and the function
    returns normally.
    """
    company_id = 26
    url = "http://www.longlongweb.com/invests"
    # NOTE(review): 'Referee' looks like a misspelling of the HTTP 'Referer'
    # header -- confirm against the other crawlers before changing it.
    request_headers = {"Referee": "http://www.longlongweb.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids stored in the db with status=0
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id present in the current listing
    online_ids_set = set()
    # ids created during this run
    new_ids_set = set()
    # ids updated during this run
    update_ids_set = set()

    def _amount(text):
        # utf-8 encode a money string and drop thousands commas, the currency
        # sign and surrounding whitespace
        return str(text.encode("utf-8")).replace(",", "").replace("¥", "").strip()

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="utf-8")
        loans = loan_htm_parse.xpath("//div[@class='main01']/span/dl")
        if loans:
            for loan in loans:
                # only entries whose first <p> carries an "href" are processed
                if not (lxml.html.tostring(loan.xpath("dd/p[1]")[0]).find("href") > 0):
                    continue

                href = str(loan.xpath("dd/table/tr[1]/td[1]/a/@href")[0])
                original_id = href.split("/")[2]
                if not original_id:
                    # Without an id the entry cannot be matched against the
                    # db; storing it would create a row with an empty id.
                    continue
                online_ids_set.add(original_id)

                # Fields needed by both the update and the create path.
                loan_obj = Loan(company_id, original_id)
                loan_obj.href = "http://www.longlongweb.com" + href
                loan_obj.borrow_amount = _amount(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0])

                loan_info_htm = download_page(loan_obj.href, request_headers)
                loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                loan_obj.cast = _amount(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0])
                # schedule is stored as cast / borrow_amount expressed in percent
                loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)
                    loan_obj.db_update(db)
                    continue

                new_ids_set.add(original_id)
                loan_obj.title = str(loan.xpath("dd/table/tr[1]/td[1]/a/@title")[0].encode("utf-8"))
                loan_obj.rate = str(
                    loan.xpath("dd/table/tr[2]/td[2]/span/text()")[0].encode("utf-8")
                ).replace("%", "")
                loan_obj.period = str(loan.xpath("dd/table/tr[2]/td[3]/span/text()")[0])
                loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                loan_obj.repayment = (
                    str(
                        loan_info_htm_parse.xpath("//div[@class='enterprise-botton']/span[2]/text()")[0].encode(
                            "utf-8"
                        )
                    )
                    .strip()
                    .replace("还款方式:", "")
                )

                loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

        # ids in the db minus the ids just crawled = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit still
        # stop the process instead of being logged as a parse failure.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())