Exemple #1
0
def main():
    """Exercise 2.1.4 driver: call Loan's class-level helpers, then an instance.

    Prints the results of ``Loan.calcMonthlyPmt`` and ``Loan.calcBal``, the
    figures reported by a ``Loan(360, .025, 100000)`` instance, and that
    instance's ``rate``/``term``/``face`` before and after reassignment.
    """

    def emit(template, value):
        # Fill the single placeholder of *template* and print the line.
        print(template.format(value))

    print('========== Exersize 2.1.4 ==========\n')

    # The benefit of the class-level method is that it allows us to compute a
    # payment or balance without instantiating an object.
    emit('Testing Loan classmethod for monthly payment : {}',
         Loan.calcMonthlyPmt(100000, .025, 360))
    emit('Testing Loan classmethod for balance: {}',
         Loan.calcBal(100000, .025, 360, 60))

    loan = Loan(360, .025, 100000)
    emit("Monthly Payment: {}", loan.monthlyPayment())
    emit("Balance after 60 periods: {}", loan.balance(60))
    emit("Interest due on period 60: {}", loan.interestDue(60))
    emit("Principal due on period 60: {}", loan.principlaDue(60))

    interest_5 = loan.interestDue(5)
    principal_5 = loan.principlaDue(5)
    emit("The total payment should equal interest plus principal which is {}",
         interest_5 + principal_5)
    emit("Total Interest paid is {}", loan.totalInterest())
    emit("Total Payment is {}", loan.totalPayments())
    emit("Total Interest paid is {}", loan.totalInterest())

    # Show the attributes, overwrite them, then show them again.
    for template, attr in (("Old rate {}", "rate"),
                           ("Old term {}", "term"),
                           ("Old face {}", "face")):
        emit(template, getattr(loan, attr))

    for attr, new_value in (("rate", .035), ("term", 60), ("face", 20000)):
        setattr(loan, attr, new_value)

    for template, attr in (("New rate {}", "rate"),
                           ("New term {}", "term"),
                           ("New face {}", "face")):
        emit(template, getattr(loan, attr))
Exemple #2
0
def main():
    """Exercise 2.1.3 driver: time direct vs. recursive Loan calculations.

    Each period-60 figure is computed and printed between ``start()`` and
    ``end()`` calls on a ``timer`` object, first with the direct method and
    then with its recursive counterpart.  Afterwards the totals and the
    ``rate``/``term``/``face`` attributes (before and after reassignment) are
    printed.
    """
    print('========== Exersize 2.1.3 ==========\n')
    loan = Loan(360, .025, 100000)
    print("Monthly Payment: {}".format(loan.monthlyPayment()))

    stopwatch = timer()
    # (message template, zero-argument callable).  The callables are only
    # invoked inside the start()/end() pair so the computation itself is timed.
    timed_reports = (
        ("Balance after 60 periods: {}",
         lambda: loan.balance(60)),
        ('Balance in period 60 computed recursivly {}',
         lambda: loan.balanceRecursive(60, loan.face)),
        ("Interest due on period 60: {}",
         lambda: loan.interestDue(60)),
        ('Interest in period 60 computed recursivly {}',
         lambda: loan.interestDueRecursive(60, loan.face)),
        ("Principal due on period 60: {}",
         lambda: loan.principlaDue(60)),
        ('Principal in period 60 computed recursivly {}',
         lambda: loan.principalDueRecursive(60, loan.face)),
    )
    for template, compute in timed_reports:
        stopwatch.start()
        print(template.format(compute()))
        stopwatch.end()

    # Author's note: on my system both the direct and the recursive versions
    # run too fast to register a time other than 0.  However, I know that the
    # recursive function is likely much slower.

    interest_5 = loan.interestDue(5)
    principal_5 = loan.principlaDue(5)
    print("The total payment should equal interest plus principal which is {}".format(
        interest_5 + principal_5))
    print("Total Interest paid is {}".format(loan.totalInterest()))
    print("Total Payment is {}".format(loan.totalPayments()))
    print("Total Interest paid is {}".format(loan.totalInterest()))

    # Show the attributes, overwrite them, then show them again.
    for template, attr in (("Old rate {}", "rate"),
                           ("Old term {}", "term"),
                           ("Old face {}", "face")):
        print(template.format(getattr(loan, attr)))

    for attr, new_value in (("rate", .035), ("term", 60), ("face", 20000)):
        setattr(loan, attr, new_value)

    for template, attr in (("New rate {}", "rate"),
                           ("New term {}", "term"),
                           ("New face {}", "face")):
        print(template.format(getattr(loan, attr)))
Exemple #3
0
def main():
    """Exercise 2.1.2 driver: print the figures of a Loan(360, .025, 100000).

    Reports the monthly payment, the period-360 balance/interest/principal,
    the interest-plus-principal sum for period 5, the totals, and finally the
    ``rate``/``term``/``face`` attributes before and after reassignment.
    """

    def emit(template, value):
        # Fill the single placeholder of *template* and print the line.
        print(template.format(value))

    print('========== Exercise 2.1.2 ==========')
    loan = Loan(360, .025, 100000)
    emit("Monthly Payment: {}", loan.monthlyPayment())
    emit("Balance after 360 periods: {}", loan.balance(360))
    emit("Interest due on period 360: {}", loan.interestDue(360))
    emit("Principal due on period 360: {}", loan.principlaDue(360))

    interest_5 = loan.interestDue(5)
    principal_5 = loan.principlaDue(5)
    emit("The total payment should equal interest plus principal which is {}",
         interest_5 + principal_5)
    emit("Total Interest paid is {}", loan.totalInterest())
    emit("Total Payment is {}", loan.totalPayments())

    # Show the attributes, overwrite them, then show them again.
    for template, attr in (("Old rate {}", "rate"),
                           ("Old term {}", "term"),
                           ("Old face {}", "face")):
        emit(template, getattr(loan, attr))

    for attr, new_value in (("rate", .035), ("term", 60), ("face", 20000)):
        setattr(loan, attr, new_value)

    for template, attr in (("New rate {}", "rate"),
                           ("New term {}", "term"),
                           ("New face {}", "face")):
        emit(template, getattr(loan, attr))
Exemple #4
0
def main():
    """Exercise 2.1.5 driver: call Loan's rate-conversion helpers, then an instance.

    Prints the results of ``Loan.monthlyRate(.025)`` and
    ``Loan.annualRate(.01)``, the figures reported by a
    ``Loan(360, .025, 100000)`` instance, and that instance's
    ``rate``/``term``/``face`` before and after reassignment.
    """

    def emit(template, value):
        # Fill the single placeholder of *template* and print the line.
        print(template.format(value))

    print('========== Exersize 2.1.5 ==========\n')
    emit('Testing static method for monthly rate : {}', Loan.monthlyRate(.025))
    emit('Testing testing static method for annual rate: {}',
         Loan.annualRate(.01))

    # The benefit of the static method is that neither the class nor the
    # instance is passed in.  This allows us to include functions that may be
    # useful for the class but do not directly rely on information in the
    # class.  From an organizational standpoint this makes the code easier to
    # organize.  For example, the rate conversion is not logically part of a
    # loan object, yet the loan class is where it makes most sense to keep it.

    loan = Loan(360, .025, 100000)
    emit("Monthly Payment: {}", loan.monthlyPayment())
    emit("Balance after 60 periods: {}", loan.balance(60))
    emit("Interest due on period 60: {}", loan.interestDue(60))
    emit("Principal due on period 60: {}", loan.principlaDue(60))

    interest_5 = loan.interestDue(5)
    principal_5 = loan.principlaDue(5)
    emit("The total payment should equal interest plus principal which is {}",
         interest_5 + principal_5)
    emit("Total Interest paid is {}", loan.totalInterest())
    emit("Total Payment is {}", loan.totalPayments())
    emit("Total Interest paid is {}", loan.totalInterest())

    # Show the attributes, overwrite them, then show them again.
    for template, attr in (("Old rate {}", "rate"),
                           ("Old term {}", "term"),
                           ("Old face {}", "face")):
        emit(template, getattr(loan, attr))

    for attr, new_value in (("rate", .035), ("term", 60), ("face", 20000)):
        setattr(loan, attr, new_value)

    for template, attr in (("New rate {}", "rate"),
                           ("New term {}", "term"),
                           ("New face {}", "face")):
        emit(template, getattr(loan, attr))
Exemple #5
0
def crawl():
    """Fetch the lufax fuying product listing and sync it with the loan table.

    Product ids in the listing that are already stored (company 9, status 0)
    get ``schedule`` and ``cast`` refreshed through ``Loan.db_update``; ids
    not yet stored are populated and saved through ``Loan.db_create``; stored
    ids that do not appear in the listing are handed to ``Loan.db_offline``.

    Any ``Exception`` raised while downloading, parsing or saving is logged
    with its traceback and suppressed; in that case the offline step is
    skipped.
    """
    company_id = 9
    url = "https://list.lufax.com/list/service/product/fuying-product-list/listing/1"
    request_headers = {
        # The HTTP request header is spelled "Referer"; the previous
        # 'Referee' key is not a header name servers recognise.
        'Referer': "https://list.lufax.com/list/listing/fuying",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id currently online
    online_ids_set = set()
    # ids seen for the first time
    new_ids_set = set()
    # ids that were already stored
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        loan_num = loans_json["totalCount"]
        if loans_json and loan_num:
            for i in range(0, loan_num):
                item = loans_json["data"][i]
                original_id = str(item["productId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(float(item["progress"]) * 100)
                    loan_obj.cast = str(int(item["raisedAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://list.lufax.com/list/productDetail?productId=%s" % original_id
                    loan_obj.title = item["productNameDisplay"]
                    loan_obj.rate = str(float(item["interestRate"]) * 100)
                    period = str(item["investPeriodDisplay"].encode("utf-8"))
                    # The display text carries its unit; strip the unit to
                    # leave the bare number.
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = item["collectionModeDisplay"]
                    loan_obj.borrow_amount = str(int(item["price"]))
                    loan_obj.schedule = str(float(item["progress"]) * 100)
                    loan_obj.cast = str(int(item["raisedAmount"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # Not a bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #6
0
def crawl():
    """Crawl the niwodai project list page by page and sync it with the loan table.

    The total number of listings is read from the first page; each page of
    ten is then downloaded and every non-header table row is processed.  Ids
    already stored (company 23, status 0) get ``schedule`` refreshed through
    ``Loan.db_update``; unseen ids are populated and saved through
    ``Loan.db_create``; stored ids that were not seen are handed to
    ``Loan.db_offline``.

    Any ``Exception`` raised while downloading, parsing or saving is logged
    with its traceback and suppressed; in that case the offline step is
    skipped.
    """
    company_id = 23
    url = "https://member.niwodai.com/xiangmu/"
    # The HTTP request header is spelled "Referer"; the previous 'Referee' key
    # is not a header name servers recognise.
    request_headers = {'Referer': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id currently online
    online_ids_set = set()
    # ids seen for the first time
    new_ids_set = set()
    # ids that were already stored
    update_ids_set = set()

    def cell_text(row, path):
        # First xpath match of *path* under *row*, utf-8 encoded and stripped.
        return str(row.xpath(path)[0].encode("utf-8")).strip()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm, encoding="utf-8")
        loan_size = int(str(htm_obj.xpath("//div[@class='biaoList']/table/tbody/tr[1]/th[last()]/text()")[0].encode("utf-8"))\
            .replace("共", "").replace("个标", "").strip())
        if loan_size > 0:
            # Ten listings per page, rounded up.  Floor division keeps the
            # result an int regardless of the interpreter's "/" semantics.
            page = loan_size // 10
            if loan_size % 10 > 0:
                page += 1
            for p in range(1, page + 1):
                page_url = "https://member.niwodai.com/loan/loan.do?pageNo=%d&totalCount=%d" % (
                    p, loan_size)
                page_html = download_page(page_url, request_headers)
                page_obj = parse_html(page_html, encoding="utf-8")
                loans = page_obj.xpath(
                    "//div[@class='biaoList']/table/tbody/tr")
                for loan in loans:
                    # rows containing <th> are table headers, not listings
                    if lxml.html.tostring(loan).find("<th>") > 0:
                        continue
                    href = str(loan.xpath("td[1]/a/@href")[0])
                    original_id = href.replace(".html", "").split("/")[2]
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = cell_text(
                            loan, "td[5]/text()").replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = REFEREE + href
                        loan_obj.title = cell_text(loan, "td[1]/a/text()")
                        loan_obj.borrow_amount = cell_text(
                            loan, "td[4]/em/text()").replace(",", "")
                        loan_obj.rate = cell_text(
                            loan, "td[2]/em/text()").replace("%", "")
                        loan_obj.period = cell_text(loan, "td[3]/em/text()")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        loan_obj.schedule = cell_text(
                            loan, "td[5]/text()").replace("%", "")

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # Not a bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #7
0
def crawl():
    """Crawl the touna borrow list (JSON) page by page and sync the loan table.

    The first request supplies the total count, from which the number of
    ten-item pages is derived.  For every listed loan: ids already stored
    (company 17, status 0) get ``schedule`` refreshed through
    ``Loan.db_update``; unseen ids are populated and saved through
    ``Loan.db_create``.  When at least one page exists, stored ids that were
    not seen are handed to ``Loan.db_offline``.

    Any ``Exception`` raised while downloading, parsing or saving is logged
    with its traceback and suppressed.
    """
    company_id = 17
    # url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=1&size=10&subtime=1411910662511&_=1411910662512"
    # The query carries two millisecond timestamps, one tick apart.
    s = int(time.time() * 1000)
    e = s + 1
    url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=%d&size=10&subtime=%d&_=%d"
    url_1 = url % (0, s, e)
    # The HTTP request header is spelled "Referer"; the previous "Referee" key
    # is not a header name servers recognise.
    request_headers = {"Referer": "http://www.touna.cn/invest-list.html", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id currently online
    online_ids_set = set()
    # ids seen for the first time
    new_ids_set = set()
    # ids that were already stored
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        htm = download_page(url_1, request_headers)
        htm_json = loads(htm, encoding="UTF-8")
        page_count = htm_json["result"]["pages"]["count"]
        # Ten items per page, rounded up.  Floor division keeps the result an
        # int regardless of the interpreter's "/" semantics.
        page = page_count // 10
        if page_count % 10 > 0:
            page += 1
        if page > 0:
            for p in range(0, page):
                # recompute the timestamps for each request
                s = int(time.time() * 1000)
                e = s + 1
                page_url = url % (p, s, e)
                loan_htm = download_page(page_url, request_headers)
                loans_json = loads(loan_htm, encoding="UTF-8")
                loans = loans_json["result"]["list"]
                for loan in loans:
                    original_id = str(loan["id"])
                    if original_id:
                        online_ids_set.add(original_id)
                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.touna.cn/invest-page.html?id=%d" % int(original_id)
                        loan_obj.title = loan["name"]
                        loan_obj.borrow_amount = loan["account"]
                        loan_obj.rate = loan["apr"]
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.repayment = loan["style_name"]
                        period = str(loan["time_limit_name"].encode("utf-8"))
                        # The display text carries its unit; strip the unit
                        # to leave the bare number.
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

            # stored ids minus freshly crawled ids = loans to take offline
            off_ids_set = db_ids_set - online_ids_set
            if off_ids_set:
                loan_obj = Loan(company_id)
                loan_obj.db_offline(db, off_ids_set)
                logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Not a bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #8
0
def crawl():
    """Parse the itouzi debt listing page (persistence currently disabled).

    Every ``<li>`` of the product list that has a subscription detail block
    is parsed.  Ids are sorted into the online/new/update sets and, for ids
    not yet stored (company 16, status 0), a ``Loan`` is populated from the
    page.  The ``db_update`` / ``db_create`` / ``db_offline`` calls are
    commented out below, so nothing is written back; the only visible output
    is the parsed ``schedule`` printed for new loans that expose one.

    Any ``Exception`` raised while downloading or parsing is logged with its
    traceback and suppressed.
    """
    company_id = 16
    #url = "http://www.itouzi.com/dinvest/invest/index"
    url = "http://www.itouzi.com/dinvest/debt/index"
    request_headers = {
        # The HTTP request header is spelled "Referer"; the previous
        # 'Referee' key is not a header name servers recognise.
        'Referer': "http://www.itouzi.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id currently online
    online_ids_set = set()
    # ids seen for the first time
    new_ids_set = set()
    # ids that were already stored
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        # note the trailing space inside the ul's class attribute
        loans = loan_htm_parse.xpath(
            "//ul[@class='invest-product-case-list mtn btn clearfix ']/li")
        for loan in loans:
            if not loan.xpath(
                    "div[@class='i-p-c-subscription']/ul[@class='i-p-c-s-detail']"
            ):
                continue
            href = str(loan.xpath("h2/a[@class='fl']/@href")[0])
            original_id = href.split("id=")[1]
            if original_id:
                online_ids_set.add(original_id)

            if original_id in db_ids_set:
                update_ids_set.add(original_id)

                #loan_obj = Loan(company_id, original_id)
                #loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                #loan_obj.db_update(db)
            else:
                new_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                loan_obj.href = "http://www.itouzi.com" + href
                loan_obj.title = str(
                    loan.xpath("h2/a[@class='fl']/text()")[0].encode(
                        "utf-8")).strip()
                loan_obj.repayment = str(loan.xpath("p/span[2]/text()")[0].encode("utf-8"))\
                    .strip().replace("还款方式:", "")
                loan_obj.borrow_amount = int(
                    loan.xpath("p/span[3]/strong/text()")[0]) * 10000

                loan_obj.rate = str(
                    loan.xpath("p/span[5]/em[1]/text()")[0].encode(
                        "utf-8")).strip().replace("%", "")
                period = str(
                    loan.xpath("p/span[4]/strong/text()")[0].encode(
                        "utf-8")).strip()
                # The display text carries its unit; strip the unit to leave
                # the bare number.
                if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                    loan_obj.period = period.replace(
                        loan_obj.PERIOD_UNIT_DAY, "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                else:
                    loan_obj.period = period.replace("个", "").replace(
                        loan_obj.PERIOD_UNIT_MONTH, "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                # The progress extraction is still unverified; re-check it
                # once a live listing is available.
                if loan.xpath(
                        "div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']"
                ):
                    loan_obj.schedule = str(
                        loan.xpath(
                            "div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']/span[1]/span[last()]/text()"
                        )[0].encode("utf-8")).strip().replace("%", "")
                    # Call form prints the same single value under the
                    # Python 2 print statement and is also valid Python 3.
                    print(loan_obj.schedule)
                #loan_obj.db_create(db)
        #
        #    logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))
        #
        ## stored ids minus freshly crawled ids = loans to take offline
        #off_ids_set = db_ids_set - online_ids_set
        #if off_ids_set:
        #    loan_obj = Loan(company_id)
        #    loan_obj.db_offline(db, off_ids_set)
        #    logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Not a bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #9
0
def crawl():
    """Crawl the 91wangcai invest page (gb2312) and sync it with the loan table.

    Every ``proBoxNew`` block on the page is treated as one loan.  Ids already
    stored (company 3, status 0) get ``schedule`` refreshed through
    ``Loan.db_update``; unseen ids are populated and saved through
    ``Loan.db_create``; stored ids that were not seen are handed to
    ``Loan.db_offline``.

    Any ``Exception`` raised while downloading, parsing or saving is logged
    with its traceback and suppressed; in that case the offline step is
    skipped.
    """
    company_id = 3
    url = "http://www.91wangcai.com/invest/index.html"
    # The HTTP request header is spelled "Referer"; the previous 'Referee' key
    # is not a header name servers recognise.
    request_headers = {'Referer': "http://www.91wangcai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id currently online
    online_ids_set = set()
    # ids seen for the first time
    new_ids_set = set()
    # ids that were already stored
    update_ids_set = set()

    def gb_text(node, path):
        # First xpath match of *path* under *node*, round-tripped through
        # gb2312 and autodecode, returned utf-8 encoded.
        raw = str(node.xpath(path)[0].encode("gb2312"))
        return autodecode(raw).encode("utf-8")

    def schedule_of(node):
        # Progress cell with its label prefixes and the percent sign removed.
        return gb_text(node, "div[@class='bd']/table/tr[2]/td[2]/text()") \
            .replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="gb2312")
        loans = loan_htm_parse.xpath("//div[@class='proBoxNew']")

        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='hd']/a/@href")[0])
                original_id = href.split(".")[0].split("/")[2].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = schedule_of(loan)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.91wangcai.com" + href
                    loan_obj.title = gb_text(loan, "div[@class='hd']/a/text()")

                    loan_obj.borrow_amount = gb_text(
                        loan, "div[@class='bd']/table/tr[1]/td[1]/em/text()").replace("¥", "")

                    loan_obj.rate = str(loan.xpath("div[@class='bd']/table/tr[1]/td[2]/em/text()")[0]).strip().replace("%", "")

                    # The period cell is serialised to markup, its <em> tags
                    # are dropped and the HTML entities are unescaped.
                    loan_period_text = lxml.html.tostring(loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0]) \
                        .replace("<em>", "").replace("</em>", "")
                    html_parser = HTMLParser.HTMLParser()
                    period = html_parser.unescape(loan_period_text).encode("utf-8").strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.repayment = gb_text(
                        loan, "div[@class='bd']/table/tr[2]/td[1]/text()").replace("还款方式:", "")

                    loan_obj.schedule = schedule_of(loan)

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Not a bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #10
0
def crawl():
    """Crawl the renrendai loan list and sync it with the loan table.

    The loan data is read from the JSON embedded in the page's
    ``loan-list-rsp`` script element; entries whose status is not ``"OPEN"``
    are skipped.  Ids already stored (company 12, status 0) get ``schedule``
    and ``cast`` refreshed through ``Loan.db_update``; unseen ids are
    populated and saved through ``Loan.db_create``; stored ids that were not
    seen are handed to ``Loan.db_offline``.

    Any ``Exception`` raised while downloading, parsing or saving is logged
    with its traceback and suppressed; in that case the offline step is
    skipped.
    """
    company_id = 12
    url = "http://www.renrendai.com/lend/loanList.action"
    # The HTTP request header is spelled "Referer"; the previous 'Referee' key
    # is not a header name servers recognise.
    request_headers = {'Referer': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id currently online
    online_ids_set = set()
    # ids seen for the first time
    new_ids_set = set()
    # ids that were already stored
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans_script = htm_obj.xpath("//script[@id='loan-list-rsp']/text()")[0].encode("utf-8")
        loans_json = loads(loans_script, encoding="UTF-8")
        loans = loans_json["data"]["loans"]
        if len(loans) > 0:
            for loan in loans:
                if loan["status"] != "OPEN":
                    # skip loans that have already finished
                    continue
                original_id = str(int(loan["loanId"]))
                href = "http://www.renrendai.com/lend/detailPage.action?loanId=%s" % original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    # str(int(...)) never contains a ".", so no further
                    # trimming of a fractional part is needed.
                    loan_obj.schedule = str(int(loan["finishedRatio"]))
                    # amount already raised = total amount - remaining amount
                    loan_obj.cast = str(float(loan["amount"]) - float(loan["surplusAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = str(loan["title"].encode("utf-8"))
                    loan_obj.borrow_amount = str(loan["amount"])
                    loan_obj.period = str(int(loan["months"]))
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.rate = str(loan["interest"])
                    loan_obj.cast = str(float(loan["amount"]) - float(loan["surplusAmount"]))
                    loan_obj.schedule = str(int(loan["finishedRatio"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Not a bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #11
0
def crawl():
    """Crawl the weidai "today" loan feed (JSON) and sync it with the loan table.

    Entries whose ``borrow_account_scale`` renders as ``"100.00"`` are
    skipped.  Ids already stored (company 22, status 0) get ``schedule``
    refreshed through ``Loan.db_update``; unseen ids are populated and saved
    through ``Loan.db_create``; stored ids that were not seen are handed to
    ``Loan.db_offline``.

    Any ``Exception`` raised while downloading, parsing or saving is logged
    with its traceback and suppressed; in that case the offline step is
    skipped.
    """
    company_id = 22
    url = "https://www.weidai.com.cn/?m=Biao&t=today&pageIndex=1&pageSize=8&sortField=b.verify_time&sortOrder=desc&data=null"
    # The HTTP request header is spelled "Referer"; the previous 'Referee' key
    # is not a header name servers recognise.
    request_headers = {'Referer': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id currently online
    online_ids_set = set()
    # ids seen for the first time
    new_ids_set = set()
    # ids that were already stored
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["data"]
        if len(loans) > 0:
            for loan in loans:
                progress = str(loan["borrow_account_scale"])
                if progress == "100.00":
                    # skip loans that have already finished
                    continue
                original_id = loan["uid"]
                href = "https://www.weidai.com.cn/page/borinfo.html?uid=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = progress
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = loan["name"]
                    loan_obj.borrow_amount = loan["account"]
                    loan_obj.rate = loan["borrow_apr"]
                    loan_obj.period = loan["borrow_period"]
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = progress
                    loan_obj.cast = loan["borrow_account_yes"]
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Not a bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #12
0
def crawl():
    """Crawl the jimubox project list and sync it with the ``loan`` table.

    Downloads the listing page, and for every project card whose link
    contains "Index":

    * if its id is already among the ids selected from the db, updates the
      ``schedule`` and ``cast`` fields via ``Loan.db_update``;
    * otherwise fills in a new ``Loan`` and stores it via ``Loan.db_create``.

    Ids selected from the db that were not seen on the page are handed to
    ``Loan.db_offline``. Any error during download/parsing is logged with its
    traceback and the function returns normally.
    """
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    # NOTE(review): the key is spelled 'Referee'; the standard HTTP header is
    # 'Referer' -- confirm whether this is intentional before changing it.
    request_headers = {'Referee': "http://www.jimubox.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids returned by the db query above
    db_ids_set = set()
    # every id seen online during this run
    online_ids_set = set()
    # ids seen online that are not in the db yet
    new_ids_set = set()
    # ids seen online that are already in the db
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    # All per-project fields are addressed relative to this container path.
    item = "div[@class='project-item']/div[@class='project-item-content']"

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath(item + "/h4/a/@href")[0])
                # Only links containing "Index" (past position 0) are processed.
                if not href.find("Index") > 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                # Progress and invested amount are extracted identically for
                # known and new loans, so read them once up front.
                # The progress bar's inline style looks like "width: NN%".
                schedule = str(loan.xpath(item + "/div[@class='progress project-progress']/div/@style")[0])\
                    .replace("width:", "").strip().replace("%", "")
                cast = str(loan.xpath(item + "/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                    .strip().replace("/", "").replace(",", "")

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = schedule
                    loan_obj.cast = cast
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = str(loan.xpath(item + "/h4/a/text()")[0].encode("utf-8"))
                    loan_obj.description = str(loan.xpath(item + "/p[@class='project-detail']/text()")[0].encode("utf-8")).strip()
                    # "0000" is appended to the scraped figure -- presumably the
                    # page shows the amount in units of 10,000; TODO confirm.
                    loan_obj.borrow_amount = str(loan.xpath(item + "/p[@class='project-info']/span[@class='project-sum-money']/text()")[0].encode("utf-8"))\
                        .strip() + "0000"
                    loan_obj.cast = cast

                    rate = str(loan.xpath(item + "/div[@class='project-other']/div[@class='project-other-left']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    # A rate written as "a+b" is stored as the sum of both parts.
                    if rate.find("+") > 0:
                        rate_list = rate.split("+")
                        loan_obj.rate = str(float(rate_list[0]) + float(rate_list[1]))
                    else:
                        loan_obj.rate = rate
                    loan_obj.repayment = str(loan.xpath(item + "/h6/span/text()")[0].encode("utf-8"))
                    loan_obj.period = str(loan.xpath(item + "/div[@class='project-other']/div[@class='project-other-right']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = schedule

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db that were not seen online this run are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #13
0
def crawl():
    """Crawl the qian360 product list (JSON) and sync it with the ``loan`` table.

    Every entry of the response's ``list`` whose ``status`` equals 1 is either
    updated (``schedule``/``cast`` via ``Loan.db_update``) when its id is
    already among the ids selected from the db, or created via
    ``Loan.db_create`` otherwise. Ids selected from the db that were not seen
    in the response are handed to ``Loan.db_offline``. Errors are logged with
    their traceback and swallowed.
    """
    company_id = 19
    url = "https://www.qian360.com/bq/queryProductList.html?currentPage=1&pernum=12&type=0"
    # NOTE(review): the key is spelled 'Referee'; the standard HTTP header is
    # 'Referer' -- confirm whether this is intentional before changing it.
    request_headers = {'Referee': "https://www.qian360.com/tl/select.html", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids returned by the db query above
    db_ids_set = set()
    # every id seen online during this run
    online_ids_set = set()
    # ids seen online that are not in the db yet
    new_ids_set = set()
    # ids seen online that are already in the db
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")

        if loans_json["list"]:
            for item in loans_json["list"]:
                # Only entries with status 1 are processed.
                if int(item["status"]) != 1:
                    continue
                original_id = str(item["borrowId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(item["percent"])
                    loan_obj.cast = str(int(item["accountYes"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.qian360.com/ti/detail.html?borrowId=%s" % original_id
                    loan_obj.title = item["name"]
                    loan_obj.rate = str(item["apr"])
                    loan_obj.period = str(item["totalPeriod"])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.borrow_amount = str(int(item["account"]))
                    loan_obj.schedule = str(item["percent"])
                    loan_obj.cast = str(int(item["accountYes"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db that were not seen online this run are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #14
0
def crawl():
    """Crawl all niwodai listing pages and sync them with the ``loan`` table.

    Reads the total number of listings from the header row of the first page,
    derives the number of 10-item pages, then walks every page. Rows already
    among the ids selected from the db get their ``schedule`` updated via
    ``Loan.db_update``; other rows are stored via ``Loan.db_create``. Ids
    selected from the db that were not seen on any page are handed to
    ``Loan.db_offline``. Errors are logged with their traceback and swallowed.
    """
    company_id = 23
    url = "https://member.niwodai.com/xiangmu/"
    # NOTE(review): the key is spelled 'Referee'; the standard HTTP header is
    # 'Referer' -- confirm whether this is intentional before changing it.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids returned by the db query above
    db_ids_set = set()
    # every id seen online during this run
    online_ids_set = set()
    # ids seen online that are not in the db yet
    new_ids_set = set()
    # ids seen online that are already in the db
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm, encoding="utf-8")
        # The last header cell carries the total count wrapped in the
        # characters stripped below; what remains must parse as an int.
        loan_size = int(str(htm_obj.xpath("//div[@class='biaoList']/table/tbody/tr[1]/th[last()]/text()")[0].encode("utf-8"))\
            .replace("共", "").replace("个标", "").strip())
        if loan_size > 0:
            # Ceil(loan_size / 10) using integer floor division so the result
            # is an int usable by range() regardless of division semantics.
            page = loan_size // 10
            if loan_size % 10 > 0:
                page += 1
            for p in range(1, page+1):
                page_url = "https://member.niwodai.com/loan/loan.do?pageNo=%d&totalCount=%d" % (p, loan_size)
                page_html = download_page(page_url, request_headers)
                page_obj = parse_html(page_html, encoding="utf-8")
                loans = page_obj.xpath("//div[@class='biaoList']/table/tbody/tr")
                for loan in loans:
                    # Rows containing <th> cells are header rows, not loans.
                    if lxml.html.tostring(loan).find("<th>") > 0:
                        continue
                    href = str(loan.xpath("td[1]/a/@href")[0])
                    original_id = href.replace(".html", "").split("/")[2]
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = REFEREE + href
                        loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                        loan_obj.borrow_amount = str(loan.xpath("td[4]/em/text()")[0].encode("utf-8")).strip().replace(",", "")
                        loan_obj.rate = str(loan.xpath("td[2]/em/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.period = str(loan.xpath("td[3]/em/text()")[0].encode("utf-8")).strip()
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db that were not seen online this run are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #15
0
def crawl():
    """Crawl the first lufax listing page (JSON) and sync it with the ``loan`` table.

    Each entry of the response's ``data`` is either updated
    (``schedule``/``cast`` via ``Loan.db_update``) when its product id is
    already among the ids selected from the db, or created via
    ``Loan.db_create`` otherwise. Ids selected from the db that were not seen
    in the response are handed to ``Loan.db_offline``. Errors are logged with
    their traceback and swallowed.
    """
    company_id = 9
    url = "https://list.lufax.com/list/service/product/fuying-product-list/listing/1"
    # NOTE(review): the key is spelled 'Referee'; the standard HTTP header is
    # 'Referer' -- confirm whether this is intentional before changing it.
    request_headers = {'Referee': "https://list.lufax.com/list/listing/fuying", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids returned by the db query above
    db_ids_set = set()
    # every id seen online during this run
    online_ids_set = set()
    # ids seen online that are not in the db yet
    new_ids_set = set()
    # ids seen online that are already in the db
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        loan_num = loans_json["totalCount"]
        if loans_json and loan_num:
            # Iterate the entries actually present in this response instead of
            # indexing data[0..totalCount-1]: only one page is fetched, so a
            # totalCount larger than the page would raise IndexError and skip
            # the offline step below.
            for item in loans_json["data"]:
                original_id = str(item["productId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    # progress is scaled by 100 before storing
                    loan_obj.schedule = str(float(item["progress"]) * 100)
                    loan_obj.cast = str(int(item["raisedAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://list.lufax.com/list/productDetail?productId=%s" % original_id
                    loan_obj.title = item["productNameDisplay"]
                    loan_obj.rate = str(float(item["interestRate"]) * 100)
                    period = str(item["investPeriodDisplay"].encode("utf-8"))
                    # The unit follows the number, so a match past position 0
                    # selects the day unit; anything else is treated as months.
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = item["collectionModeDisplay"]
                    loan_obj.borrow_amount = str(int(item["price"]))
                    loan_obj.schedule = str(float(item["progress"]) * 100)
                    loan_obj.cast = str(int(item["raisedAmount"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db that were not seen online this run are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #16
0
def crawl():
    """Crawl the he-pai investment list (JSON) and sync it with the ``loan`` table.

    Entries whose ``bID_SCHD`` is "100" are skipped. Remaining entries are
    either updated (``schedule`` via ``Loan.db_update``) when their id is
    already among the ids selected from the db, or created via
    ``Loan.db_create`` otherwise. Ids selected from the db that were not seen
    in the response are handed to ``Loan.db_offline``. Errors are logged with
    their traceback and swallowed.
    """
    company_id = 24
    url = "http://www.he-pai.cn/investmentDetail/investmentDetails/ajaxInvmentList.do?pageNo=1"
    # NOTE(review): the key is spelled 'Referee'; the standard HTTP header is
    # 'Referer' -- confirm whether this is intentional before changing it.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids returned by the db query above
    db_ids_set = set()
    # every id seen online during this run
    online_ids_set = set()
    # ids seen online that are not in the db yet
    new_ids_set = set()
    # ids seen online that are already in the db
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["list"]
        if len(loans) > 0:
            for loan in loans:
                if str(loan["bID_SCHD"]) == "100":
                    # skip loans that have already finished
                    continue
                original_id = loan["lN_NO"]
                href = "http://www.he-pai.cn/investmentDetail/memberCenter/transferView.do?ln_no=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = loan["lN_NM"]
                    loan_obj.borrow_amount = loan["lN_AMT"]
                    loan_obj.rate = loan["lN_RATE"]
                    loan_obj.period = loan["lN_TERM"]
                    loan_obj.period_unit = loan["lN_TERM_UNIT_DESC"]
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.repayment = loan["pAY_METH_DESC"]

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db that were not seen online this run are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #17
0
def crawl():
    """Crawl all touna listing pages (JSON) and sync them with the ``loan`` table.

    A first request reads the total item count, from which the number of
    10-item pages is derived. Every page is then fetched with fresh
    timestamps. Entries already among the ids selected from the db get their
    ``schedule`` updated via ``Loan.db_update``; other entries are stored via
    ``Loan.db_create``. When at least one page exists, ids selected from the
    db that were not seen online are handed to ``Loan.db_offline``. Errors are
    logged with their traceback and swallowed.
    """
    company_id = 17
    # example request:
    #url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=1&size=10&subtime=1411910662511&_=1411910662512"
    # millisecond timestamps used for the subtime / _ query parameters
    s = int(time.time() * 1000)
    e = s + 1
    url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=%d&size=10&subtime=%d&_=%d"
    url_1 = url % (0, s, e)
    # NOTE(review): the key is spelled 'Referee'; the standard HTTP header is
    # 'Referer' -- confirm whether this is intentional before changing it.
    request_headers = {
        'Referee': "http://www.touna.cn/invest-list.html",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # ids returned by the db query above
    db_ids_set = set()
    # every id seen online during this run
    online_ids_set = set()
    # ids seen online that are not in the db yet
    new_ids_set = set()
    # ids seen online that are already in the db
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url_1, request_headers)
        htm_json = loads(htm, encoding="UTF-8")
        page_count = htm_json["result"]["pages"]["count"]
        # Ceil(page_count / 10) using integer floor division so the result is
        # an int usable by range() regardless of division semantics.
        page = page_count // 10
        if page_count % 10 > 0:
            page += 1
        if page > 0:
            for p in range(0, page):
                # recompute the timestamps for every page request
                s = int(time.time() * 1000)
                e = s + 1
                page_url = url % (p, s, e)
                loan_htm = download_page(page_url, request_headers)
                loans_json = loads(loan_htm, encoding="UTF-8")
                loans = loans_json["result"]["list"]
                for loan in loans:
                    original_id = str(loan["id"])
                    if original_id:
                        online_ids_set.add(original_id)
                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.touna.cn/invest-page.html?id=%d" % int(
                            original_id)
                        loan_obj.title = loan["name"]
                        loan_obj.borrow_amount = loan["account"]
                        loan_obj.rate = loan["apr"]
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.repayment = loan["style_name"]
                        period = str(loan["time_limit_name"].encode("utf-8"))
                        # The unit follows the number, so a match past
                        # position 0 selects the day unit; anything else is
                        # treated as months.
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(
                                loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(
                                loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

            # ids in the db that were not seen online this run are taken offline
            off_ids_set = db_ids_set - online_ids_set
            if off_ids_set:
                loan_obj = Loan(company_id)
                loan_obj.db_offline(db, off_ids_set)
                logger.info("company %s crawler loan: offline %s", company_id,
                            len(off_ids_set))

    except Exception:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #18
0
def crawl():
    """Crawl the xinhehui invest table and sync it with the ``loan`` table.

    Table rows whose last-cell link is "javascript:;" are skipped. Remaining
    rows are either updated (``schedule`` via ``Loan.db_update``) when their
    id is already among the ids selected from the db, or created via
    ``Loan.db_create`` otherwise. Ids selected from the db that were not seen
    on the page are handed to ``Loan.db_offline``. Errors are logged with
    their traceback and swallowed.
    """
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    # NOTE(review): the key is spelled 'Referee'; the standard HTTP header is
    # 'Referer' -- confirm whether this is intentional before changing it.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids returned by the db query above
    db_ids_set = set()
    # every id seen online during this run
    online_ids_set = set()
    # ids seen online that are not in the db yet
    new_ids_set = set()
    # ids seen online that are already in the db
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if loan.xpath("td[last()]/a/@href")[0].encode("utf-8") == "javascript:;":
                    # skip loans that have already finished
                    continue
                href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
                # the id is whatever follows the URL-encoded "id=" in the link
                original_id = href.split("id%3D")[1].encode("utf-8").strip()
                if original_id:
                    online_ids_set.add(original_id)

                # Progress is read the same way for known and new loans; a row
                # without the progress link is stored as "0".
                if loan.xpath("td[7]/div/a"):
                    schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                else:
                    schedule = "0"

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = schedule
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.xinhehui.com" + href
                    # The title is the link text plus the text of its <em>
                    # child, or of its <span> child when there is no <em>.
                    title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
                    if loan.xpath("td[1]/p[1]/a/em"):
                        title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
                    else:
                        title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.title = title_1 + title_2
                    borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "")
                    # Amounts carrying the "万" suffix are multiplied by 10000;
                    # otherwise the "元" suffix and thousands separators are dropped.
                    if borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
                    else:
                        loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", ""))

                    if loan.xpath("td[4]/span"):
                        period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
                    else:
                        period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
                    # The unit follows the number, so a match past position 0
                    # selects the day unit; anything else is treated as months.
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
                    loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                    loan_obj.schedule = schedule

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db that were not seen online this run are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #19
0
def crawl():
    """Crawl the iqianbang invest table and sync it with the ``loan`` table.

    Only rows whose seventh cell reads "融资中" are processed. Rows already
    among the ids selected from the db get their ``schedule`` updated via
    ``Loan.db_update``; other rows are filled in (including a second request
    to the loan's detail page for the repayment method) and stored via
    ``Loan.db_create``. Ids selected from the db that were not seen on the
    page are handed to ``Loan.db_offline``. Errors are logged with their
    traceback and swallowed.
    """
    company_id = 15
    url = "https://www.iqianbang.com/invest"
    # NOTE(review): the key is spelled 'Referee'; the standard HTTP header is
    # 'Referer' -- confirm whether this is intentional before changing it.
    request_headers = {"Referee": "https://www.iqianbang.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids returned by the db query above
    db_ids_set = set()
    # every id seen online during this run
    online_ids_set = set()
    # ids seen online that are not in the db yet
    new_ids_set = set()
    # ids seen online that are already in the db
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='item']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                # keep only rows whose status cell matches this exact text
                if str(loan.xpath("td[7]/text()")[0].encode("utf-8")).strip() != "融资中":
                    continue
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("-")[3].replace(".shtml", "")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.iqianbang.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = (
                        str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace(",", "").replace("元", "")
                    )
                    # Amounts carrying the "万" suffix are multiplied by 10000.
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = (
                        str(loan.xpath("td[2]/span/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    )
                    period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    # The unit follows the number, so a match past position 0
                    # selects the day unit; anything else is treated as months.
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")

                    # the repayment method is read from the loan's detail page
                    loan_info_htm = download_page(loan_obj.href, headers={"Referee": url, "User-Agent": DEFAULT_UA})
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="UTF-8")
                    loan_obj.repayment = str(
                        loan_info_htm_parse.xpath("//div[@class='inright']/table[@class='idetable']")[0]
                        .xpath("tr[2]/td[2]/span/text()")[0]
                        .encode("utf-8")
                    ).strip()

                    loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

        # ids in the db that were not seen online this run are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #20
0
def crawl():
    """Crawl the yirendai bid list and sync it with the ``loan`` table.

    Only list items that contain a ``bid_empty_errortip`` element are
    processed. Items already among the ids selected from the db get
    ``borrow_amount``, ``cast`` and the derived ``schedule`` updated via
    ``Loan.db_update``; other items are stored via ``Loan.db_create``. Ids
    selected from the db that were not seen on the page are handed to
    ``Loan.db_offline``. Errors are logged with their traceback and swallowed.
    """
    company_id = 13
    url = "http://www.yirendai.com/loan/list/1?period=12&currRate=&amt=&progress="
    # NOTE(review): the key is spelled 'Referee'; the standard HTTP header is
    # 'Referer' -- confirm whether this is intentional before changing it.
    request_headers = {
        'Referee': "http://www.yirendai.com/loan/list/1",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # ids returned by the db query above
    db_ids_set = set()
    # every id seen online during this run
    online_ids_set = set()
    # ids seen online that are not in the db yet
    new_ids_set = set()
    # ids seen online that are already in the db
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='bidList']/li")
        if len(loans) > 0:
            for loan in loans:
                # Items without this element are skipped.
                # NOTE(review): presumably its presence marks a bid that is
                # still open -- confirm against the page markup.
                if not loan.xpath(
                        "div[2]/div[2]/div/div[@class='bid_empty_errortip']"):
                    continue
                href = str(loan.xpath("div[2]/div/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    # the invested amount is the text after "已投", minus "元"
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    # progress in percent = invested / requested * 100
                    loan_obj.schedule = str(
                        float(loan_obj.cast) / float(loan_obj.borrow_amount) *
                        100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.yirendai.com" + href
                    loan_obj.title = str(
                        loan.xpath("div[2]/div/h3/a/text()")[0].encode(
                            "utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.rate = str(
                        loan.xpath("div[2]/div/div[3]/h4/span/text()")
                        [0].encode("utf-8")).strip()
                    loan_obj.period = str(
                        loan.xpath("div[2]/div/div[4]/h4/span/text()")
                        [0].encode("utf-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    # the invested amount is the text after "已投", minus "元"
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    # progress in percent = invested / requested * 100
                    loan_obj.schedule = str(
                        float(loan_obj.cast) / float(loan_obj.borrow_amount) *
                        100)

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db that were not seen online this run are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # Catch Exception rather than everything so Ctrl-C / SystemExit still
        # stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #21
0
def crawl():
    """Sync the loans listed on longlongweb.com (company 26) with the ``loan`` table.

    Ids found on the listing page that are not among the company's
    ``status=0`` rows are inserted through ``Loan.db_create``; ids that are
    already stored get their amount and progress refreshed through
    ``Loan.db_update``; stored ids that are no longer listed are handed to
    ``Loan.db_offline``.

    Returns:
        None. Any ``Exception`` raised while downloading or parsing is logged
        together with its traceback and not re-raised.
    """
    company_id = 26
    url = "http://www.longlongweb.com/invests"
    # NOTE(review): the key is spelled "Referee" while the standard HTTP header
    # is "Referer". Left untouched because download_page() is not visible from
    # here -- confirm how it consumes this dict before renaming.
    request_headers = {"Referee": "http://www.longlongweb.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids already stored for this company (status=0 rows)
    db_ids_set = set()
    # every id currently listed on the site
    online_ids_set = set()
    # ids seen for the first time in this run
    new_ids_set = set()
    # ids that were already stored and got refreshed
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    def clean_amount(node_text):
        # Drop thousands separators, the full-width yen sign and padding.
        return str(node_text.encode("utf-8")).replace(",", "").replace("¥", "").strip()

    def load_progress(loan_obj):
        # Download the loan's detail page, fill in ``cast`` and ``schedule``
        # (cast as a percentage of borrow_amount) and return the parsed page.
        # Requires ``href`` and ``borrow_amount`` to be set already.
        detail_htm = download_page(loan_obj.href, request_headers)
        detail = parse_html(detail_htm, encoding="utf-8")
        loan_obj.cast = clean_amount(detail.xpath("//div[@class='zrll']/span[1]/text()")[0])
        loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
        return detail

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="utf-8")
        loans = loan_htm_parse.xpath("//div[@class='main01']/span/dl")
        if loans:
            for loan in loans:
                # Only entries whose first <p> contains a link are processed.
                if not lxml.html.tostring(loan.xpath("dd/p[1]")[0]).find("href") > 0:
                    continue

                href = str(loan.xpath("dd/table/tr[1]/td[1]/a/@href")[0])
                original_id = href.split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj.borrow_amount = clean_amount(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0])
                    loan_obj.href = "http://www.longlongweb.com" + href
                    load_progress(loan_obj)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_obj.title = str(loan.xpath("dd/table/tr[1]/td[1]/a/@title")[0].encode("utf-8"))
                    loan_obj.rate = str(loan.xpath("dd/table/tr[2]/td[2]/span/text()")[0].encode("utf-8")).replace(
                        "%", ""
                    )
                    loan_obj.borrow_amount = clean_amount(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0])
                    loan_obj.period = str(loan.xpath("dd/table/tr[2]/td[3]/span/text()")[0])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    detail = load_progress(loan_obj)
                    repayment_text = detail.xpath("//div[@class='enterprise-botton']/span[2]/text()")[0]
                    loan_obj.repayment = str(repayment_text.encode("utf-8")).strip().replace("还款方式:", "")

                    loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

        # stored ids minus the ids just crawled = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #22
0
def crawl():
    """Sync the loans listed on xiaomabank.com (company 20) with the ``loan`` table.

    Ids found on the listing page that are not among the company's
    ``status=0`` rows are inserted through ``Loan.db_create`` (including the
    repayment text read from the loan's detail page); ids that are already
    stored get their progress refreshed through ``Loan.db_update``; stored
    ids that are no longer listed are handed to ``Loan.db_offline``.

    Returns:
        None. Any ``Exception`` raised while downloading or parsing is logged
        together with its traceback and not re-raised.
    """
    company_id = 20
    url = "http://www.xiaomabank.com/finance.do"
    # NOTE(review): the key is spelled "Referee" while the standard HTTP header
    # is "Referer". Left untouched because download_page() is not visible from
    # here -- confirm how it consumes this dict before renaming.
    request_headers = {"Referee": "http://www.xiaomabank.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids already stored for this company (status=0 rows)
    db_ids_set = set()
    # every id currently listed on the site
    online_ids_set = set()
    # ids seen for the first time in this run
    new_ids_set = set()
    # ids that were already stored and got refreshed
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    def read_schedule(loan):
        # Progress is taken from the bar's inline style: "width: NN%;" -> "NN".
        style = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))
        return style.replace("width:", "").strip().replace("%;", "")

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='pil_main']/table[@class='pil_table']/tbody/tr")
        if loans:
            for loan in loans:
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj.schedule = read_schedule(loan)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj.href = "http://www.xiaomabank.com/" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = (
                        str(loan.xpath("td[5]/strong/text()")[0].encode("utf-8")).strip().replace(",", "")
                    )
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).replace("个月", "").strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = read_schedule(loan)

                    # The detail page is served gzip-compressed and has to be
                    # inflated by hand; a plain response is parsed as-is so the
                    # repayment text is not silently dropped.
                    resp = urllib2.urlopen(loan_obj.href)
                    try:
                        resp_info = resp.info()
                        resp_html = resp.read()
                    finally:
                        # Always release the connection, even if read() fails.
                        resp.close()
                    if ("Content-Encoding" in resp_info) and (resp_info["Content-Encoding"] == "gzip"):
                        # 16 + MAX_WBITS tells zlib to expect a gzip header.
                        resp_html = zlib.decompress(resp_html, 16 + zlib.MAX_WBITS)
                    info_htm_parse = parse_html(resp_html, encoding="utf-8")
                    loan_obj.repayment = str(
                        info_htm_parse.xpath("//div[@id='pi_lt_bottom']/div[1]/div[1]/a/text()")[0].encode("utf-8")
                    )

                    loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

        # stored ids minus the ids just crawled = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #23
0
def crawl():
    """Sync the loans listed on id68.cn (company 21) with the ``loan`` table.

    Listing entries whose progress reads "100.00%" are ignored. Remaining ids
    that are not among the company's ``status=0`` rows are inserted through
    ``Loan.db_create``; ids that are already stored get their progress
    refreshed through ``Loan.db_update``; stored ids that are no longer
    listed are handed to ``Loan.db_offline``.

    Returns:
        None. Any ``Exception`` raised while downloading or parsing is logged
        together with its traceback and not re-raised.
    """
    company_id = 21
    url = "http://www.id68.cn/invest/index/borrow_status/9/p/1.html"
    # NOTE(review): the key is spelled "Referee" while the standard HTTP header
    # is "Referer". Left untouched because download_page() is not visible from
    # here -- confirm how it consumes this dict before renaming.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids already stored for this company (status=0 rows)
    db_ids_set = set()
    # every id currently listed on the site
    online_ids_set = set()
    # ids seen for the first time in this run
    new_ids_set = set()
    # ids that were already stored and got refreshed
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//ul[@class='ideal_con']/li")
        for loan in loans:
            progress_text = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0])
            if progress_text == "100.00%":
                # skip loans that are already fully funded
                continue
            href = str(loan.xpath("table/tr[1]/td[1]/a/@href")[0].encode("utf-8"))
            original_id = href.replace(".html", "").split("/")[2]
            if original_id:
                online_ids_set.add(original_id)

            loan_obj = Loan(company_id, original_id)
            if original_id in db_ids_set:
                update_ids_set.add(original_id)

                loan_obj.schedule = progress_text.strip().replace("%", "")
                loan_obj.db_update(db)
            else:
                new_ids_set.add(original_id)

                loan_obj.href = REFEREE + href
                loan_obj.title = str(loan.xpath("table/tr[1]/td[1]/a/@title")[0].encode("utf-8")).strip()
                loan_obj.borrow_amount = str(loan.xpath("table/tr[2]/td[last()]/span/text()")[0].encode("utf-8"))\
                    .strip().replace(" ", "").replace(",", "")
                loan_obj.repayment = str(loan.xpath("table/tr[2]/td[4]/span/text()")[0].encode("utf-8")).strip()
                loan_obj.rate = str(loan.xpath("table/tr[2]/td[2]/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                loan_obj.period = str(loan.xpath("table/tr[2]/td[3]/span/text()")[0].encode("utf-8")).strip()\
                    .replace(" ", "").replace("个月", "")
                # NOTE(review): the period text has a month suffix stripped
                # above, yet the unit is recorded as PERIOD_UNIT_DAY -- this
                # looks like it should be PERIOD_UNIT_MONTH; confirm against
                # the site before changing stored data.
                loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                loan_obj.schedule = progress_text.strip().replace("%", "")

                loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus the ids just crawled = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #24
0
def crawl():
    """Sync the loans listed on licaifan.com (company 14) with the ``loan`` table.

    Table rows whose last-column link text marks them as fully invested are
    ignored. Remaining ids that are not among the company's ``status=0`` rows
    are inserted through ``Loan.db_create``; ids that are already stored get
    their progress refreshed through ``Loan.db_update``; stored ids that are
    no longer listed are handed to ``Loan.db_offline``.

    Returns:
        None. Any ``Exception`` raised while downloading or parsing is logged
        together with its traceback and not re-raised.
    """
    company_id = 14
    url = "http://www.licaifan.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # ids already stored for this company (status=0 rows)
    db_ids_set = set()
    # every id currently listed on the site
    online_ids_set = set()
    # ids seen for the first time in this run
    new_ids_set = set()
    # ids that were already stored and got refreshed
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    def read_schedule(tr):
        # Progress percentage with the trailing "%" removed.
        return str(tr.xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip()\
            .replace("%", "")

    def expand_amount(amount):
        # Amounts carrying the ten-thousand suffix are expanded to an integer.
        # float() is used so that fractional values (e.g. "1.5" + suffix) no
        # longer raise ValueError and abort the whole crawl.
        if amount.find("万") > 0:
            return int(round(float(amount.replace("万", "")) * 10000))
        return amount

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath(
            "//ul[@class='main-list tab-con2']/li[1]/table/tr")
        # The first row is the table header, so only the rows after it are loans.
        for tr in loans[1:]:
            if str(tr.xpath("td[last()]/a/text()")[0].encode(
                    "utf-8")) == "投资满额":
                continue
            href = str(tr.xpath("td[1]/h3/a/@href")[0])
            original_id = href.split("/")[3]
            if original_id:
                online_ids_set.add(original_id)

            loan_obj = Loan(company_id, original_id)
            if original_id in db_ids_set:
                update_ids_set.add(original_id)

                loan_obj.schedule = read_schedule(tr)
                loan_obj.db_update(db)
            else:
                new_ids_set.add(original_id)

                loan_obj.href = "http://www.licaifan.com" + href
                loan_obj.title = str(tr.xpath("td[1]/h3/a/text()")
                                     [0].encode("utf-8")).strip()
                loan_obj.borrow_amount = expand_amount(
                    str(tr.xpath("td[3]/text()")[0].encode("utf-8"))
                    .strip().replace(",", ""))
                loan_obj.rate = str(tr.xpath("td[2]/text()")
                                    [0].encode("utf-8")).strip().replace(
                                        "%", "")
                period = str(tr.xpath("td[4]/text()")[0].encode(
                    "utf-8")).strip()
                # The unit is decided by whether the day marker appears in the
                # period text; anything else is treated as months.
                if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                    loan_obj.period = period.replace(
                        loan_obj.PERIOD_UNIT_DAY, "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                else:
                    loan_obj.period = period.replace("个", "").replace(
                        loan_obj.PERIOD_UNIT_MONTH, "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                loan_obj.schedule = read_schedule(tr)

                loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus the ids just crawled = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #25
0
def crawl():
    """Sync the loans listed on longlongweb.com (company 26) with the ``loan`` table.

    Ids found on the listing page that are not among the company's
    ``status=0`` rows are inserted through ``Loan.db_create``; ids that are
    already stored get their amount and progress refreshed through
    ``Loan.db_update``; stored ids that are no longer listed are handed to
    ``Loan.db_offline``.

    Returns:
        None. Any ``Exception`` raised while downloading or parsing is logged
        together with its traceback and not re-raised.
    """
    company_id = 26
    url = "http://www.longlongweb.com/invests"
    # NOTE(review): the key is spelled "Referee" while the standard HTTP header
    # is "Referer". Left untouched because download_page() is not visible from
    # here -- confirm how it consumes this dict before renaming.
    request_headers = {'Referee': "http://www.longlongweb.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids already stored for this company (status=0 rows)
    db_ids_set = set()
    # every id currently listed on the site
    online_ids_set = set()
    # ids seen for the first time in this run
    new_ids_set = set()
    # ids that were already stored and got refreshed
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    def clean_amount(node_text):
        # Drop thousands separators, the full-width yen sign and padding.
        return str(node_text.encode("utf-8")).replace(",", "").replace("¥", "").strip()

    def load_progress(loan_obj):
        # Download the loan's detail page, fill in ``cast`` and ``schedule``
        # (cast as a percentage of borrow_amount) and return the parsed page.
        # Requires ``href`` and ``borrow_amount`` to be set already.
        detail_htm = download_page(loan_obj.href, request_headers)
        detail = parse_html(detail_htm, encoding="utf-8")
        loan_obj.cast = clean_amount(detail.xpath("//div[@class='zrll']/span[1]/text()")[0])
        loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
        return detail

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="utf-8")
        loans = loan_htm_parse.xpath("//div[@class='main01']/span/dl")
        if loans:
            for loan in loans:
                # Only entries whose first <p> contains a link are processed.
                if not lxml.html.tostring(loan.xpath("dd/p[1]")[0]).find("href") > 0:
                    continue

                href = str(loan.xpath("dd/table/tr[1]/td[1]/a/@href")[0])
                original_id = href.split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj.borrow_amount = clean_amount(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0])
                    loan_obj.href = "http://www.longlongweb.com" + href
                    load_progress(loan_obj)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_obj.title = str(loan.xpath("dd/table/tr[1]/td[1]/a/@title")[0].encode("utf-8"))
                    loan_obj.rate = str(loan.xpath("dd/table/tr[2]/td[2]/span/text()")[0].encode("utf-8"))\
                        .replace("%", "")
                    loan_obj.borrow_amount = clean_amount(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0])
                    loan_obj.period = str(loan.xpath("dd/table/tr[2]/td[3]/span/text()")[0])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    detail = load_progress(loan_obj)
                    repayment_text = detail.xpath("//div[@class='enterprise-botton']/span[2]/text()")[0]
                    loan_obj.repayment = str(repayment_text.encode("utf-8")).strip().replace("还款方式:", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus the ids just crawled = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #26
0
def crawl():
    """Sync the loans listed on xiaomabank.com (company 20) with the ``loan`` table.

    Ids found on the listing page that are not among the company's
    ``status=0`` rows are inserted through ``Loan.db_create`` (including the
    repayment text read from the loan's detail page); ids that are already
    stored get their progress refreshed through ``Loan.db_update``; stored
    ids that are no longer listed are handed to ``Loan.db_offline``.

    Returns:
        None. Any ``Exception`` raised while downloading or parsing is logged
        together with its traceback and not re-raised.
    """
    company_id = 20
    url = "http://www.xiaomabank.com/finance.do"
    # NOTE(review): the key is spelled "Referee" while the standard HTTP header
    # is "Referer". Left untouched because download_page() is not visible from
    # here -- confirm how it consumes this dict before renaming.
    request_headers = {'Referee': "http://www.xiaomabank.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids already stored for this company (status=0 rows)
    db_ids_set = set()
    # every id currently listed on the site
    online_ids_set = set()
    # ids seen for the first time in this run
    new_ids_set = set()
    # ids that were already stored and got refreshed
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    def read_schedule(loan):
        # Progress is taken from the bar's inline style: "width: NN%;" -> "NN".
        style = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))
        return style.replace("width:", "").strip().replace("%;", "")

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='pil_main']/table[@class='pil_table']/tbody/tr")
        if loans:
            for loan in loans:
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj.schedule = read_schedule(loan)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj.href = "http://www.xiaomabank.com/" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[5]/strong/text()")[0].encode("utf-8")).strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).replace("个月", "").strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = read_schedule(loan)

                    # The detail page is served gzip-compressed and has to be
                    # inflated by hand; a plain response is parsed as-is so the
                    # repayment text is not silently dropped.
                    resp = urllib2.urlopen(loan_obj.href)
                    try:
                        resp_info = resp.info()
                        resp_html = resp.read()
                    finally:
                        # Always release the connection, even if read() fails.
                        resp.close()
                    if ("Content-Encoding" in resp_info) and (resp_info['Content-Encoding'] == "gzip"):
                        # 16 + MAX_WBITS tells zlib to expect a gzip header.
                        resp_html = zlib.decompress(resp_html, 16 + zlib.MAX_WBITS)
                    info_htm_parse = parse_html(resp_html, encoding="utf-8")
                    loan_obj.repayment = str(info_htm_parse.xpath("//div[@id='pi_lt_bottom']/div[1]/div[1]/a/text()")[0].encode("utf-8"))

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus the ids just crawled = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #27
0
def crawl():
    """Sync the loans listed on iqianbang.com (company 15) with the ``loan`` table.

    Only table rows whose seventh column carries the "still funding" status
    text are considered. Ids that are not among the company's ``status=0``
    rows are inserted through ``Loan.db_create`` (the repayment text comes
    from the loan's detail page); ids that are already stored get their
    progress refreshed through ``Loan.db_update``; stored ids that are no
    longer listed are handed to ``Loan.db_offline``.

    Returns:
        None. Any ``Exception`` raised while downloading or parsing is logged
        together with its traceback and not re-raised.
    """
    company_id = 15
    url = "https://www.iqianbang.com/invest"
    # NOTE(review): the key is spelled "Referee" while the standard HTTP header
    # is "Referer". Left untouched because download_page() is not visible from
    # here -- confirm how it consumes this dict before renaming.
    request_headers = {
        'Referee': "https://www.iqianbang.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # ids already stored for this company (status=0 rows)
    db_ids_set = set()
    # every id currently listed on the site
    online_ids_set = set()
    # ids seen for the first time in this run
    new_ids_set = set()
    # ids that were already stored and got refreshed
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    def read_schedule(loan):
        # Progress percentage with the trailing "%" removed.
        return str(
            loan.xpath("td[6]/text()")[0].encode(
                "utf-8")).strip().replace("%", "")

    def expand_amount(amount):
        # Amounts carrying the ten-thousand suffix are expanded to an integer.
        # float() is used so that fractional values (e.g. "1.5" + suffix) no
        # longer raise ValueError and abort the whole crawl.
        if amount.find("万") > 0:
            return int(round(float(amount.replace("万", "")) * 10000))
        return amount

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='item']/table/tbody/tr")
        if loans:
            for loan in loans:
                if str(loan.xpath("td[7]/text()")[0].encode(
                        "utf-8")).strip() != "融资中":
                    continue
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("-")[3].replace(".shtml", "")
                if original_id:
                    online_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj.schedule = read_schedule(loan)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj.href = "https://www.iqianbang.com" + href
                    loan_obj.title = str(
                        loan.xpath("td[1]/a/text()")[0].encode(
                            "utf-8")).strip()
                    loan_obj.borrow_amount = expand_amount(
                        str(loan.xpath("td[3]/text()")[0].encode("utf-8"))
                        .strip().replace(",", "").replace("元", ""))
                    loan_obj.rate = str(
                        loan.xpath("td[2]/span/span/text()")[0].encode(
                            "utf-8")).strip().replace("%", "")
                    period = str(
                        loan.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    # The unit is decided by whether the day marker appears in
                    # the period text; anything else is treated as months.
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = read_schedule(loan)

                    # The repayment text is only available on the detail page.
                    loan_info_htm = download_page(loan_obj.href,
                                                  headers={
                                                      'Referee': url,
                                                      'User-Agent': DEFAULT_UA
                                                  })
                    loan_info_htm_parse = parse_html(loan_info_htm,
                                                     encoding="UTF-8")
                    detail_table = loan_info_htm_parse.xpath(
                        "//div[@class='inright']/table[@class='idetable']")[0]
                    loan_obj.repayment = str(
                        detail_table.xpath("tr[2]/td[2]/span/text()")[0].encode(
                            "utf-8")).strip()

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus the ids just crawled = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #28
0
def crawl():
    """Sync the loans returned by tuandai.com's JSON list endpoint (company 25).

    The endpoint is paged; the ``total`` field of the first page determines
    how many pages are read. Ids that are not among the company's
    ``status=0`` rows are inserted through ``Loan.db_create``; ids that are
    already stored get their progress refreshed through ``Loan.db_update``;
    stored ids that are no longer returned are handed to ``Loan.db_offline``.

    Returns:
        None. Any ``Exception`` raised while downloading or parsing is logged
        together with its traceback and not re-raised.
    """
    company_id = 25
    url = "http://www.tuandai.com/pages/ajax/invest_list.ashx?Cmd=GetInvest_List" \
          "&RepaymentTypeId=0&pagesize=5&pageindex=%s&type=3&status=1&DeadLine=0" \
          "&beginDeadLine=0&endDeadLine=0&rate=0&beginRate=0&endRate=0&strkey=&orderby=0"
    # NOTE(review): the key is spelled "Referee" while the standard HTTP header
    # is "Referer". Left untouched because download_page() is not visible from
    # here -- confirm how it consumes this dict before renaming.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}
    # Must stay in step with the "pagesize" query parameter in ``url``.
    page_size = 5

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # ids already stored for this company (status=0 rows)
    db_ids_set = set()
    # every id currently listed on the site
    online_ids_set = set()
    # ids seen for the first time in this run
    new_ids_set = set()
    # ids that were already stored and got refreshed
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page((url % 1), request_headers)
        first_page = loads(htm, encoding="utf-8")
        total = int(first_page["total"])
        if total > 0:
            # Ceiling division; ``//`` keeps the result an int under true
            # division as well, so it can be fed to range().
            page_count = (total + page_size - 1) // page_size
            for page_no in range(1, page_count + 1):
                if page_no == 1:
                    # Already downloaded above to learn ``total``.
                    htm_obj = first_page
                else:
                    htm = download_page((url % page_no), request_headers)
                    htm_obj = loads(htm, encoding="utf-8")
                for loan in htm_obj["projectList"]:
                    original_id = loan["ID"]
                    href = "http://www.tuandai.com/pages/invest/jing_detail.aspx?id=" + original_id
                    if original_id:
                        online_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj.href = href
                        loan_obj.title = loan["Title"]
                        loan_obj.borrow_amount = loan["TotalAmount"]
                        loan_obj.rate = loan["YearRate"]
                        loan_obj.period = loan["Deadline"]
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.repayment = loan["RepaymentTypeDesc"]

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus the ids just crawled = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still stop the crawler.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #29
0
def crawl():
    """Crawl the yinhu.com loan list and sync it with the ``loan`` table.

    Ids already stored for this company (``status=0``) are refreshed via
    ``Loan.db_update``, unseen ids are inserted via ``Loan.db_create`` and
    stored ids that are in repayment, fully funded or no longer listed are
    passed to ``Loan.db_offline``.

    Exceptions raised while downloading or parsing are logged together with
    their traceback instead of being propagated.
    """
    company_id = 4
    url = "https://www.yinhu.com/loan/loan_list.bl"
    # "Referer" is the standard HTTP header name; the key used to be
    # misspelled "Referee".
    request_headers = {'Referer': "https://www.yinhu.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id that is still online on the site
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    # offline
    off_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    schedule_xpath = "td[6]/div[@class='bar_bg']/div/span/span/text()"

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@id='loan_list']/table/tbody/tr")
        if loans:
            for loan in loans:
                href = str(loan.xpath("td[1]/p/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")
                # The status label is looked up under <em> first and under <a>
                # when the first lookup yields nothing.
                status_nodes = (loan.xpath("td[last()]/em/span/text()")
                                or loan.xpath("td[last()]/a/span/text()"))
                loan_status = str(status_nodes[0].encode("utf-8")).strip()

                if original_id and loan_status != "还款中":
                    online_ids_set.add(original_id)

                # Loans in repayment or fully funded are neither created nor
                # updated; if they are stored they are queued for offlining.
                if loan_status == "还款中" or loan_status == "满标":
                    if original_id in db_ids_set:
                        off_ids_set.add(original_id)
                    continue

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath(schedule_xpath)[0].encode("utf-8"))\
                        .strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.yinhu.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/p/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip().replace(",", "")\
                        .replace("元", "")

                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip()
                    period = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.schedule = str(loan.xpath(schedule_xpath)[0].encode("utf-8"))\
                        .strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # Stored ids that were not seen online must go offline.  They are merged
        # into the ids collected in the loop rather than overwriting them, so
        # the fully funded ("满标") loans queued above are offlined as well.
        off_ids_set |= db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #30
0
def crawl():
    """Crawl the paginated tuandai.com invest-list JSON endpoint.

    Each entry of ``projectList`` is either refreshed (``Loan.db_update``)
    when its id is already stored for this company with ``status=0``, or
    inserted (``Loan.db_create``) otherwise.  Stored ids that no longer show
    up in any page are passed to ``Loan.db_offline``.

    Exceptions raised while downloading or parsing are logged together with
    their traceback instead of being propagated.
    """
    company_id = 25
    # Must stay in sync with the pagesize=5 query parameter in the url below.
    page_size = 5
    url = "http://www.tuandai.com/pages/ajax/invest_list.ashx?Cmd=GetInvest_List" \
          "&RepaymentTypeId=0&pagesize=5&pageindex=%s&type=3&status=1&DeadLine=0" \
          "&beginDeadLine=0&endDeadLine=0&rate=0&beginRate=0&endRate=0&strkey=&orderby=0"
    # "Referer" is the standard HTTP header name; the key used to be
    # misspelled "Referee".
    request_headers = {'Referer': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id that is online on the site
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        first_page = loads(download_page((url % 1), request_headers), encoding="utf-8")
        total = int(first_page["total"])
        if total > 0:
            # Ceiling division; "//" keeps the result an int even when "/"
            # means true division.
            page = (total + page_size - 1) // page_size
            for p in range(1, page + 1):
                if p == 1:
                    # Page 1 was already fetched to read "total"; reuse it.
                    htm_obj = first_page
                else:
                    htm = download_page((url % p), request_headers)
                    htm_obj = loads(htm, encoding="utf-8")
                loans = htm_obj["projectList"]
                for loan in loans:
                    original_id = loan["ID"]
                    href = "http://www.tuandai.com/pages/invest/jing_detail.aspx?id=" + original_id
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = href
                        loan_obj.title = loan["Title"]
                        loan_obj.borrow_amount = loan["TotalAmount"]
                        loan_obj.rate = loan["YearRate"]
                        loan_obj.period = loan["Deadline"]
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.repayment = loan["RepaymentTypeDesc"]

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = ids that must go offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #31
0
def crawl():
    """Crawl the first page of the qian360.com product-list JSON endpoint.

    Only entries whose ``status`` equals 1 are processed.  An entry is
    refreshed (``Loan.db_update``) when its ``borrowId`` is already stored for
    this company with ``status=0`` and inserted (``Loan.db_create``)
    otherwise.  Stored ids missing from the response are passed to
    ``Loan.db_offline``.

    Exceptions raised while downloading or parsing are logged together with
    their traceback instead of being propagated.
    """
    company_id = 19
    url = "https://www.qian360.com/bq/queryProductList.html?currentPage=1&pernum=12&type=0"
    # "Referer" is the standard HTTP header name; the key used to be
    # misspelled "Referee".
    request_headers = {
        'Referer': "https://www.qian360.com/tl/select.html",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id that is online on the site
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")

        # "or ()" keeps the loop a no-op when "list" is empty or null.
        for item in loans_json["list"] or ():
            if int(item["status"]) != 1:
                continue
            original_id = str(item["borrowId"])
            online_ids_set.add(original_id)

            if original_id in db_ids_set:
                update_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                loan_obj.schedule = str(item["percent"])
                loan_obj.cast = str(int(item["accountYes"]))
                loan_obj.db_update(db)
            else:
                new_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                loan_obj.href = "https://www.qian360.com/ti/detail.html?borrowId=%s" % original_id
                loan_obj.title = item["name"]
                loan_obj.rate = str(item["apr"])
                loan_obj.period = str(item["totalPeriod"])
                loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                loan_obj.borrow_amount = str(int(item["account"]))
                loan_obj.schedule = str(item["percent"])
                loan_obj.cast = str(int(item["accountYes"]))
                loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = ids that must go offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #32
0
def crawl():
    """Crawl the yirendai.com loan list page and sync it with the ``loan`` table.

    For every processed list item the borrowed amount and the amount already
    invested are scraped and the progress is derived as ``cast / amount * 100``.
    Ids already stored for this company (``status=0``) are refreshed via
    ``Loan.db_update``; unseen ids are inserted via ``Loan.db_create``; stored
    ids not seen on the page are passed to ``Loan.db_offline``.

    Exceptions raised while downloading or parsing are logged together with
    their traceback instead of being propagated.
    """
    company_id = 13
    url = "http://www.yirendai.com/loan/list/1?period=12&currRate=&amt=&progress="
    # "Referer" is the standard HTTP header name; the key used to be
    # misspelled "Referee".
    request_headers = {'Referer': "http://www.yirendai.com/loan/list/1", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id that is online on the site
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='bidList']/li")
        if loans:
            for loan in loans:
                # Items without the "bid_empty_errortip" element are skipped
                # (presumably only loans still open for bidding render it -
                # TODO confirm against the live page).
                if not loan.xpath("div[2]/div[2]/div/div[@class='bid_empty_errortip']"):
                    continue
                href = str(loan.xpath("div[2]/div/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                # Amount, invested sum and progress are needed for both the
                # update and the create path, so parse them once.
                loan_obj = Loan(company_id, original_id)
                loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                    .strip().replace(",", "")
                loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                    .replace("元", "").split("已投")[1]
                loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj.href = "http://www.yirendai.com" + href
                    loan_obj.title = str(loan.xpath("div[2]/div/h3/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.rate = str(loan.xpath("div[2]/div/div[3]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period = str(loan.xpath("div[2]/div/div[4]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = ids that must go offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #33
0
def crawl():
    """Crawl the loan table on the licaifan.com home page.

    The first table row is the header and is skipped, as are rows whose last
    cell reads "投资满额" (fully invested).  Ids already stored for this
    company (``status=0``) are refreshed via ``Loan.db_update``; unseen ids
    are inserted via ``Loan.db_create``; stored ids not seen on the page are
    passed to ``Loan.db_offline``.

    Exceptions raised while downloading or parsing are logged together with
    their traceback instead of being propagated.
    """
    company_id = 14
    url = "http://www.licaifan.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id that is online on the site
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    schedule_xpath = "td[5]/span/span[2]/text()"

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='main-list tab-con2']/li[1]/table/tr")
        # The first row is the table header, so start from the second one.
        for loan in loans[1:]:
            if str(loan.xpath("td[last()]/a/text()")[0].encode("utf-8")) == "投资满额":
                continue
            href = str(loan.xpath("td[1]/h3/a/@href")[0])
            original_id = href.split("/")[3]
            if original_id:
                online_ids_set.add(original_id)

            if original_id in db_ids_set:
                update_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                loan_obj.schedule = str(loan.xpath(schedule_xpath)[0].encode("utf-8")).strip()\
                    .replace("%", "")
                loan_obj.db_update(db)
            else:
                new_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                loan_obj.href = "http://www.licaifan.com" + href
                loan_obj.title = str(loan.xpath("td[1]/h3/a/text()")[0].encode("utf-8")).strip()
                loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("utf-8"))\
                    .strip().replace(",", "")
                if loan_obj.borrow_amount.find("万") > 0:
                    # Amounts given in units of 10,000 ("万") may be fractional
                    # (e.g. "1.5万"); go through float so those no longer raise
                    # ValueError, and round to absorb float noise.
                    loan_obj.borrow_amount = int(round(float(loan_obj.borrow_amount.replace("万", "")) * 10000))
                loan_obj.rate = str(loan.xpath("td[2]/text()")[0].encode("utf-8")).strip().replace("%", "")
                period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                    loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                else:
                    loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                loan_obj.schedule = str(loan.xpath(schedule_xpath)[0].encode("utf-8")).strip()\
                    .replace("%", "")

                loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = ids that must go offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #34
0
def crawl():
    """Crawl every page of the ppdai.com lend list and sync the ``loan`` table.

    The page count is read from the pager on the first page; each page's
    table rows are then processed.  Ids already stored for this company
    (``status=0``) are refreshed via ``Loan.db_update``; unseen ids are
    inserted via ``Loan.db_create``; stored ids not seen on any page are
    passed to ``Loan.db_offline``.

    Exceptions raised while downloading or parsing are logged together with
    their traceback instead of being propagated.
    """
    company_id = 2
    url = "http://www.ppdai.com/lend/12_s1_p1"
    # "Referer" is the standard HTTP header name; the key used to be
    # misspelled "Referee".
    request_headers = {'Referer': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id that is online on the site
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='fen_ye_nav']/table/tr/td[last()]/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")

        for p in range(1, int(page) + 1):
            # url is rebound per page so the error log names the failing page.
            url = "http://www.ppdai.com/lend/12_s1_p" + str(p)
            logger.info("page url: %s", url)

            if p == 1:
                # Page 1 was already fetched to read the page count; reuse it.
                page_obj = htm_obj
            else:
                page_obj = parse_html(download_page(url, request_headers))
            loans = page_obj.xpath("//div[@class='lend_nav']/table/tr")
            for loan in loans:
                # Rows whose markup contains "tit_nav" are skipped (presumably
                # the table's title row - TODO confirm).
                if lxml.html.tostring(loan).find("tit_nav") > 0:
                    continue
                href = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@href")[0])
                original_id = href.split("/")[2].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                # Progress text: the part after "完成" with spaces and "%" removed.
                schedule_text = str(loan.xpath("td[last()]/p[1]/text()")[0].encode("UTF-8"))\
                    .strip().replace(" ", "").replace("%", "").split("完成")[1]

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = schedule_text
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id)
                    loan_obj.original_id = original_id
                    loan_obj.href = "http://www.ppdai.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@title")[0].encode("UTF-8"))
                    loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                        .replace(",", "")
                    loan_obj.rate = str(loan.xpath("td[4]/text()")[0]).strip().replace("%", "")
                    period = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    # New rows store the progress as a float, updates as text
                    # (kept as in the original code).
                    loan_obj.schedule = float(schedule_text)

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = ids that must go offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #35
0
def crawl():
    """Crawl the project list on the tzydb.com home page.

    Items whose progress reads "100%" or "100.0%" are skipped.  The id is
    extracted from the item link with ``ID_RE`` and stored with "," replaced
    by "-".  Ids already stored for this company (``status=0``) are refreshed
    via ``Loan.db_update``; unseen ids are inserted via ``Loan.db_create``;
    stored ids not seen on the page are passed to ``Loan.db_offline``.

    Exceptions raised while downloading or parsing are logged together with
    their traceback instead of being propagated.
    """
    company_id = 11
    url = "https://www.tzydb.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id that is online on the site
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//div[@id='proList']/ul[@class='item_li']")
        for loan in loans:
            schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip()
            if schedule == "100%" or schedule == "100.0%":
                # skip projects that are already finished
                continue
            # The progress is parsed once above; only the "%" sign is dropped
            # for storage.
            schedule_value = schedule.replace("%", "")

            # link = https://www.tzydb.com/boot/lookup/971,1017
            a_script = str(loan.xpath("li/div[1]/div[1]/div/a/@href")[0].encode("utf-8"))
            o_id = ID_RE.findall(a_script)[0]
            original_id = o_id.replace(",", "-")
            if original_id:
                online_ids_set.add(original_id)

            if original_id in db_ids_set:
                update_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                loan_obj.schedule = schedule_value
                loan_obj.db_update(db)
            else:
                new_ids_set.add(original_id)
                loan_obj = Loan(company_id, original_id)
                loan_obj.href = "https://www.tzydb.com/boot/lookup/" + o_id
                loan_obj.title = str(loan.xpath("li/div[1]/div[1]/div/a/text()")[0].encode("utf-8"))
                loan_obj.borrow_amount = str(loan.xpath("li/div[2]/div[1]/span/text()")[0].encode("utf-8")).strip()\
                    .replace(" ", "").replace(",", "")
                loan_obj.period = str(loan.xpath("li/div[2]/div[3]/span/text()")[0].encode("UTF-8")).strip()
                loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                loan_obj.rate = str(loan.xpath("li/div[2]/div[2]/span/text()")[0]).strip().replace("%", "")
                loan_obj.schedule = schedule_value

                loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = ids that must go offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #36
0
def crawl():
    """Crawl every page of the my089.com loan list and sync the ``loan`` table.

    The page count is read from the pager on the first page.  Entries whose
    progress reads "100%" are skipped.  Ids already stored for this company
    (``status=0``) are refreshed via ``Loan.db_update``; unseen ids are
    inserted via ``Loan.db_create``; stored ids not seen on any page are
    passed to ``Loan.db_offline``.

    Exceptions raised while downloading or parsing are logged together with
    their traceback instead of being propagated.
    """
    company_id = 18
    url = "https://www.my089.com/Loan/default.aspx?pid=1"
    # "Referer" is the standard HTTP header name (the key used to be
    # misspelled "Referee"), and its value previously pointed at ppdai.com,
    # which looks like a copy-paste from another crawler.
    request_headers = {
        'Referer': "https://www.my089.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id that is online on the site
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='yema rt']/span[@class='z_page']/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")
        for p in range(1, int(page) + 1):
            # url is rebound per page so the error log names the failing page.
            url = "https://www.my089.com/Loan/default.aspx?pid=" + str(p)
            logger.info("page url: %s", url)

            if p == 1:
                # Page 1 was already fetched to read the page count; reuse it.
                page_obj = htm_obj
            else:
                page_obj = parse_html(download_page(url, request_headers))
            loans = page_obj.xpath(
                "//div[@class='Loan_box']/dl[@class='LoanList']")
            for loan in loans:
                progress = loan.xpath("dd[last()]/p/span/text()")[0]
                if str(progress) == "100%":
                    continue
                href = str(
                    loan.xpath("dd[2]/div[@class='txt_tou']/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                schedule = str(progress.encode("UTF-8")).strip().replace("%", "")

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = schedule
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.my089.com/Loan/" + href
                    loan_obj.title = str(
                        loan.xpath("dd[2]/div[@class='txt_tou']/a/@title")
                        [0].encode("UTF-8"))
                    loan_obj.borrow_amount = str(loan.xpath("dd[4]/span/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                        .replace(",", "")
                    loan_obj.rate = str(
                        loan.xpath("dd[3]/span/text()")[0].encode(
                            "UTF-8")).strip().replace("%/年", "")
                    loan_obj.period = str(
                        loan.xpath("dd[5]/span/text()")[0].encode(
                            "UTF-8")).strip().replace(" ", "")
                    # The text of dd[5] is split on "/": the first part is
                    # stored as the period unit, the second as the repayment.
                    s = str(loan.xpath("dd[5]/text()")[0].encode(
                        "UTF-8")).strip().replace(" ", "").replace("个", "")
                    loan_obj.period_unit = s.split("/")[0].strip()
                    loan_obj.repayment = s.split("/")[1].strip()
                    loan_obj.schedule = schedule
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus freshly crawled ids = ids that must go offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #37
0
def crawl():
    """Parse the itouzi.com debt list page (work in progress, nothing is persisted).

    List items are parsed into ``Loan`` objects, but every database write
    (``db_update`` / ``db_create`` / ``db_offline``) is commented out; the
    only visible output is the scraped progress value, which is printed when
    the corresponding element is present.

    Exceptions raised while downloading or parsing are logged together with
    their traceback instead of being propagated.
    """
    company_id = 16
    # url = "http://www.itouzi.com/dinvest/invest/index"
    url = "http://www.itouzi.com/dinvest/debt/index"
    # "Referer" is the standard HTTP header name; the key used to be
    # misspelled "Referee".
    request_headers = {"Referer": "http://www.itouzi.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # every id stored in the db for this company
    db_ids_set = set(row[0].encode("utf-8") for row in db_ids)
    # every id that is online on the site
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        # NOTE: the ul's class attribute ends with a trailing space
        loans = loan_htm_parse.xpath("//ul[@class='invest-product-case-list mtn btn clearfix ']/li")
        for loan in loans:
            if not loan.xpath("div[@class='i-p-c-subscription']/ul[@class='i-p-c-s-detail']"):
                continue
            href = str(loan.xpath("h2/a[@class='fl']/@href")[0])
            original_id = href.split("id=")[1]
            if original_id:
                online_ids_set.add(original_id)

            if original_id in db_ids_set:
                update_ids_set.add(original_id)

                # Updating stored loans is disabled for now:
                # loan_obj = Loan(company_id, original_id)
                # loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                # loan_obj.db_update(db)
            else:
                new_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                loan_obj.href = "http://www.itouzi.com" + href
                loan_obj.title = str(loan.xpath("h2/a[@class='fl']/text()")[0].encode("utf-8")).strip()
                loan_obj.repayment = (
                    str(loan.xpath("p/span[2]/text()")[0].encode("utf-8")).strip().replace("还款方式:", "")
                )
                loan_obj.borrow_amount = int(loan.xpath("p/span[3]/strong/text()")[0]) * 10000

                loan_obj.rate = (
                    str(loan.xpath("p/span[5]/em[1]/text()")[0].encode("utf-8")).strip().replace("%", "")
                )
                period = str(loan.xpath("p/span[4]/strong/text()")[0].encode("utf-8")).strip()
                if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                    loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                else:
                    loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                # The progress parsing is still unverified; re-check it once a
                # live loan is listed.
                if loan.xpath("div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']"):
                    loan_obj.schedule = (
                        str(
                            loan.xpath(
                                "div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']/span[1]/span[last()]/text()"
                            )[0].encode("utf-8")
                        )
                        .strip()
                        .replace("%", "")
                    )
                    # Parenthesised so the line is valid on Python 2 and 3.
                    print(loan_obj.schedule)
                # loan_obj.db_create(db)
        #
        #    logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))
        #
        ## stored ids minus freshly crawled ids = ids that must go offline
        # off_ids_set = db_ids_set - online_ids_set
        # if off_ids_set:
        #    loan_obj = Loan(company_id)
        #    loan_obj.db_offline(db, off_ids_set)
        #    logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #38
0
def crawl():
    """Crawl the my089.com loan listing pages and sync them with the ``loan`` table.

    Reads the page count from the first listing page, then walks every page.
    Rows whose progress reads ``100%`` are skipped.  For every other row the
    loan id is taken from the detail link: ids already returned by the
    ``status=0`` query get their schedule refreshed through ``db_update``,
    unseen ids are fully populated and stored through ``db_create``.  Stored
    ids that were not seen during the run are handed to ``db_offline``.

    Any ordinary error is logged and ends the run before the offline step,
    so a partially crawled listing never takes loans offline.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not swallowed.
    """
    company_id = 18
    url = "https://www.my089.com/Loan/default.aspx?pid=1"
    # NOTE(review): 'Referee' looks like a misspelling of the HTTP 'Referer'
    # header (and it points at ppdai.com, not my089.com) -- confirm before
    # changing, since it alters the outgoing request.
    request_headers = {'Referee': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    stored_rows = db.execute("select original_id from loan where company_id=%s and status=0", company_id)
    # ids already stored for this company
    db_ids_set = set(row[0].encode("utf-8") for row in stored_rows)
    # every id seen online during this run
    online_ids_set = set()
    # ids seen for the first time
    new_ids_set = set()
    # ids that were already stored
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        first_page = parse_html(download_page(url, request_headers))
        pager_text = str(
            first_page.xpath("//div[@class='yema rt']/span[@class='z_page']/text()")[0].encode("UTF-8")
        )
        page_count = int(pager_text.replace("共", "").replace("页", ""))

        for page_no in range(1, page_count + 1):
            # url is reassigned so the error log below names the failing page
            url = "https://www.my089.com/Loan/default.aspx?pid=" + str(page_no)
            logger.info("page url: %s", url)

            listing = parse_html(download_page(url, request_headers))
            for loan in listing.xpath("//div[@class='Loan_box']/dl[@class='LoanList']"):
                progress = loan.xpath("dd[last()]/p/span/text()")[0]
                if str(progress) == "100%":
                    # rows at 100% are ignored, so they never count as online
                    continue

                href = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                schedule = str(progress.encode("UTF-8")).strip().replace("%", "")

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = schedule
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.my089.com/Loan/" + href
                    loan_obj.title = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@title")[0].encode("UTF-8"))
                    amount_text = str(loan.xpath("dd[4]/span/text()")[0].encode("UTF-8")).strip()
                    loan_obj.borrow_amount = amount_text.replace("¥", "").replace(",", "")
                    loan_obj.rate = str(loan.xpath("dd[3]/span/text()")[0].encode("UTF-8")).strip().replace("%/年", "")
                    loan_obj.period = str(loan.xpath("dd[5]/span/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                    # the text node after the span holds "<unit>/<repayment>"
                    unit_and_repayment = str(loan.xpath("dd[5]/text()")[0].encode("UTF-8")).strip()\
                        .replace(" ", "").replace("个", "").split("/")
                    loan_obj.period_unit = unit_and_repayment[0].strip()
                    loan_obj.repayment = unit_and_repayment[1].strip()
                    loan_obj.schedule = schedule
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus the ids seen in this crawl = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #39
0
def crawl():
    """Crawl the 91wangcai.com invest page and sync it with the ``loan`` table.

    Every ``proBoxNew`` block on the page is treated as one loan whose id is
    taken from its detail link.  Ids already returned by the ``status=0``
    query get their schedule refreshed through ``db_update``; unseen ids are
    fully populated and stored through ``db_create``.  Stored ids that were
    not seen during the run are handed to ``db_offline``.

    Any ordinary error is logged and ends the run before the offline step.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not swallowed.
    """
    company_id = 3
    url = "http://www.91wangcai.com/invest/index.html"
    # NOTE(review): 'Referee' looks like a misspelling of the HTTP 'Referer'
    # header -- confirm before changing, since it alters the outgoing request.
    request_headers = {
        'Referee': "http://www.91wangcai.com",
        'User-Agent': DEFAULT_UA
    }

    def cell_text(node, path):
        # First xpath hit, passed through gb2312 + autodecode, returned as utf-8.
        return autodecode(str(node.xpath(path)[0].encode("gb2312"))).encode("utf-8")

    def schedule_of(node):
        # Progress cell with its label prefix and the percent sign removed.
        text = cell_text(node, "div[@class='bd']/table/tr[2]/td[2]/text()")
        return text.replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")

    db = get_db_engine()
    stored_rows = db.execute(
        "select original_id from loan where company_id=%s and status=0",
        company_id)
    # ids already stored for this company
    db_ids_set = set(row[0].encode("utf-8") for row in stored_rows)
    # every id seen online during this run
    online_ids_set = set()
    # ids seen for the first time
    new_ids_set = set()
    # ids that were already stored
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="gb2312")
        loans = loan_htm_parse.xpath("//div[@class='proBoxNew']")

        if loans:
            for loan in loans:
                href = str(loan.xpath("div[@class='hd']/a/@href")[0])
                # id = third path segment of the part of href before the first "."
                original_id = href.split(".")[0].split("/")[2].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = schedule_of(loan)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.91wangcai.com" + href
                    loan_obj.title = cell_text(loan, "div[@class='hd']/a/text()")
                    loan_obj.borrow_amount = cell_text(
                        loan, "div[@class='bd']/table/tr[1]/td[1]/em/text()").replace("¥", "")
                    loan_obj.rate = str(
                        loan.xpath("div[@class='bd']/table/tr[1]/td[2]/em/text()")[0]
                    ).strip().replace("%", "")

                    # The period cell is serialised to markup, stripped of its
                    # <em> tags and then HTML-unescaped.
                    period_markup = lxml.html.tostring(
                        loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0])
                    period_markup = period_markup.replace("<em>", "").replace("</em>", "")
                    period = HTMLParser.HTMLParser().unescape(period_markup).encode("utf-8").strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.repayment = cell_text(
                        loan, "div[@class='bd']/table/tr[2]/td[1]/text()").replace("还款方式:", "")
                    loan_obj.schedule = schedule_of(loan)

                    loan_obj.db_create(db)

            # only logged when the page listed at least one loan
            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus the ids seen in this crawl = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #40
0
def crawl():
    """Crawl the jimubox.com project list and sync it with the ``loan`` table.

    Every project card on the page is treated as one loan.  Cards whose link
    does not contain ``Index`` past its first character are skipped.  Ids
    already returned by the ``status=0`` query get schedule and cast
    refreshed through ``db_update``; unseen ids are fully populated and
    stored through ``db_create``.  Stored ids that were not seen during the
    run are handed to ``db_offline``.

    Any ordinary error is logged and ends the run before the offline step.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not swallowed.
    """
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    # NOTE(review): 'Referee' looks like a misspelling of the HTTP 'Referer'
    # header -- confirm before changing, since it alters the outgoing request.
    request_headers = {
        'Referee': "http://www.jimubox.com",
        'User-Agent': DEFAULT_UA
    }

    # Common xpath prefix of every field inside a project card.
    item = "div[@class='project-item']/div[@class='project-item-content']"

    def first(card, suffix):
        # First xpath hit below the card's content node.
        return card.xpath(item + suffix)[0]

    def text_of(card, suffix):
        # Same as first(), utf-8 encoded and coerced to str.
        return str(first(card, suffix).encode("utf-8"))

    def schedule_of(card):
        # Progress is read from the bar's inline "width:NN%" style.
        style = str(first(card, "/div[@class='progress project-progress']/div/@style"))
        return style.replace("width:", "").strip().replace("%", "")

    def cast_of(card):
        # Current money text without the "/" and the thousands separators.
        raw = text_of(card, "/p[@class='project-info']/span[@class='project-current-money']/text()")
        return raw.strip().replace("/", "").replace(",", "")

    db = get_db_engine()
    stored_rows = db.execute(
        "select original_id from loan where company_id=%s and status=0",
        company_id)
    # ids already stored for this company
    db_ids_set = set(row[0].encode("utf-8") for row in stored_rows)
    # every id seen online during this run
    online_ids_set = set()
    # ids seen for the first time
    new_ids_set = set()
    # ids that were already stored
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath(
            "//div[@class='row']/div[@class='span3 project-card']")
        if loans:
            for loan in loans:
                href = str(first(loan, "/h4/a/@href"))
                if href.find("Index") <= 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = schedule_of(loan)
                    loan_obj.cast = cast_of(loan)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = text_of(loan, "/h4/a/text()")
                    loan_obj.description = text_of(
                        loan, "/p[@class='project-detail']/text()").strip()
                    # "0000" is appended to the scraped figure
                    # (presumably the site lists amounts in units of 10,000 -- TODO confirm)
                    loan_obj.borrow_amount = text_of(
                        loan, "/p[@class='project-info']/span[@class='project-sum-money']/text()"
                    ).strip() + "0000"
                    loan_obj.cast = cast_of(loan)

                    rate = text_of(
                        loan,
                        "/div[@class='project-other']/div[@class='project-other-left']/span/text()"
                    ).strip()
                    if rate.find("+") > 0:
                        # "a+b" rates are stored as their sum
                        rate_list = rate.split("+")
                        loan_obj.rate = str(
                            float(rate_list[0]) + float(rate_list[1]))
                    else:
                        loan_obj.rate = rate
                    loan_obj.repayment = text_of(loan, "/h6/span/text()")
                    loan_obj.period = text_of(
                        loan,
                        "/div[@class='project-other']/div[@class='project-other-right']/span/text()"
                    ).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = schedule_of(loan)

                    loan_obj.db_create(db)

            # only logged when the page listed at least one card
            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus the ids seen in this crawl = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #41
0
def crawl():
    """Crawl the xinhehui.com invest list and sync it with the ``loan`` table.

    Every table row is treated as one loan.  Rows whose last-cell link is
    ``javascript:;`` are skipped.  Ids already returned by the ``status=0``
    query get their schedule refreshed through ``db_update``; unseen ids are
    fully populated and stored through ``db_create``.  Stored ids that were
    not seen during the run are handed to ``db_offline``.

    Any ordinary error is logged and ends the run before the offline step.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not swallowed.
    """
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    # NOTE(review): 'Referee' looks like a misspelling of the HTTP 'Referer'
    # header -- confirm before changing, since it alters the outgoing request.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    def schedule_of(row):
        # Progress link text without the percent sign; "0" when the row has no link.
        if row.xpath("td[7]/div/a"):
            return str(
                row.xpath("td[7]/div/a/text()")[0].encode("UTF-8")
            ).strip().replace("%", "")
        return "0"

    db = get_db_engine()
    stored_rows = db.execute(
        "select original_id from loan where company_id=%s and status=0",
        company_id)
    # ids already stored for this company
    db_ids_set = set(row[0].encode("utf-8") for row in stored_rows)
    # every id seen online during this run
    online_ids_set = set()
    # ids seen for the first time
    new_ids_set = set()
    # ids that were already stored
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath(
            "//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        for loan in loans:
            action_href = loan.xpath("td[last()]/a/@href")[0].encode("utf-8")
            if action_href == "javascript:;":
                # skip the ones that are already finished
                continue
            href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
            # the id follows the url-encoded "id=" inside the link
            original_id = href.split("id%3D")[1].encode("utf-8").strip()
            if original_id:
                online_ids_set.add(original_id)

            if original_id in db_ids_set:
                update_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                loan_obj.schedule = schedule_of(loan)
                loan_obj.db_update(db)
            else:
                new_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                loan_obj.href = "https://www.xinhehui.com" + href

                # The title is the link text plus the text of its <em>, or
                # of its <span> when there is no <em>.
                title_1 = str(
                    loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
                if loan.xpath("td[1]/p[1]/a/em"):
                    title_2_path = "td[1]/p[1]/a/em/text()"
                else:
                    title_2_path = "td[1]/p[1]/a/span/text()"
                title_2 = str(loan.xpath(title_2_path)[0].encode("utf-8")).strip()
                loan_obj.title = title_1 + title_2

                borrow_amount = str(
                    loan.xpath("td[2]/span/text()")[0].encode("utf-8")
                ).strip().replace(" ", "")
                if borrow_amount.find("万") > 0:
                    # amounts carrying the "万" suffix are multiplied by 10000
                    loan_obj.borrow_amount = float(
                        borrow_amount.replace("万", "")) * 10000
                else:
                    loan_obj.borrow_amount = float(
                        borrow_amount.replace("元", "").replace(",", ""))

                if loan.xpath("td[4]/span"):
                    period_path = "td[4]/span/@title"
                else:
                    period_path = "td[4]/text()"
                period = str(loan.xpath(period_path)[0].encode("UTF-8")).strip()
                if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                    loan_obj.period = period.replace(
                        loan_obj.PERIOD_UNIT_DAY, "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                else:
                    loan_obj.period = period.replace("个", "").replace(
                        loan_obj.PERIOD_UNIT_MONTH, "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                loan_obj.rate = str(
                    loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
                loan_obj.repayment = str(
                    loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                loan_obj.schedule = schedule_of(loan)

                loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus the ids seen in this crawl = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #42
0
def crawl():
    """Crawl the he-pai.cn investment list (JSON) and sync it with the ``loan`` table.

    The response is decoded with ``loads`` and its ``"list"`` entries are
    treated as loans.  Entries whose ``bID_SCHD`` reads ``100`` are skipped.
    Ids already returned by the ``status=0`` query get their schedule
    refreshed through ``db_update``; unseen ids are populated from the JSON
    fields and stored through ``db_create``.  Stored ids that were not seen
    during the run are handed to ``db_offline``.

    Any ordinary error is logged and ends the run before the offline step.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not swallowed.
    """
    company_id = 24
    url = "http://www.he-pai.cn/investmentDetail/investmentDetails/ajaxInvmentList.do?pageNo=1"
    # NOTE(review): 'Referee' looks like a misspelling of the HTTP 'Referer'
    # header -- confirm before changing, since it alters the outgoing request.
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    stored_rows = db.execute(
        "select original_id from loan where company_id=%s and status=0",
        company_id)
    # ids already stored for this company
    db_ids_set = set(row[0].encode("utf-8") for row in stored_rows)
    # every id seen online during this run
    online_ids_set = set()
    # ids seen for the first time
    new_ids_set = set()
    # ids that were already stored
    update_ids_set = set()

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        for loan in htm_obj["list"]:
            schedule = str(loan["bID_SCHD"])
            if schedule == "100":
                # skip the ones that are already finished
                continue
            # NOTE(review): original_id is used exactly as decoded from the
            # JSON, while db_ids_set holds utf-8 encoded values -- confirm
            # the membership test below still matches for non-ASCII ids.
            original_id = loan["lN_NO"]
            href = "http://www.he-pai.cn/investmentDetail/memberCenter/transferView.do?ln_no=" + original_id
            if original_id:
                online_ids_set.add(original_id)

            if original_id in db_ids_set:
                update_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                loan_obj.schedule = schedule
                loan_obj.db_update(db)
            else:
                new_ids_set.add(original_id)

                loan_obj = Loan(company_id, original_id)
                loan_obj.href = href
                loan_obj.title = loan["lN_NM"]
                loan_obj.borrow_amount = loan["lN_AMT"]
                loan_obj.rate = loan["lN_RATE"]
                loan_obj.period = loan["lN_TERM"]
                loan_obj.period_unit = loan["lN_TERM_UNIT_DESC"]
                loan_obj.schedule = schedule
                loan_obj.repayment = loan["pAY_METH_DESC"]

                loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # stored ids minus the ids seen in this crawl = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())