Example #1
class LoanServer(loan_pb2_grpc.LoanServicer):
    """
    docstring
    """
    def __init__(self):
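        # a single Loan instance shared by all RPCs; save_loan overwrites it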
        self.loan = None

    def save_loan(self, request, context):
        email = request.email

        interest_rate = request.interest_rate

        repayment_terms = request.repayment_terms

        loan_amount = request.loan_amount

        self.loan = Loan(email, interest_rate, repayment_terms, loan_amount)

        self.loan.save_loan()

        response = loan_pb2.empty()

        return response

    def show_installment(self, request, context):
        email = request.email

        response = loan_pb2.installement_response()

        response.installement_message = self.loan.show_installment(email)

        return response

    def repayment(self, request, context):
        email = request.email

        repayment_amount = request.repayment_amount

        self.loan.repayment(email, repayment_amount)

        response = loan_pb2.empty()

        return response
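For context, a minimal sketch of how this servicer could be wired into a running server. The add_LoanServicer_to_server helper follows the usual grpc codegen naming for a service called Loan, and the port is an arbitrary choice, so treat both as assumptions rather than part of the original snippet:

# serving sketch (assumed helper name and port, not from the original)
from concurrent import futures
import grpc

def serve():
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
    loan_pb2_grpc.add_LoanServicer_to_server(LoanServer(), server)
    server.add_insecure_port("[::]:50051")
    server.start()
    server.wait_for_termination()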
Example #2
def crawl():
    company_id = 3
    url = "http://www.91wangcai.com/invest/index.html"
    request_headers = {'Referee': "http://www.91wangcai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all loan ids seen online this run
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="gb2312")
        loans = loan_htm_parse.xpath("//div[@class='proBoxNew']")

        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='hd']/a/@href")[0])
                original_id = href.split(".")[0].split("/")[2].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.91wangcai.com" + href
                    loan_obj.title = autodecode(str(loan.xpath("div[@class='hd']/a/text()")[0].encode("gb2312"))).encode("utf-8")

                    loan_obj.borrow_amount = autodecode(str(loan.xpath("div[@class='bd']/table/tr[1]/td[1]/em/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("¥", "")

                    loan_obj.rate = str(loan.xpath("div[@class='bd']/table/tr[1]/td[2]/em/text()")[0]).strip().replace("%", "")

                    loan_period_text = lxml.html.tostring(loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0]) \
                        .replace("<em>", "").replace("</em>", "")
                    html_parser = HTMLParser.HTMLParser()
                    period = html_parser.unescape(loan_period_text).encode("utf-8").strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.repayment = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[1]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("还款方式:", "")

                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db but no longer online are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
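All of these crawl() examples share the same reconciliation idea: partition the ids found online against the ids already stored, then create, update, or take offline accordingly. A tiny self-contained illustration of that set arithmetic, with invented ids:

# hypothetical ids illustrating the new/update/offline partition
db_ids_set = {"a", "b", "c"}        # status=0 rows already in the db
online_ids_set = {"b", "c", "d"}    # ids crawled from the site this run

new_ids_set = online_ids_set - db_ids_set       # {"d"}: db_create
update_ids_set = online_ids_set & db_ids_set    # {"b", "c"}: db_update
off_ids_set = db_ids_set - online_ids_set       # {"a"}: db_offline

assert off_ids_set == {"a"}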
Example #3
def crawl():
    company_id = 21
    url = "http://www.id68.cn/invest/index/borrow_status/9/p/1.html"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all loan ids seen online this run
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//ul[@class='ideal_con']/li")
        if len(loans) > 0:
            for loan in loans:
                if str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]) == "100.00%":
                    # skip loans that have already finished
                    continue
                href = str(loan.xpath("table/tr[1]/td[1]/a/@href")[0].encode("utf-8"))
                original_id = href.replace(".html", "").split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = REFEREE + href
                    loan_obj.title = str(loan.xpath("table/tr[1]/td[1]/a/@title")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("table/tr[2]/td[last()]/span/text()")[0].encode("utf-8"))\
                        .strip().replace(" ", "").replace(",", "")
                    loan_obj.repayment = str(loan.xpath("table/tr[2]/td[4]/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.rate = str(loan.xpath("table/tr[2]/td[2]/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("table/tr[2]/td[3]/span/text()")[0].encode("utf-8")).strip()\
                        .replace(" ", "").replace("个月", "")
                    # the period text is "N个月" (months), so the month unit applies
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db but no longer online are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Example #4
def crawl():
    company_id = 26
    url = "http://www.longlongweb.com/invests"
    request_headers = {"Referee": "http://www.longlongweb.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all loan ids seen online this run
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="utf-8")
        loans = loan_htm_parse.xpath("//div[@class='main01']/span/dl")
        if len(loans) > 0:
            for loan in loans:
                if not lxml.html.tostring(loan.xpath("dd/p[1]")[0]).find("href") > 0:
                    continue

                href = str(loan.xpath("dd/table/tr[1]/td[1]/a/@href")[0])
                original_id = href.split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = (
                        str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))
                        .replace(",", "")
                        .replace("¥", "")
                        .strip()
                    )

                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = (
                        str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0].encode("utf-8"))
                        .replace(",", "")
                        .replace("¥", "")
                        .strip()
                    )
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_obj.title = str(loan.xpath("dd/table/tr[1]/td[1]/a/@title")[0].encode("utf-8"))
                    loan_obj.rate = str(loan.xpath("dd/table/tr[2]/td[2]/span/text()")[0].encode("utf-8")).replace(
                        "%", ""
                    )
                    loan_obj.borrow_amount = (
                        str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))
                        .replace(",", "")
                        .replace("¥", "")
                        .strip()
                    )
                    loan_obj.period = str(loan.xpath("dd/table/tr[2]/td[3]/span/text()")[0])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = (
                        str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0].encode("utf-8"))
                        .replace(",", "")
                        .replace("¥", "")
                        .strip()
                    )
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.repayment = (
                        str(
                            loan_info_htm_parse.xpath("//div[@class='enterprise-botton']/span[2]/text()")[0].encode(
                                "utf-8"
                            )
                        )
                        .strip()
                        .replace("还款方式:", "")
                    )

                    loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

        # ids in the db but no longer online are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
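Example #4 derives schedule as cast divided by borrow_amount, times 100. A sketch of the same computation with a guard against a zero amount; the helper name is hypothetical:

# hypothetical helper mirroring the schedule computation above
def compute_schedule(cast, borrow_amount):
    """Return funding progress in percent as a string, "0" for a zero amount."""
    amount = float(borrow_amount)
    if amount == 0:
        return "0"
    return str(float(cast) / amount * 100)

assert compute_schedule("2500", "10000") == "25.0"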
Example #5
def crawl():
    company_id = 18
    url = "https://www.my089.com/Loan/default.aspx?pid=1"
    request_headers = {
        'Referee': "http://www.ppdai.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all loan ids seen online this run
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='yema rt']/span[@class='z_page']/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")
        for p in range(1, int(page) + 1):
            url = "https://www.my089.com/Loan/default.aspx?pid=" + str(p)
            logger.info("page url: %s", url)

            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath(
                "//div[@class='Loan_box']/dl[@class='LoanList']")
            if len(loans) > 0:
                for loan in loans:
                    if str(loan.xpath("dd[last()]/p/span/text()")
                           [0]) == "100%":
                        continue
                    href = str(
                        loan.xpath("dd[2]/div[@class='txt_tou']/a/@href")[0])
                    original_id = href.split("=")[1].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(
                            loan.xpath("dd[last()]/p/span/text()")[0].encode(
                                "UTF-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "https://www.my089.com/Loan/" + href
                        loan_obj.title = str(
                            loan.xpath("dd[2]/div[@class='txt_tou']/a/@title")
                            [0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("dd[4]/span/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(
                            loan.xpath("dd[3]/span/text()")[0].encode(
                                "UTF-8")).strip().replace("%/年", "")
                        loan_obj.period = str(
                            loan.xpath("dd[5]/span/text()")[0].encode(
                                "UTF-8")).strip().replace(" ", "")
                        s = str(loan.xpath("dd[5]/text()")[0].encode(
                            "UTF-8")).strip().replace(" ",
                                                      "").replace("个", "")
                        loan_obj.period_unit = s.split("/")[0].strip()
                        loan_obj.repayment = s.split("/")[1].strip()
                        loan_obj.schedule = str(
                            loan.xpath("dd[last()]/p/span/text()")[0].encode(
                                "UTF-8")).strip().replace("%", "")
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db but no longer online are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
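Example #5 discovers the page count by scraping the pager text before looping over every listing page. The same parsing in isolation, assuming pager text of the form "共12页" ("12 pages in total"):

# pager text parsing as used above (sample value invented)
page_text = "共12页"
page = int(page_text.replace("共", "").replace("页", ""))
assert page == 12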
Example #6
def crawl():
    company_id = 16
    # url = "http://www.itouzi.com/dinvest/invest/index"
    url = "http://www.itouzi.com/dinvest/debt/index"
    request_headers = {"Referee": "http://www.itouzi.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all loan ids seen online this run
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        # note: the ul class attribute has a trailing space
        loans = loan_htm_parse.xpath("//ul[@class='invest-product-case-list mtn btn clearfix ']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath("div[@class='i-p-c-subscription']/ul[@class='i-p-c-s-detail']"):
                    continue
                href = str(loan.xpath("h2/a[@class='fl']/@href")[0])
                original_id = href.split("id=")[1]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    # loan_obj = Loan(company_id, original_id)
                    # loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    # loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.itouzi.com" + href
                    loan_obj.title = str(loan.xpath("h2/a[@class='fl']/text()")[0].encode("utf-8")).strip()
                    loan_obj.repayment = (
                        str(loan.xpath("p/span[2]/text()")[0].encode("utf-8")).strip().replace("还款方式:", "")
                    )
                    loan_obj.borrow_amount = int(loan.xpath("p/span[3]/strong/text()")[0]) * 10000

                    loan_obj.rate = (
                        str(loan.xpath("p/span[5]/em[1]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    )
                    period = str(loan.xpath("p/span[4]/strong/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    # the schedule parsing here is unverified; re-check once a live loan is available
                    if loan.xpath("div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']"):
                        loan_obj.schedule = (
                            str(
                                loan.xpath(
                                    "div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']/span[1]/span[last()]/text()"
                                )[0].encode("utf-8")
                            )
                            .strip()
                            .replace("%", "")
                        )
                        print loan_obj.schedule
                    # loan_obj.db_create(db)
        #
        #    logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))
        #
        ## ids in the db but no longer online are taken offline
        # off_ids_set = db_ids_set - online_ids_set
        # if off_ids_set:
        #    loan_obj = Loan(company_id)
        #    loan_obj.db_offline(db, off_ids_set)
        #    logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
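The exact class match in Example #6 only works because the trailing space is reproduced verbatim. A sketch of the standard whitespace-tolerant XPath idiom, assuming the same parsed tree in loan_htm_parse:

# token-based class match instead of an exact string match
loans = loan_htm_parse.xpath(
    "//ul[contains(concat(' ', normalize-space(@class), ' '), "
    "' invest-product-case-list ')]/li")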
Example #7
def crawl():
    company_id = 9
    url = "https://list.lufax.com/list/service/product/fuying-product-list/listing/1"
    request_headers = {
        'Referee': "https://list.lufax.com/list/listing/fuying",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all loan ids seen online this run
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        loan_num = loans_json["totalCount"]
        if loans_json and loan_num:
            for i in range(0, loan_num):
                original_id = str(loans_json["data"][i]["productId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(
                        float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(
                        int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://list.lufax.com/list/productDetail?productId=%s" % original_id
                    loan_obj.title = loans_json["data"][i][
                        "productNameDisplay"]
                    loan_obj.rate = str(
                        float(loans_json["data"][i]["interestRate"]) * 100)
                    period = str(loans_json["data"][i]
                                 ["investPeriodDisplay"].encode("utf-8"))
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = loans_json["data"][i][
                        "collectionModeDisplay"]
                    loan_obj.borrow_amount = str(
                        int(loans_json["data"][i]["price"]))
                    loan_obj.schedule = str(
                        float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(
                        int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db but no longer online are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
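Example #7 consumes a JSON listing instead of HTML. A self-contained illustration of the fields it reads; the payload below is invented but shaped like the keys used above:

from json import loads

# invented one-item payload shaped like the lufax listing response
sample = ('{"totalCount": 1, "data": [{"productId": 123, "progress": "0.5",'
          ' "raisedAmount": 5000.0, "price": 10000}]}')
loans_json = loads(sample)
item = loans_json["data"][0]
assert str(float(item["progress"]) * 100) == "50.0"
assert str(int(item["raisedAmount"])) == "5000"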
Example #8
def crawl():
    company_id = 15
    url = "https://www.iqianbang.com/invest"
    request_headers = {
        'Referee': "https://www.iqianbang.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all loan ids seen online this run
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='item']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
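                # "融资中" means the loan is still raising funds; skip every other state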
                if str(loan.xpath("td[7]/text()")[0].encode(
                        "utf-8")).strip() != "融资中":
                    continue
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("-")[3].replace(".shtml", "")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(
                        loan.xpath("td[6]/text()")[0].encode(
                            "utf-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.iqianbang.com" + href
                    loan_obj.title = str(
                        loan.xpath("td[1]/a/text()")[0].encode(
                            "utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "").replace("元", "")
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(
                            loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = str(
                        loan.xpath("td[2]/span/span/text()")[0].encode(
                            "utf-8")).strip().replace("%", "")
                    period = str(
                        loan.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(
                        loan.xpath("td[6]/text()")[0].encode(
                            "utf-8")).strip().replace("%", "")

                    # the repayment method only appears on the detail page, so fetch it
                    loan_info_htm = download_page(loan_obj.href,
                                                  headers={
                                                      'Referee': url,
                                                      'User-Agent': DEFAULT_UA
                                                  })
                    loan_info_htm_parse = parse_html(loan_info_htm,
                                                     encoding="UTF-8")
                    loan_obj.repayment = str(
                        loan_info_htm_parse.xpath(
                            "//div[@class='inright']/table[@class='idetable']")
                        [0].xpath("tr[2]/td[2]/span/text()")[0].encode(
                            "utf-8")).strip()

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db but no longer online are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
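Example #8 also normalizes amounts quoted in 万 (units of ten thousand). The same conversion in isolation:

# illustration of the 万 normalization used above
borrow_amount = "3万"
if borrow_amount.find("万") > 0:
    borrow_amount = int(borrow_amount.replace("万", "")) * 10000
assert borrow_amount == 30000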
Example #9
def crawl():
    company_id = 20
    url = "http://www.xiaomabank.com/finance.do"
    request_headers = {"Referee": "http://www.xiaomabank.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all loan ids seen online this run
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='pil_main']/table[@class='pil_table']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = (
                        str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))
                        .replace("width:", "")
                        .strip()
                        .replace("%;", "")
                    )
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.xiaomabank.com/" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = (
                        str(loan.xpath("td[5]/strong/text()")[0].encode("utf-8")).strip().replace(",", "")
                    )
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).replace("个月", "").strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = (
                        str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))
                        .replace("width:", "")
                        .strip()
                        .replace("%;", "")
                    )

                    # note: this page comes back gzip-compressed and must be decompressed
                    resp = urllib2.urlopen(loan_obj.href)
                    respInfo = resp.info()
                    if ("Content-Encoding" in respInfo) and (respInfo["Content-Encoding"] == "gzip"):
                        respHtml = zlib.decompress(resp.read(), 16 + zlib.MAX_WBITS)
                        info_htm_parse = parse_html(respHtml, encoding="utf-8")
                        loan_obj.repayment = str(
                            info_htm_parse.xpath("//div[@id='pi_lt_bottom']/div[1]/div[1]/a/text()")[0].encode("utf-8")
                        )

                    loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

        # ids in the db but no longer online are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
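The zlib call in Example #9 leans on the wbits convention: 16 + zlib.MAX_WBITS tells decompress to expect a gzip wrapper (32 + zlib.MAX_WBITS would auto-detect gzip or zlib). A round-trip sketch:

import gzip
import io
import zlib

payload = b"<html>demo</html>"
buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode="wb") as f:
    f.write(payload)
assert zlib.decompress(buf.getvalue(), 16 + zlib.MAX_WBITS) == payload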
Example #10
def crawl():
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all loan ids seen online this run
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if loan.xpath("td[last()]/a/@href")[0].encode("utf-8") == "javascript:;":
                    # skip loans that have already finished
                    continue
                href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
                original_id = href.split("id%3D")[1].encode("utf-8").strip()
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.xinhehui.com" + href
                    title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
                    if loan.xpath("td[1]/p[1]/a/em"):
                        title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
                    else:
                        title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.title = title_1 + title_2
                    borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "")
                    if borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
                    else:
                        loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", ""))

                    if loan.xpath("td[4]/span"):
                        period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
                    else:
                        period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
                    loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db but no longer online are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Example #11
def crawl():
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    request_headers = {'Referee': "http://www.jimubox.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all loan ids seen online this run
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/@href")[0])
                if not href.find("Index") > 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/text()")[0].encode("utf-8"))
                    loan_obj.description = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-detail']/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-sum-money']/text()")[0].encode("utf-8"))\
                        .strip() + "0000"
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")

                    rate = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-left']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    if rate.find("+") > 0:
                        rate_list = rate.split("+")
                        loan_obj.rate = str(float(rate_list[0]) + float(rate_list[1]))
                    else:
                        loan_obj.rate = rate
                    loan_obj.repayment = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h6/span/text()")[0].encode("utf-8"))
                    loan_obj.period = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-right']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db but no longer online are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
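Example #11 folds a "base+bonus" rate such as "12+3" into a single number. The same branch in isolation:

# illustration of the base+bonus rate handling above
rate = "12+3"
if rate.find("+") > 0:
    rate_list = rate.split("+")
    rate = str(float(rate_list[0]) + float(rate_list[1]))
assert rate == "15.0"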
Example #12
def crawl():
    company_id = 24
    url = "http://www.he-pai.cn/investmentDetail/investmentDetails/ajaxInvmentList.do?pageNo=1"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all loan ids seen online this run
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["list"]
        if len(loans) > 0:
            for loan in loans:
                if str(loan["bID_SCHD"]) == "100":
                    # skip loans that have already finished
                    continue
                original_id = loan["lN_NO"]
                href = "http://www.he-pai.cn/investmentDetail/memberCenter/transferView.do?ln_no=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = loan["lN_NM"]
                    loan_obj.borrow_amount = loan["lN_AMT"]
                    loan_obj.rate = loan["lN_RATE"]
                    loan_obj.period = loan["lN_TERM"]
                    loan_obj.period_unit = loan["lN_TERM_UNIT_DESC"]
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.repayment = loan["pAY_METH_DESC"]

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db but no longer online are taken offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Example #13
def crawl():
    company_id = 17
    #url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=1&size=10&subtime=1411910662511&_=1411910662512"
    s = int(time.time() * 1000)
    e = s + 1
    url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=%d&size=10&subtime=%d&_=%d"
    url_1 = url % (0, s, e)
    request_headers = {
        'Referee': "http://www.touna.cn/invest-list.html",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all loan ids seen online this run
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url_1, request_headers)
        htm_json = loads(htm, encoding="UTF-8")
        page_count = htm_json["result"]["pages"]["count"]
        page = page_count / 10
        if page_count % 10 > 0:
            page += 1
        if page > 0:
            for p in range(0, page):
                # recompute the current timestamp for the cache-busting parameters
                s = int(time.time() * 1000)
                e = s + 1
                page_url = url % (p, s, e)
                loan_htm = download_page(page_url, request_headers)
                loans_json = loads(loan_htm, encoding="UTF-8")
                loans = loans_json["result"]["list"]
                for loan in loans:
                    original_id = str(loan["id"])
                    if original_id:
                        online_ids_set.add(original_id)
                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.touna.cn/invest-page.html?id=%d" % int(
                            original_id)
                        loan_obj.title = loan["name"]
                        loan_obj.borrow_amount = loan["account"]
                        loan_obj.rate = loan["apr"]
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.repayment = loan["style_name"]
                        period = str(loan["time_limit_name"].encode("utf-8"))
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(
                                loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(
                                loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

            # ids in the db but no longer online are taken offline
            off_ids_set = db_ids_set - online_ids_set
            if off_ids_set:
                loan_obj = Loan(company_id)
                loan_obj.db_offline(db, off_ids_set)
                logger.info("company %s crawler loan: offline %s", company_id,
                            len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
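Example #13 pages through a JSON API using millisecond cache-busting parameters and a ceiling division over 10-item pages. Both pieces in isolation:

import time

# ceiling division over 10-item pages, as in the page computation above
page_count = 37
page = page_count // 10
if page_count % 10 > 0:
    page += 1
assert page == 4

# fresh millisecond timestamps per request, as in the subtime/_ parameters
s = int(time.time() * 1000)
e = s + 1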
Beispiel #16
0
def crawl():
    company_id = 26
    url = "http://www.longlongweb.com/invests"
    request_headers = {'Referee': "http://www.longlongweb.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # 在线的所有id
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="utf-8")
        loans = loan_htm_parse.xpath("//div[@class='main01']/span/dl")
        if len(loans) > 0:
            for loan in loans:
                if not lxml.html.tostring(loan.xpath("dd/p[1]")[0]).find("href") > 0:
                    continue

                href = str(loan.xpath("dd/table/tr[1]/td[1]/a/@href")[0])
                original_id = href.split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))\
                        .replace(",", "").replace("¥", "").strip()

                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0]
                                        .encode("utf-8")).replace(",", "").replace("¥", "").strip()
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_obj.title = str(loan.xpath("dd/table/tr[1]/td[1]/a/@title")[0].encode("utf-8"))
                    loan_obj.rate = str(loan.xpath("dd/table/tr[2]/td[2]/span/text()")[0].encode("utf-8"))\
                        .replace("%", "")
                    loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))\
                        .replace(",", "").replace("¥", "").strip()
                    loan_obj.period = str(loan.xpath("dd/table/tr[2]/td[3]/span/text()")[0])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0]
                                        .encode("utf-8")).replace(",", "").replace("¥", "").strip()
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.repayment = str(loan_info_htm_parse.xpath("//div[@class='enterprise-botton']/span[2]/text()")[0]
                                             .encode("utf-8")).strip().replace("还款方式:", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except Exception:
        logger.error("url: %s xpath failed: %s", url, traceback.format_exc())
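
The schedule in this crawler is the amount already cast divided by the borrow amount, times 100. A hedged sketch of that computation with a zero guard (compute_schedule is a hypothetical helper):

def compute_schedule(cast, borrow_amount):
    # funding progress in percent; guard against a zero borrow amount
    borrow = float(borrow_amount)
    if borrow == 0:
        return "0"
    return str(float(cast) / borrow * 100)

assert compute_schedule("5000", "10000") == "50.0"
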
Example #17
def crawl():
    company_id = 16
    #url = "http://www.itouzi.com/dinvest/invest/index"
    url = "http://www.itouzi.com/dinvest/debt/index"
    request_headers = {
        'Referee': "http://www.itouzi.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        # note: the ul class attribute ends with a trailing space
        loans = loan_htm_parse.xpath(
            "//ul[@class='invest-product-case-list mtn btn clearfix ']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath(
                        "div[@class='i-p-c-subscription']/ul[@class='i-p-c-s-detail']"
                ):
                    continue
                href = str(loan.xpath("h2/a[@class='fl']/@href")[0])
                original_id = href.split("id=")[1]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    #loan_obj = Loan(company_id, original_id)
                    #loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    #loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.itouzi.com" + href
                    loan_obj.title = str(
                        loan.xpath("h2/a[@class='fl']/text()")[0].encode(
                            "utf-8")).strip()
                    loan_obj.repayment = str(loan.xpath("p/span[2]/text()")[0].encode("utf-8"))\
                        .strip().replace("还款方式:", "")
                    loan_obj.borrow_amount = int(
                        loan.xpath("p/span[3]/strong/text()")[0]) * 10000

                    loan_obj.rate = str(
                        loan.xpath("p/span[5]/em[1]/text()")[0].encode(
                            "utf-8")).strip().replace("%", "")
                    period = str(
                        loan.xpath("p/span[4]/strong/text()")[0].encode(
                            "utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    # the schedule parsing is still unverified; re-check once a live listing is available
                    if loan.xpath(
                            "div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']"
                    ):
                        loan_obj.schedule = str(
                            loan.xpath(
                                "div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']/span[1]/span[last()]/text()"
                            )[0].encode("utf-8")).strip().replace("%", "")
                        print loan_obj.schedule
                    #loan_obj.db_create(db)
        #
        #    logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))
        #
        ## db ids minus freshly crawled ids = loans to take offline
        #off_ids_set = db_ids_set - online_ids_set
        #if off_ids_set:
        #    loan_obj = Loan(company_id)
        #    loan_obj.db_offline(db, off_ids_set)
        #    logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        logger.error("url: %s xpath failed: %s", url, traceback.format_exc())
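
The day/month branch above recurs in nearly every crawler in this listing. A standalone sketch of the same parsing, assuming PERIOD_UNIT_DAY and PERIOD_UNIT_MONTH are the Chinese unit strings on the Loan class (under Python 2 the file would also need a # -*- coding: utf-8 -*- header):

PERIOD_UNIT_DAY = "天"    # assumed value of Loan.PERIOD_UNIT_DAY
PERIOD_UNIT_MONTH = "月"  # assumed value of Loan.PERIOD_UNIT_MONTH

def parse_period(period_text):
    # strip the unit from the text and report which unit it was;
    # the > 0 test mirrors the original and assumes the unit never leads the string
    if period_text.find(PERIOD_UNIT_DAY) > 0:
        return period_text.replace(PERIOD_UNIT_DAY, ""), PERIOD_UNIT_DAY
    return (period_text.replace("个", "").replace(PERIOD_UNIT_MONTH, ""),
            PERIOD_UNIT_MONTH)
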
Example #18
def crawl():
    company_id = 20
    url = "http://www.xiaomabank.com/finance.do"
    request_headers = {'Referee': "http://www.xiaomabank.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='pil_main']/table[@class='pil_table']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8")).replace("width:", "").strip().replace("%;", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.xiaomabank.com/" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[5]/strong/text()")[0].encode("utf-8")).strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).replace("个月", "").strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8")).replace("width:", "").strip().replace("%;", "")

                    # note: the response is gzip-compressed and must be decompressed
                    resp = urllib2.urlopen(loan_obj.href)
                    respInfo = resp.info()
                    if(("Content-Encoding" in respInfo) and (respInfo['Content-Encoding'] == "gzip")):
                        respHtml = zlib.decompress(resp.read(), 16+zlib.MAX_WBITS)
                        info_htm_parse = parse_html(respHtml, encoding="utf-8")
                        loan_obj.repayment = str(info_htm_parse.xpath("//div[@id='pi_lt_bottom']/div[1]/div[1]/a/text()")[0].encode("utf-8"))

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        logger.error("url: %s xpath failed: %s", url, traceback.format_exc())
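
The wbits value of 16 + zlib.MAX_WBITS above switches zlib into gzip-header mode instead of raw deflate. An equivalent stdlib-only sketch with a round-trip check:

import gzip
import io
import zlib

def gunzip(payload):
    # 16 + MAX_WBITS makes zlib accept a gzip wrapper
    return zlib.decompress(payload, 16 + zlib.MAX_WBITS)

buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode="wb") as f:
    f.write(b"hello")
assert gunzip(buf.getvalue()) == b"hello"
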
Example #19
def crawl():
    company_id = 17
    # url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=1&size=10&subtime=1411910662511&_=1411910662512"
    s = int(time.time() * 1000)
    e = s + 1
    url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=%d&size=10&subtime=%d&_=%d"
    url_1 = url % (0, s, e)
    request_headers = {"Referee": "http://www.touna.cn/invest-list.html", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        htm = download_page(url_1, request_headers)
        htm_json = loads(htm, encoding="UTF-8")
        page_count = htm_json["result"]["pages"]["count"]
        page = page_count / 10
        if page_count % 10 > 0:
            page += 1
        if page > 0:
            for p in range(0, page):
                # recompute the current timestamp for this page request
                s = int(time.time() * 1000)
                e = s + 1
                page_url = url % (p, s, e)
                loan_htm = download_page(page_url, request_headers)
                loans_json = loads(loan_htm, encoding="UTF-8")
                loans = loans_json["result"]["list"]
                for loan in loans:
                    original_id = str(loan["id"])
                    if original_id:
                        online_ids_set.add(original_id)
                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.touna.cn/invest-page.html?id=%d" % int(original_id)
                        loan_obj.title = loan["name"]
                        loan_obj.borrow_amount = loan["account"]
                        loan_obj.rate = loan["apr"]
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.repayment = loan["style_name"]
                        period = str(loan["time_limit_name"].encode("utf-8"))
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

            # db ids minus freshly crawled ids = loans to take offline
            off_ids_set = db_ids_set - online_ids_set
            if off_ids_set:
                loan_obj = Loan(company_id)
                loan_obj.db_offline(db, off_ids_set)
                logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        logger.error("url: %s crawl failed: %s", url, traceback.format_exc())
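
The subtime and _ query parameters are millisecond-resolution cache-busters, recomputed before each request; a minimal sketch (cache_bust_pair is a hypothetical name):

import time

def cache_bust_pair():
    # millisecond timestamp pair used as subtime and _ in the touna.cn URL
    s = int(time.time() * 1000)
    return s, s + 1

s, e = cache_bust_pair()
assert e == s + 1
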
Example #20
def crawl():
    company_id = 25
    url = "http://www.tuandai.com/pages/ajax/invest_list.ashx?Cmd=GetInvest_List" \
          "&RepaymentTypeId=0&pagesize=5&pageindex=%s&type=3&status=1&DeadLine=0" \
          "&beginDeadLine=0&endDeadLine=0&rate=0&beginRate=0&endRate=0&strkey=&orderby=0"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page((url % 1), request_headers)
        obj = loads(htm, encoding="utf-8")
        total = int(obj["total"])
        if total > 0:
            page = total / 5
            if total % 5 > 0:
                page += 1
            for p in range(1, page+1):
                htm = download_page((url % p), request_headers)
                htm_obj = loads(htm, encoding="utf-8")
                loans = htm_obj["projectList"]
                for loan in loans:
                    original_id = loan["ID"]
                    href = "http://www.tuandai.com/pages/invest/jing_detail.aspx?id=" + original_id
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = href
                        loan_obj.title = loan["Title"]
                        loan_obj.borrow_amount = loan["TotalAmount"]
                        loan_obj.rate = loan["YearRate"]
                        loan_obj.period = loan["Deadline"]
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.repayment = loan["RepaymentTypeDesc"]

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        logger.error("url: %s crawl failed: %s", url, traceback.format_exc())
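
loads(htm, encoding="utf-8") relies on the Python 2 json API; Python 3 ignores that keyword and removes it in 3.9. A portable sketch that decodes the bytes up front instead:

import json

def load_json(payload, charset="utf-8"):
    # Python 2's json.loads took an encoding kwarg; decoding first works everywhere
    if isinstance(payload, bytes):
        payload = payload.decode(charset)
    return json.loads(payload)

assert load_json(b'{"total": "3"}')["total"] == "3"
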
Example #21
def crawl():
    company_id = 15
    url = "https://www.iqianbang.com/invest"
    request_headers = {"Referee": "https://www.iqianbang.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='item']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if str(loan.xpath("td[7]/text()")[0].encode("utf-8")).strip() != "融资中":
                    continue
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("-")[3].replace(".shtml", "")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.iqianbang.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = (
                        str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace(",", "").replace("元", "")
                    )
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = (
                        str(loan.xpath("td[2]/span/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    )
                    period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")

                    # fetch the detail page for the repayment method
                    loan_info_htm = download_page(loan_obj.href, headers={"Referee": url, "User-Agent": DEFAULT_UA})
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="UTF-8")
                    loan_obj.repayment = str(
                        loan_info_htm_parse.xpath("//div[@class='inright']/table[@class='idetable']")[0]
                        .xpath("tr[2]/td[2]/span/text()")[0]
                        .encode("utf-8")
                    ).strip()

                    loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except Exception:
        logger.error("url: %s xpath failed: %s", url, traceback.format_exc())
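
Amounts quoted in 万 (units of 10,000 yuan) are scaled up before storage, as in the branch above. A hedged helper sketch (normalize_amount is hypothetical; Python 2 would need a utf-8 coding header):

WAN = "万"  # Chinese unit meaning 10,000

def normalize_amount(text):
    # "1,250元" -> 1250.0 ; "35万" -> 350000.0
    text = text.replace(",", "").replace("元", "").strip()
    if WAN in text:
        return float(text.replace(WAN, "")) * 10000
    return float(text)

assert normalize_amount("35万") == 350000.0
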
Example #22
def crawl():
    company_id = 25
    url = "http://www.tuandai.com/pages/ajax/invest_list.ashx?Cmd=GetInvest_List" \
          "&RepaymentTypeId=0&pagesize=5&pageindex=%s&type=3&status=1&DeadLine=0" \
          "&beginDeadLine=0&endDeadLine=0&rate=0&beginRate=0&endRate=0&strkey=&orderby=0"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page((url % 1), request_headers)
        obj = loads(htm, encoding="utf-8")
        total = int(obj["total"])
        if total > 0:
            page = total / 5
            if total % 5 > 0:
                page += 1
            for p in range(1, page + 1):
                htm = download_page((url % p), request_headers)
                htm_obj = loads(htm, encoding="utf-8")
                loans = htm_obj["projectList"]
                for loan in loans:
                    original_id = loan["ID"]
                    href = "http://www.tuandai.com/pages/invest/jing_detail.aspx?id=" + original_id
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = href
                        loan_obj.title = loan["Title"]
                        loan_obj.borrow_amount = loan["TotalAmount"]
                        loan_obj.rate = loan["YearRate"]
                        loan_obj.period = loan["Deadline"]
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.repayment = loan["RepaymentTypeDesc"]

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        logger.error("url: %s crawl failed: %s", url, traceback.format_exc())
Example #23
def crawl():
    company_id = 3
    url = "http://www.91wangcai.com/invest/index.html"
    request_headers = {
        'Referee': "http://www.91wangcai.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="gb2312")
        loans = loan_htm_parse.xpath("//div[@class='proBoxNew']")

        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='hd']/a/@href")[0])
                original_id = href.split(".")[0].split("/")[2].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.91wangcai.com" + href
                    loan_obj.title = autodecode(
                        str(
                            loan.xpath("div[@class='hd']/a/text()")[0].encode(
                                "gb2312"))).encode("utf-8")

                    loan_obj.borrow_amount = autodecode(str(loan.xpath("div[@class='bd']/table/tr[1]/td[1]/em/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("¥", "")

                    loan_obj.rate = str(
                        loan.xpath(
                            "div[@class='bd']/table/tr[1]/td[2]/em/text()")
                        [0]).strip().replace("%", "")

                    loan_period_text = lxml.html.tostring(loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0]) \
                        .replace("<em>", "").replace("</em>", "")
                    html_parser = HTMLParser.HTMLParser()
                    period = html_parser.unescape(loan_period_text).encode(
                        "utf-8").strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.repayment = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[1]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("还款方式:", "")

                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        logger.error("url: %s xpath failed: %s", url, traceback.format_exc())
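
autodecode is a project-specific helper; the underlying conversion is a decode from the page's gb2312 charset followed by a re-encode to UTF-8, which the stdlib covers directly. A sketch (Python 2 would again need a utf-8 coding header):

def gb2312_to_utf8(raw_bytes):
    # decode gb2312 bytes and hand back UTF-8 bytes, as the crawlers store them
    return raw_bytes.decode("gb2312").encode("utf-8")

sample = u"还款方式".encode("gb2312")
assert gb2312_to_utf8(sample) == u"还款方式".encode("utf-8")
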
Example #24
def crawl():
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    request_headers = {
        'Referee': "http://www.jimubox.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath(
            "//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(
                    loan.xpath(
                        "div[@class='project-item']/div[@class='project-item-content']/h4/a/@href"
                    )[0])
                if not href.find("Index") > 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = str(
                        loan.xpath(
                            "div[@class='project-item']/div[@class='project-item-content']/h4/a/text()"
                        )[0].encode("utf-8"))
                    loan_obj.description = str(
                        loan.xpath(
                            "div[@class='project-item']/div[@class='project-item-content']/p[@class='project-detail']/text()"
                        )[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-sum-money']/text()")[0].encode("utf-8"))\
                        .strip() + "0000"
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")

                    rate = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-left']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    if rate.find("+") > 0:
                        rate_list = rate.split("+")
                        loan_obj.rate = str(
                            float(rate_list[0]) + float(rate_list[1]))
                    else:
                        loan_obj.rate = rate
                    loan_obj.repayment = str(
                        loan.xpath(
                            "div[@class='project-item']/div[@class='project-item-content']/h6/span/text()"
                        )[0].encode("utf-8"))
                    loan_obj.period = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-right']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        logger.error("url: %s xpath failed: %s", url, traceback.format_exc())
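
jimubox renders some rates as a base plus a bonus (e.g. "12+2"), which the crawler sums. A standalone sketch of that parsing (parse_rate is a hypothetical name):

def parse_rate(text):
    # "12+2" -> "14.0" ; a plain rate passes through unchanged
    if text.find("+") > 0:
        base, bonus = text.split("+", 1)
        return str(float(base) + float(bonus))
    return text

assert parse_rate("12+2") == "14.0"
assert parse_rate("15.5") == "15.5"
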
Example #25
def crawl():
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath(
            "//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if loan.xpath("td[last()]/a/@href")[0].encode(
                        "utf-8") == "javascript:;":
                    # skip loans that have already closed
                    continue
                href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
                original_id = href.split("id%3D")[1].encode("utf-8").strip()
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(
                            loan.xpath("td[7]/div/a/text()")[0].encode(
                                "UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.xinhehui.com" + href
                    title_1 = str(
                        loan.xpath("td[1]/p[1]/a/text()")[0].encode(
                            "utf-8")).strip()
                    if loan.xpath("td[1]/p[1]/a/em"):
                        title_2 = str(
                            loan.xpath("td[1]/p[1]/a/em/text()")[0].encode(
                                "utf-8")).strip()
                    else:
                        title_2 = str(
                            loan.xpath("td[1]/p[1]/a/span/text()")[0].encode(
                                "utf-8")).strip()
                    loan_obj.title = title_1 + title_2
                    borrow_amount = str(
                        loan.xpath("td[2]/span/text()")[0].encode(
                            "utf-8")).strip().replace(" ", "")
                    if borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = float(
                            borrow_amount.replace("万", "")) * 10000
                    else:
                        loan_obj.borrow_amount = float(
                            borrow_amount.replace("元", "").replace(",", ""))

                    if loan.xpath("td[4]/span"):
                        period = str(
                            loan.xpath("td[4]/span/@title")[0].encode(
                                "UTF-8")).strip()
                    else:
                        period = str(
                            loan.xpath("td[4]/text()")[0].encode(
                                "UTF-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.rate = str(
                        loan.xpath("td[3]/p/text()")[0]).strip().replace(
                            "%", "")
                    loan_obj.repayment = str(
                        loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(
                            loan.xpath("td[7]/div/a/text()")[0].encode(
                                "UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        logger.error("url: %s xpath failed: %s", url, traceback.format_exc())
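
Several fields above (title suffix, period, schedule) branch on whether an optional node exists before falling back to a default. A hedged generic form of that pattern, assuming an lxml element (first_text is a hypothetical helper):

import lxml.html

def first_text(node, xpath_expr, default="0"):
    # return the first stripped xpath text result, or the default when absent
    found = node.xpath(xpath_expr)
    if found:
        return str(found[0]).strip()
    return default

doc = lxml.html.fromstring("<div><span>42%</span></div>")
assert first_text(doc, "span/text()") == "42%"
assert first_text(doc, "em/text()") == "0"
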
Example #26
def crawl():
    company_id = 24
    url = "http://www.he-pai.cn/investmentDetail/investmentDetails/ajaxInvmentList.do?pageNo=1"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["list"]
        if len(loans) > 0:
            for loan in loans:
                if str(loan["bID_SCHD"]) == "100":
                    # skip loans that have already closed
                    continue
                original_id = loan["lN_NO"]
                href = "http://www.he-pai.cn/investmentDetail/memberCenter/transferView.do?ln_no=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = loan["lN_NM"]
                    loan_obj.borrow_amount = loan["lN_AMT"]
                    loan_obj.rate = loan["lN_RATE"]
                    loan_obj.period = loan["lN_TERM"]
                    loan_obj.period_unit = loan["lN_TERM_UNIT_DESC"]
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.repayment = loan["pAY_METH_DESC"]

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        logger.error("url: %s crawl failed: %s", url, traceback.format_exc())
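
Every crawl() in this listing ends with the same reconciliation: ids seen online but absent from the db are created, ids present in both are updated, and db ids that vanished online are taken offline via the set difference. A condensed, self-contained sketch of that flow, with the persistence steps passed in as callables standing in for Loan.db_create, Loan.db_update, and Loan.db_offline:

def sync_loans(db_ids_set, online_ids, create, update, offline):
    # reconcile the crawl result against the previously stored ids
    online_ids_set = set()
    for original_id in online_ids:
        online_ids_set.add(original_id)
        if original_id in db_ids_set:
            update(original_id)
        else:
            create(original_id)
    off_ids_set = db_ids_set - online_ids_set
    if off_ids_set:
        offline(off_ids_set)
    return off_ids_set

offlined = sync_loans({"a", "b"}, ["b", "c"],
                      create=lambda i: None, update=lambda i: None,
                      offline=lambda ids: None)
assert offlined == {"a"}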