Example #1
def crawl():
    company_id = 9
    url = "https://list.lufax.com/list/service/product/fuying-product-list/listing/1"
    request_headers = {
        'Referee': "https://list.lufax.com/list/listing/fuying",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        loan_num = loans_json["totalCount"]
        if loans_json and loan_num:
            for i in range(0, loan_num):
                original_id = str(loans_json["data"][i]["productId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(
                        float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(
                        int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://list.lufax.com/list/productDetail?productId=%s" % original_id
                    loan_obj.title = loans_json["data"][i][
                        "productNameDisplay"]
                    loan_obj.rate = str(
                        float(loans_json["data"][i]["interestRate"]) * 100)
                    period = str(loans_json["data"][i]
                                 ["investPeriodDisplay"].encode("utf-8"))
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = loans_json["data"][i][
                        "collectionModeDisplay"]
                    loan_obj.borrow_amount = str(
                        int(loans_json["data"][i]["price"]))
                    loan_obj.schedule = str(
                        float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(
                        int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
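
Every crawler in this collection reconciles the database against the live listing with the same set arithmetic; a minimal sketch of that rule with throwaway string ids (values made up):

db_ids_set = {"a", "b", "c"}      # loans currently marked online in the db
online_ids_set = {"b", "c", "d"}  # loans seen in this crawl
new_ids_set = online_ids_set - db_ids_set     # {"d"}: rows to create
update_ids_set = online_ids_set & db_ids_set  # {"b", "c"}: rows to update
off_ids_set = db_ids_set - online_ids_set     # {"a"}: rows to take offline
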
Example #2
def crawl():
    company_id = 4
    url = "https://www.yinhu.com/loan/loan_list.bl"
    request_headers = {'Referee': "https://www.yinhu.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    # offline
    off_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@id='loan_list']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/p/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")
                try:
                    loan_status = str(loan.xpath("td[last()]/em/span/text()")[0].encode("utf-8")).strip()
                except:
                    loan_status = str(loan.xpath("td[last()]/a/span/text()")[0].encode("utf-8")).strip()

                if original_id and loan_status != "还款中":
                    online_ids_set.add(original_id)

                if loan_status == "还款中" or loan_status == "满标":
                    if original_id in db_ids_set:
                        off_ids_set.add(original_id)
                    continue

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/div[@class='bar_bg']/div/span/span/text()")[0].encode("utf-8"))\
                        .strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.yinhu.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/p/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip().replace(",", "")\
                        .replace("元", "")

                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip()
                    period = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.schedule = str(loan.xpath("td[6]/div[@class='bar_bg']/div/span/span/text()")[0].encode("utf-8"))\
                        .strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # ids to take offline: db ids missing from this crawl, plus the
        # repaying/fully-funded ids collected in the loop above
        off_ids_set |= db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
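
The status cell on yinhu.com sits under either td[last()]/em/span or td[last()]/a/span, hence the nested try/except above. A hypothetical first_text helper (not part of the original codebase) could express the same xpath fallback without bare excepts:

def first_text(node, *xpaths):
    # return the stripped utf-8 text of the first xpath that matches, else None
    for xp in xpaths:
        found = node.xpath(xp)
        if found:
            return str(found[0].encode("utf-8")).strip()
    return None

# usage:
# loan_status = first_text(loan, "td[last()]/em/span/text()", "td[last()]/a/span/text()")
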
Example #3
def crawl():
    company_id = 8
    url = "http://www.eloancn.com/new/loadAllTender.action?page=3&sidx=progress&sord=desc"
    request_headers = {'Referee': "http://www.eloancn.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        for p in range(1, 4):
            url = "http://www.eloancn.com/new/loadAllTender.action?page=%s" % p
            logger.info("page url:%s", url)
            # this page is a pain: one loan's attributes are not inside a single div
            loan_htm = download_page(url, request_headers)
            loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
            htm_1 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd300 pdl10 fl']")
            htm_2 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd140 fl']")
            htm_3 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd130 fl pdl10']")
            htm_4 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd130 fl']")

            loan_list = []
            for h1 in htm_1:
                loan_obj = Loan(company_id)
                loan_obj.title = str(h1.xpath("h3/a[@class='fl']/text()")[0].encode("utf-8"))
                loan_obj.href = str(h1.xpath("h3/a[@class='fl']/@href")[0]).replace(":80", "")
                loan_obj.original_id = loan_obj.href.split("=")[1]
                loan_list.append(loan_obj)
            for index, h2 in enumerate(htm_2):
                loan_list[index].borrow_amount = str(h2.xpath("p[@class='colorCb mt10']/text()")[0].encode("utf-8")).replace("¥","").replace(",","")
                loan_list[index].rate = str(h2.xpath("p[@class='colorE6']/span/text()")[0]).replace("%", "")
            for index, h3 in enumerate(htm_3):
                loan_list[index].period = str(h3.xpath("p/span/text()")[0].encode("utf-8"))
                loan_list[index].period_unit = loan_obj.PERIOD_UNIT_MONTH
                loan_list[index].repayment = str(h3.xpath("p[@class='']/text()")[0].encode("utf-8"))
            for index, h4 in enumerate(htm_4):
                loan_list[index].schedule = str(h4.xpath("p/span/em/text()")[0]).strip().replace("%", "")

            # drop loans that are already fully funded
            new_list = [i for i in loan_list if i.schedule != "100"]

            for loan in new_list:
                online_ids_set.add(loan.original_id)
                if loan.original_id in db_ids_set:
                    update_ids_set.add(loan.original_id)

                    loan.db_update(db)
                else:
                    new_ids_set.add(loan.original_id)

                    loan.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

            time.sleep(5)

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj_off = Loan(company_id)
            loan_obj_off.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))


    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
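
Since one loan's fields are spread over four parallel dd columns, the loop above joins them by list index. Assuming the four xpath result lists stay aligned row for row, the same join reads more directly with zip (a sketch reusing the names from the function above):

loan_list = []
for h1, h2, h3, h4 in zip(htm_1, htm_2, htm_3, htm_4):
    loan_obj = Loan(company_id)
    loan_obj.title = str(h1.xpath("h3/a[@class='fl']/text()")[0].encode("utf-8"))
    loan_obj.borrow_amount = str(h2.xpath("p[@class='colorCb mt10']/text()")[0].encode("utf-8")).replace("¥", "").replace(",", "")
    loan_obj.period = str(h3.xpath("p/span/text()")[0].encode("utf-8"))
    loan_obj.schedule = str(h4.xpath("p/span/em/text()")[0]).strip().replace("%", "")
    loan_list.append(loan_obj)
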
Example #4
def crawl():
    company_id = 3
    url = "http://www.91wangcai.com/invest/index.html"
    request_headers = {'Referee': "http://www.91wangcai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="gb2312")
        loans = loan_htm_parse.xpath("//div[@class='proBoxNew']")

        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='hd']/a/@href")[0])
                original_id = href.split(".")[0].split("/")[2].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.91wangcai.com" + href
                    loan_obj.title = autodecode(str(loan.xpath("div[@class='hd']/a/text()")[0].encode("gb2312"))).encode("utf-8")

                    loan_obj.borrow_amount = autodecode(str(loan.xpath("div[@class='bd']/table/tr[1]/td[1]/em/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("¥", "")

                    loan_obj.rate = str(loan.xpath("div[@class='bd']/table/tr[1]/td[2]/em/text()")[0]).strip().replace("%", "")

                    loan_period_text = lxml.html.tostring(loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0]) \
                        .replace("<em>", "").replace("</em>", "")
                    html_parser = HTMLParser.HTMLParser()
                    period = html_parser.unescape(loan_period_text).encode("utf-8").strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.repayment = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[1]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("还款方式:", "")

                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
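
The 91wangcai period cell arrives as HTML whose content is entity-encoded, which is why it is serialized with lxml.html.tostring and then unescaped. The unescape step in isolation, with an assumed entity-encoded input (Python 2 stdlib):

import HTMLParser

html_parser = HTMLParser.HTMLParser()
# "&#20010;" is 个 and "&#26376;" is 月, so this yields "3个月"
period = html_parser.unescape(u"3&#20010;&#26376;").encode("utf-8").strip()
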
Example #5
def crawl():
    company_id = 15
    url = "https://www.iqianbang.com/invest"
    request_headers = {
        'Referee': "https://www.iqianbang.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='item']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if str(loan.xpath("td[7]/text()")[0].encode(
                        "utf-8")).strip() != "融资中":
                    continue
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("-")[3].replace(".shtml", "")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(
                        loan.xpath("td[6]/text()")[0].encode(
                            "utf-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.iqianbang.com" + href
                    loan_obj.title = str(
                        loan.xpath("td[1]/a/text()")[0].encode(
                            "utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "").replace("元", "")
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(
                            loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = str(
                        loan.xpath("td[2]/span/span/text()")[0].encode(
                            "utf-8")).strip().replace("%", "")
                    period = str(
                        loan.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(
                        loan.xpath("td[6]/text()")[0].encode(
                            "utf-8")).strip().replace("%", "")

                    # the detail page has to be fetched for the repayment method
                    loan_info_htm = download_page(loan_obj.href,
                                                  headers={
                                                      'Referee': url,
                                                      'User-Agent': DEFAULT_UA
                                                  })
                    loan_info_htm_parse = parse_html(loan_info_htm,
                                                     encoding="UTF-8")
                    loan_obj.repayment = str(
                        loan_info_htm_parse.xpath(
                            "//div[@class='inright']/table[@class='idetable']")
                        [0].xpath("tr[2]/td[2]/span/text()")[0].encode(
                            "utf-8")).strip()

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
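
Amounts quoted in 万 (units of 10,000) are normalized to plain yuan above; the same step in isolation, assuming a cell like "12万":

borrow_amount = "12万"
if borrow_amount.find("万") > 0:
    borrow_amount = int(borrow_amount.replace("万", "")) * 10000  # 120000
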
Example #6
def crawl():
    company_id = 18
    url = "https://www.my089.com/Loan/default.aspx?pid=1"
    request_headers = {
        'Referee': "http://www.ppdai.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='yema rt']/span[@class='z_page']/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")
        for p in range(1, int(page) + 1):
            url = "https://www.my089.com/Loan/default.aspx?pid=" + str(p)
            logger.info("page url: %s", url)

            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath(
                "//div[@class='Loan_box']/dl[@class='LoanList']")
            if len(loans) > 0:
                for loan in loans:
                    if str(loan.xpath("dd[last()]/p/span/text()")
                           [0]) == "100%":
                        continue
                    href = str(
                        loan.xpath("dd[2]/div[@class='txt_tou']/a/@href")[0])
                    original_id = href.split("=")[1].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(
                            loan.xpath("dd[last()]/p/span/text()")[0].encode(
                                "UTF-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "https://www.my089.com/Loan/" + href
                        loan_obj.title = str(
                            loan.xpath("dd[2]/div[@class='txt_tou']/a/@title")
                            [0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("dd[4]/span/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(
                            loan.xpath("dd[3]/span/text()")[0].encode(
                                "UTF-8")).strip().replace("%/年", "")
                        loan_obj.period = str(
                            loan.xpath("dd[5]/span/text()")[0].encode(
                                "UTF-8")).strip().replace(" ", "")
                        s = str(loan.xpath("dd[5]/text()")[0].encode(
                            "UTF-8")).strip().replace(" ",
                                                      "").replace("个", "")
                        loan_obj.period_unit = s.split("/")[0].strip()
                        loan_obj.repayment = s.split("/")[1].strip()
                        loan_obj.schedule = str(
                            loan.xpath("dd[last()]/p/span/text()")[0].encode(
                                "UTF-8")).strip().replace("%", "")
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
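
The dd[5] text node on my089.com carries both the period unit and the repayment method in one string; assuming it reduces to something like "月/等额本息" after the strip/replace chain above, the split works out as:

s = "月/等额本息"
period_unit = s.split("/")[0].strip()  # "月"
repayment = s.split("/")[1].strip()    # "等额本息"
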
Example #7
def crawl():
    company_id = 12
    url = "http://www.renrendai.com/lend/loanList.action"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans_script = htm_obj.xpath("//script[@id='loan-list-rsp']/text()")[0].encode("utf-8")
        loans_json = loads(loans_script, encoding="UTF-8")
        loan_size = len(loans_json["data"]["loans"])
        if loan_size > 0:
            for i in range(0, loan_size):
                if loans_json["data"]["loans"][i]["status"] != "OPEN":
                    # skip loans that have already closed
                    continue
                original_id = str(int(loans_json["data"]["loans"][i]["loanId"]))
                href = "http://www.renrendai.com/lend/detailPage.action?loanId=%s" % original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(int(loans_json["data"]["loans"][i]["finishedRatio"])).split(".")[0]
                    loan_obj.cast = str(float(loans_json["data"]["loans"][i]["amount"]) - float(loans_json["data"]["loans"][i]["surplusAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = str(loans_json["data"]["loans"][i]["title"].encode("utf-8"))
                    loan_obj.borrow_amount = str(loans_json["data"]["loans"][i]["amount"])
                    loan_obj.period = str(int(loans_json["data"]["loans"][i]["months"]))
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.rate = str(loans_json["data"]["loans"][i]["interest"])
                    loan_obj.cast = str(float(loans_json["data"]["loans"][i]["amount"]) - float(loans_json["data"]["loans"][i]["surplusAmount"]))
                    loan_obj.schedule = str(int(loans_json["data"]["loans"][i]["finishedRatio"])).split(".")[0]
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
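
renrendai embeds the listing JSON in a <script id='loan-list-rsp'> tag rather than serving it raw, so the page is parsed first and the JSON decoded second. A self-contained sketch of that two-step read, with a toy payload:

import json
import lxml.html

page = '<html><script id="loan-list-rsp">{"data": {"loans": []}}</script></html>'
doc = lxml.html.fromstring(page)
payload = json.loads(doc.xpath("//script[@id='loan-list-rsp']/text()")[0])
# payload["data"]["loans"] is now a plain list of loan dicts
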
Example #8
def crawl():
    company_id = 23
    url = "https://member.niwodai.com/xiangmu/"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm, encoding="utf-8")
        loan_size = int(str(htm_obj.xpath("//div[@class='biaoList']/table/tbody/tr[1]/th[last()]/text()")[0].encode("utf-8"))\
            .replace("共", "").replace("个标", "").strip())
        if loan_size > 0:
            page = loan_size / 10
            if loan_size % 10 > 0:
                page += 1
            for p in range(1, page+1):
                page_url = "https://member.niwodai.com/loan/loan.do?pageNo=%d&totalCount=%d" % (p, loan_size)
                page_html = download_page(page_url, request_headers)
                page_obj = parse_html(page_html, encoding="utf-8")
                loans = page_obj.xpath("//div[@class='biaoList']/table/tbody/tr")
                for loan in loans:
                    if lxml.html.tostring(loan).find("<th>") > 0:
                        continue
                    href = str(loan.xpath("td[1]/a/@href")[0])
                    original_id = href.replace(".html", "").split("/")[2]
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = REFEREE + href
                        loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                        loan_obj.borrow_amount = str(loan.xpath("td[4]/em/text()")[0].encode("utf-8")).strip().replace(",", "")
                        loan_obj.rate = str(loan.xpath("td[2]/em/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.period = str(loan.xpath("td[3]/em/text()")[0].encode("utf-8")).strip()
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
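
The page count above is a ceiling division of the loan count by the 10-per-page size, worked through with Python 2 integer division:

loan_size = 37
page = loan_size / 10  # 3
if loan_size % 10 > 0:
    page += 1          # 4 pages in total
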
Example #9
def crawl():
    company_id = 8
    url = "http://www.eloancn.com/new/loadAllTender.action?page=3&sidx=progress&sord=desc"
    request_headers = {
        'Referee': "http://www.eloancn.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        for p in range(1, 4):
            url = "http://www.eloancn.com/new/loadAllTender.action?page=%s" % p
            logger.info("page url:%s", url)
            # this page is a pain: one loan's attributes are not inside a single div
            loan_htm = download_page(url, request_headers)
            loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
            htm_1 = loan_htm_parse.xpath(
                "//div[@class='lendtable']/dl/dd[@class='wd300 pdl10 fl']")
            htm_2 = loan_htm_parse.xpath(
                "//div[@class='lendtable']/dl/dd[@class='wd140 fl']")
            htm_3 = loan_htm_parse.xpath(
                "//div[@class='lendtable']/dl/dd[@class='wd130 fl pdl10']")
            htm_4 = loan_htm_parse.xpath(
                "//div[@class='lendtable']/dl/dd[@class='wd130 fl']")

            loan_list = []
            for h1 in htm_1:
                loan_obj = Loan(company_id)
                loan_obj.title = str(
                    h1.xpath("h3/a[@class='fl']/text()")[0].encode("utf-8"))
                loan_obj.href = str(
                    h1.xpath("h3/a[@class='fl']/@href")[0]).replace(":80", "")
                loan_obj.original_id = loan_obj.href.split("=")[1]
                loan_list.append(loan_obj)
            for index, h2 in enumerate(htm_2):
                loan_list[index].borrow_amount = str(
                    h2.xpath("p[@class='colorCb mt10']/text()")[0].encode(
                        "utf-8")).replace("¥", "").replace(",", "")
                loan_list[index].rate = str(
                    h2.xpath("p[@class='colorE6']/span/text()")[0]).replace(
                        "%", "")
            for index, h3 in enumerate(htm_3):
                loan_list[index].period = str(
                    h3.xpath("p/span/text()")[0].encode("utf-8"))
                loan_list[index].period_unit = loan_obj.PERIOD_UNIT_MONTH
                loan_list[index].repayment = str(
                    h3.xpath("p[@class='']/text()")[0].encode("utf-8"))
            for index, h4 in enumerate(htm_4):
                loan_list[index].schedule = str(
                    h4.xpath("p/span/em/text()")[0]).strip().replace("%", "")

            # drop loans that are already fully funded
            new_list = [i for i in loan_list if i.schedule != "100"]

            for loan in new_list:
                online_ids_set.add(loan.original_id)
                if loan.original_id in db_ids_set:
                    update_ids_set.add(loan.original_id)

                    loan.db_update(db)
                else:
                    new_ids_set.add(loan.original_id)

                    loan.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

            time.sleep(5)

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj_off = Loan(company_id)
            loan_obj_off.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Example #10
def crawl():
    company_id = 24
    url = "http://www.he-pai.cn/investmentDetail/investmentDetails/ajaxInvmentList.do?pageNo=1"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["list"]
        if len(loans) > 0:
            for loan in loans:
                if str(loan["bID_SCHD"]) == "100":
                    # skip loans that have already closed
                    continue
                original_id = loan["lN_NO"]
                href = "http://www.he-pai.cn/investmentDetail/memberCenter/transferView.do?ln_no=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = loan["lN_NM"]
                    loan_obj.borrow_amount = loan["lN_AMT"]
                    loan_obj.rate = loan["lN_RATE"]
                    loan_obj.period = loan["lN_TERM"]
                    loan_obj.period_unit = loan["lN_TERM_UNIT_DESC"]
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.repayment = loan["pAY_METH_DESC"]

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
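
The he-pai endpoint returns JSON whose field names carry mixed-case prefixes; piecing together the reads above, one record presumably has roughly this shape (values made up for illustration):

record = {
    "lN_NO": "LN001",             # original id
    "lN_NM": "some loan",         # title
    "lN_AMT": 100000,             # borrow amount
    "lN_RATE": 12.0,              # rate
    "lN_TERM": 6,                 # period
    "lN_TERM_UNIT_DESC": "月",    # period unit
    "bID_SCHD": "87",             # schedule, percent funded
    "pAY_METH_DESC": "等额本息",  # repayment method
}
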
Example #11
def crawl():
    company_id = 9
    url = "https://list.lufax.com/list/service/product/fuying-product-list/listing/1"
    request_headers = {'Referee': "https://list.lufax.com/list/listing/fuying", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        loan_num = loans_json["totalCount"]
        if loans_json and loan_num:
            for i in range(0, loan_num):
                original_id = str(loans_json["data"][i]["productId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://list.lufax.com/list/productDetail?productId=%s" % original_id
                    loan_obj.title = loans_json["data"][i]["productNameDisplay"]
                    loan_obj.rate = str(float(loans_json["data"][i]["interestRate"]) * 100)
                    period = str(loans_json["data"][i]["investPeriodDisplay"].encode("utf-8"))
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = loans_json["data"][i]["collectionModeDisplay"]
                    loan_obj.borrow_amount = str(int(loans_json["data"][i]["price"]))
                    loan_obj.schedule = str(float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Example #12
def crawl():
    company_id = 22
    url = "https://www.weidai.com.cn/?m=Biao&t=today&pageIndex=1&pageSize=8&sortField=b.verify_time&sortOrder=desc&data=null"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["data"]
        if len(loans) > 0:
            for loan in loans:
                if str(loan["borrow_account_scale"]) == "100.00":
                    # skip loans that have already closed
                    continue
                original_id = loan["uid"]
                href = "https://www.weidai.com.cn/page/borinfo.html?uid=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan["borrow_account_scale"])
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = loan["name"]
                    loan_obj.borrow_amount = loan["account"]
                    loan_obj.rate = loan["borrow_apr"]
                    loan_obj.period = loan["borrow_period"]
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan["borrow_account_scale"])
                    loan_obj.cast = loan["borrow_account_yes"]
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Example #13
def crawl():
    company_id = 18
    url = "https://www.my089.com/Loan/default.aspx?pid=1"
    request_headers = {'Referee': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='yema rt']/span[@class='z_page']/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")
        for p in range(1, int(page) + 1):
            url = "https://www.my089.com/Loan/default.aspx?pid=" + str(p)
            logger.info("page url: %s", url)

            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath("//div[@class='Loan_box']/dl[@class='LoanList']")
            if len(loans) > 0:
                for loan in loans:
                    if str(loan.xpath("dd[last()]/p/span/text()")[0]) == "100%":
                        continue
                    href = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@href")[0])
                    original_id = href.split("=")[1].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "https://www.my089.com/Loan/" + href
                        loan_obj.title = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@title")[0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("dd[4]/span/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(loan.xpath("dd[3]/span/text()")[0].encode("UTF-8")).strip().replace("%/年", "")
                        loan_obj.period = str(loan.xpath("dd[5]/span/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                        s = str(loan.xpath("dd[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("个", "")
                        loan_obj.period_unit = s.split("/")[0].strip()
                        loan_obj.repayment = s.split("/")[1].strip()
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Example #14
def crawl():
    company_id = 17
    #url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=1&size=10&subtime=1411910662511&_=1411910662512"
    s = int(time.time() * 1000)
    e = s + 1
    url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=%d&size=10&subtime=%d&_=%d"
    url_1 = url % (0, s, e)
    request_headers = {
        'Referee': "http://www.touna.cn/invest-list.html",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url_1, request_headers)
        htm_json = loads(htm, encoding="UTF-8")
        page_count = htm_json["result"]["pages"]["count"]
        page = page_count / 10
        if page_count % 10 > 0:
            page += 1
        if page > 0:
            for p in range(0, page):
                # recompute the current timestamp
                s = int(time.time() * 1000)
                e = s + 1
                page_url = url % (p, s, e)
                loan_htm = download_page(page_url, request_headers)
                loans_json = loads(loan_htm, encoding="UTF-8")
                loans = loans_json["result"]["list"]
                for loan in loans:
                    original_id = str(loan["id"])
                    if original_id:
                        online_ids_set.add(original_id)
                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.touna.cn/invest-page.html?id=%d" % int(
                            original_id)
                        loan_obj.title = loan["name"]
                        loan_obj.borrow_amount = loan["account"]
                        loan_obj.rate = loan["apr"]
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.repayment = loan["style_name"]
                        period = str(loan["time_limit_name"].encode("utf-8"))
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(
                                loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(
                                loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

            # db ids - newly crawled ids = ids to take offline
            off_ids_set = db_ids_set - online_ids_set
            if off_ids_set:
                loan_obj = Loan(company_id)
                loan_obj.db_offline(db, off_ids_set)
                logger.info("company %s crawler loan: offline %s", company_id,
                            len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
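
The subtime and _ query parameters are millisecond timestamps used as cache busters, as the commented-out URL above shows (subtime=1411910662511&_=1411910662512); they are regenerated before every request:

import time

url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=%d&size=10&subtime=%d&_=%d"
s = int(time.time() * 1000)     # current epoch time in milliseconds
page_url = url % (0, s, s + 1)  # page 0 with fresh cache-busting params
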
Example #15
def crawl():
    company_id = 11
    url = "https://www.tzydb.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//div[@id='proList']/ul[@class='item_li']")
        if len(loans) > 0:
            for loan in loans:
                schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip()
                if schedule == "100%" or schedule == "100.0%":
                    # skip loans that have already closed
                    continue
                # link = https://www.tzydb.com/boot/lookup/971,1017
                a_script = str(loan.xpath("li/div[1]/div[1]/div/a/@href")[0].encode("utf-8"))
                o_id = ID_RE.findall(a_script)[0]
                original_id = o_id.replace(",", "-")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)
                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.tzydb.com/boot/lookup/" + o_id
                    loan_obj.title = str(loan.xpath("li/div[1]/div[1]/div/a/text()")[0].encode("utf-8"))
                    loan_obj.borrow_amount = str(loan.xpath("li/div[2]/div[1]/span/text()")[0].encode("utf-8")).strip()\
                        .replace(" ", "").replace(",", "")
                    loan_obj.period = str(loan.xpath("li/div[2]/div[3]/span/text()")[0].encode("UTF-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.rate = str(loan.xpath("li/div[2]/div[2]/span/text()")[0]).strip().replace("%", "")
                    loan_obj.schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip().replace("%", "")

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
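
ID_RE is defined elsewhere in the module; given hrefs of the form shown in the inline comment (https://www.tzydb.com/boot/lookup/971,1017), a minimal equivalent would be (an assumption, not the original definition):

import re

ID_RE = re.compile(r"\d+,\d+")
o_id = ID_RE.findall("https://www.tzydb.com/boot/lookup/971,1017")[0]  # "971,1017"
original_id = o_id.replace(",", "-")                                   # "971-1017"
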
Example #16
def crawl():
    company_id = 19
    url = "https://www.qian360.com/bq/queryProductList.html?currentPage=1&pernum=12&type=0"
    request_headers = {'Referee': "https://www.qian360.com/tl/select.html", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids seen online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")

        if loans_json["list"]:
            for i in range(0, len(loans_json["list"])):
                if int(loans_json["list"][i]["status"]) != 1:
                    continue
                original_id = str(loans_json["list"][i]["borrowId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.qian360.com/ti/detail.html?borrowId=%s" % original_id
                    loan_obj.title = loans_json["list"][i]["name"]
                    loan_obj.rate = str(loans_json["list"][i]["apr"])
                    loan_obj.period = str(loans_json["list"][i]["totalPeriod"])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.borrow_amount = str(int(loans_json["list"][i]["account"]))
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Example #17
def crawl():
    company_id = 2
    url = "http://www.ppdai.com/lend/12_s1_p1"
    request_headers = {'Referee': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='fen_ye_nav']/table/tr/td[last()]/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")

        for p in range(1, int(page) + 1):
            url = "http://www.ppdai.com/lend/12_s1_p" + str(p)
            logger.info("page url: %s", url)

            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath("//div[@class='lend_nav']/table/tr")
            if len(loans) > 0:
                for loan in loans:
                    if lxml.html.tostring(loan).find("tit_nav") > 0:
                        continue
                    href = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@href")[0])
                    original_id = href.split("/")[2].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("td[last()]/p[1]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("%", "").split("完成")[1]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id)
                        loan_obj.original_id = original_id
                        loan_obj.href = "http://www.ppdai.com" + href
                        loan_obj.title = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@title")[0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(loan.xpath("td[4]/text()")[0]).strip().replace("%", "")
                        period = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                        loan_obj.schedule = str(float(str(loan.xpath("td[last()]/p[1]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("%", "").split("完成")[1]))

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
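
ppdai's pager cell ends with a 共N页 ("N pages in total") label; the crawler strips the surrounding characters and walks every page. A sketch of just that step, with an invented pager string:

# -*- coding: utf-8 -*-
page_text = "共12页"  # invented sample of the pager cell text
page = int(page_text.replace("共", "").replace("页", ""))
assert page == 12

page_urls = ["http://www.ppdai.com/lend/12_s1_p" + str(p)
             for p in range(1, page + 1)]  # pages are 1-based
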
Exemple #18
0
def crawl():
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    request_headers = {'Referee': "http://www.jimubox.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/@href")[0])
                if not href.find("Index") > 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/text()")[0].encode("utf-8"))
                    loan_obj.description = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-detail']/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-sum-money']/text()")[0].encode("utf-8"))\
                        .strip() + "0000"
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")

                    rate = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-left']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    if rate.find("+") > 0:
                        rate_list = rate.split("+")
                        loan_obj.rate = str(float(rate_list[0]) + float(rate_list[1]))
                    else:
                        loan_obj.rate = rate
                    loan_obj.repayment = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h6/span/text()")[0].encode("utf-8"))
                    loan_obj.period = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-right']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
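
jimubox sometimes displays the rate as a base plus a bonus ("12+2"), which the crawler sums. A small sketch of that branch as a standalone helper (parse_rate is a name invented here):

def parse_rate(raw):
    # raw is the stripped text of the rate cell, e.g. "12.5" or "12+2"
    if "+" in raw:
        base, bonus = raw.split("+", 1)
        return str(float(base) + float(bonus))
    return raw

assert parse_rate("12+2") == "14.0"
assert parse_rate("12.5") == "12.5"
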
Exemple #19
0
def crawl():
    company_id = 26
    url = "http://www.longlongweb.com/invests"
    request_headers = {"Referee": "http://www.longlongweb.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="utf-8")
        loans = loan_htm_parse.xpath("//div[@class='main01']/span/dl")
        if len(loans) > 0:
            for loan in loans:
                if not lxml.html.tostring(loan.xpath("dd/p[1]")[0]).find("href") > 0:
                    continue

                href = str(loan.xpath("dd/table/tr[1]/td[1]/a/@href")[0])
                original_id = href.split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = (
                        str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))
                        .replace(",", "")
                        .replace("¥", "")
                        .strip()
                    )

                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = (
                        str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0].encode("utf-8"))
                        .replace(",", "")
                        .replace("¥", "")
                        .strip()
                    )
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_obj.title = str(loan.xpath("dd/table/tr[1]/td[1]/a/@title")[0].encode("utf-8"))
                    loan_obj.rate = str(loan.xpath("dd/table/tr[2]/td[2]/span/text()")[0].encode("utf-8")).replace(
                        "%", ""
                    )
                    loan_obj.borrow_amount = (
                        str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))
                        .replace(",", "")
                        .replace("¥", "")
                        .strip()
                    )
                    loan_obj.period = str(loan.xpath("dd/table/tr[2]/td[3]/span/text()")[0])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = (
                        str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0].encode("utf-8"))
                        .replace(",", "")
                        .replace("¥", "")
                        .strip()
                    )
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.repayment = (
                        str(
                            loan_info_htm_parse.xpath("//div[@class='enterprise-botton']/span[2]/text()")[0].encode(
                                "utf-8"
                            )
                        )
                        .strip()
                        .replace("还款方式:", "")
                    )

                    loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

        # db ids - freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
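
longlongweb exposes no progress figure, so the schedule is derived: cast divided by borrow_amount, times 100. A sketch as a standalone helper (derive_schedule is invented; the zero guard is an assumption the original code does not make):

def derive_schedule(cast, borrow_amount):
    amount = float(borrow_amount)
    if amount == 0:
        return "0"  # guard added here; the code above would raise
    return str(float(cast) / amount * 100)

assert derive_schedule("2500", "10000") == "25.0"
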
Exemple #20
0
def crawl():
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if loan.xpath("td[last()]/a/@href")[0].encode("utf-8") == "javascript:;":
                    # skip loans that have already closed
                    continue
                href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
                original_id = href.split("id%3D")[1].encode("utf-8").strip()
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.xinhehui.com" + href
                    title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
                    if loan.xpath("td[1]/p[1]/a/em"):
                        title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
                    else:
                        title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.title = title_1 + title_2
                    borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "")
                    if borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
                    else:
                        loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", ""))

                    if loan.xpath("td[4]/span"):
                        period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
                    else:
                        period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
                    loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
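
xinhehui prints amounts either as N万 (units of ten thousand) or as N元 with thousands separators. A sketch of the normalisation as a helper (parse_amount is a name invented here):

# -*- coding: utf-8 -*-
def parse_amount(raw):
    raw = raw.strip().replace(" ", "")
    if "万" in raw:
        return float(raw.replace("万", "")) * 10000
    return float(raw.replace("元", "").replace(",", ""))

assert parse_amount("12.5万") == 125000.0
assert parse_amount("8,000元") == 8000.0
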
Exemple #21
0
def crawl():
    company_id = 21
    url = "http://www.id68.cn/invest/index/borrow_status/9/p/1.html"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//ul[@class='ideal_con']/li")
        if len(loans) > 0:
            for loan in loans:
                if str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]) == "100.00%":
                    # skip loans that have already closed
                    continue
                href = str(loan.xpath("table/tr[1]/td[1]/a/@href")[0].encode("utf-8"))
                original_id = href.replace(".html", "").split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = REFEREE + href
                    loan_obj.title = str(loan.xpath("table/tr[1]/td[1]/a/@title")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("table/tr[2]/td[last()]/span/text()")[0].encode("utf-8"))\
                        .strip().replace(" ", "").replace(",", "")
                    loan_obj.repayment = str(loan.xpath("table/tr[2]/td[4]/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.rate = str(loan.xpath("table/tr[2]/td[2]/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("table/tr[2]/td[3]/span/text()")[0].encode("utf-8")).strip()\
                        .replace(" ", "").replace("个月", "")
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #22
0
def crawl():
    company_id = 13
    url = "http://www.yirendai.com/loan/list/1?period=12&currRate=&amt=&progress="
    request_headers = {
        'Referee': "http://www.yirendai.com/loan/list/1",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='bidList']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath(
                        "div[2]/div[2]/div/div[@class='bid_empty_errortip']"):
                    continue
                href = str(loan.xpath("div[2]/div/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(
                        float(loan_obj.cast) / float(loan_obj.borrow_amount) *
                        100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.yirendai.com" + href
                    loan_obj.title = str(
                        loan.xpath("div[2]/div/h3/a/text()")[0].encode(
                            "utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.rate = str(
                        loan.xpath("div[2]/div/div[3]/h4/span/text()")
                        [0].encode("utf-8")).strip()
                    loan_obj.period = str(
                        loan.xpath("div[2]/div/div[4]/h4/span/text()")
                        [0].encode("utf-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(
                        float(loan_obj.cast) / float(loan_obj.borrow_amount) *
                        100)

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #23
0
def crawl():
    company_id = 26
    url = "http://www.longlongweb.com/invests"
    request_headers = {'Referee': "http://www.longlongweb.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="utf-8")
        loans = loan_htm_parse.xpath("//div[@class='main01']/span/dl")
        if len(loans) > 0:
            for loan in loans:
                if not lxml.html.tostring(loan.xpath("dd/p[1]")[0]).find("href") > 0:
                    continue

                href = str(loan.xpath("dd/table/tr[1]/td[1]/a/@href")[0])
                original_id = href.split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))\
                        .replace(",", "").replace("¥", "").strip()

                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0]
                                        .encode("utf-8")).replace(",", "").replace("¥", "").strip()
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_obj.title = str(loan.xpath("dd/table/tr[1]/td[1]/a/@title")[0].encode("utf-8"))
                    loan_obj.rate = str(loan.xpath("dd/table/tr[2]/td[2]/span/text()")[0].encode("utf-8"))\
                        .replace("%", "")
                    loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))\
                        .replace(",", "").replace("¥", "").strip()
                    loan_obj.period = str(loan.xpath("dd/table/tr[2]/td[3]/span/text()")[0])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0]
                                        .encode("utf-8")).replace(",", "").replace("¥", "").strip()
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.repayment = str(loan_info_htm_parse.xpath("//div[@class='enterprise-botton']/span[2]/text()")[0]
                                             .encode("utf-8")).strip().replace("还款方式:", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #24
0
def crawl():
    company_id = 20
    url = "http://www.xiaomabank.com/finance.do"
    request_headers = {"Referee": "http://www.xiaomabank.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='pil_main']/table[@class='pil_table']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = (
                        str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))
                        .replace("width:", "")
                        .strip()
                        .replace("%;", "")
                    )
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.xiaomabank.com/" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = (
                        str(loan.xpath("td[5]/strong/text()")[0].encode("utf-8")).strip().replace(",", "")
                    )
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).replace("个月", "").strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = (
                        str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))
                        .replace("width:", "")
                        .strip()
                        .replace("%;", "")
                    )

                    # note: the page comes back gzip-compressed and must be decompressed
                    resp = urllib2.urlopen(loan_obj.href)
                    respInfo = resp.info()
                    if ("Content-Encoding" in respInfo) and (respInfo["Content-Encoding"] == "gzip"):
                        respHtml = zlib.decompress(resp.read(), 16 + zlib.MAX_WBITS)
                        info_htm_parse = parse_html(respHtml, encoding="utf-8")
                        loan_obj.repayment = str(
                            info_htm_parse.xpath("//div[@id='pi_lt_bottom']/div[1]/div[1]/a/text()")[0].encode("utf-8")
                        )

                    loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

        # db ids - freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
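
The detail page here comes back gzip-compressed; wbits = 16 + zlib.MAX_WBITS tells zlib to expect a gzip wrapper (header and trailer) around the deflate stream. A round-trip sketch with an in-memory payload instead of a live HTTP response:

import zlib

body = b"<html>detail page</html>"
gz = zlib.compressobj(9, zlib.DEFLATED, 16 + zlib.MAX_WBITS)
gzipped = gz.compress(body) + gz.flush()

assert zlib.decompress(gzipped, 16 + zlib.MAX_WBITS) == body
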
Exemple #25
0
def crawl():
    company_id = 17
    # url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=1&size=10&subtime=1411910662511&_=1411910662512"
    s = int(time.time() * 1000)
    e = s + 1
    url = "http://www.touna.cn/borrow.do?method=list&borrowType=0&creditType=&timeLimit=&keyType=0&page=%d&size=10&subtime=%d&_=%d"
    url_1 = url % (0, s, e)
    request_headers = {"Referee": "http://www.touna.cn/invest-list.html", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        htm = download_page(url_1, request_headers)
        htm_json = loads(htm, encoding="UTF-8")
        page_count = htm_json["result"]["pages"]["count"]
        page = page_count / 10
        if page_count % 10 > 0:
            page += 1
        if page > 0:
            for p in range(0, page):
                # recompute the current timestamp for each page request
                s = int(time.time() * 1000)
                e = s + 1
                page_url = url % (p, s, e)
                loan_htm = download_page(page_url, request_headers)
                loans_json = loads(loan_htm, encoding="UTF-8")
                loans = loans_json["result"]["list"]
                for loan in loans:
                    original_id = str(loan["id"])
                    if original_id:
                        online_ids_set.add(original_id)
                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "http://www.touna.cn/invest-page.html?id=%d" % int(original_id)
                        loan_obj.title = loan["name"]
                        loan_obj.borrow_amount = loan["account"]
                        loan_obj.rate = loan["apr"]
                        loan_obj.schedule = str(loan["score"])
                        loan_obj.repayment = loan["style_name"]
                        period = str(loan["time_limit_name"].encode("utf-8"))
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

            # db ids - freshly crawled ids = loans to take offline
            off_ids_set = db_ids_set - online_ids_set
            if off_ids_set:
                loan_obj = Loan(company_id)
                loan_obj.db_offline(db, off_ids_set)
                logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
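
touna pages in chunks of 10 and appends a millisecond timestamp pair to defeat caching. A sketch of that arithmetic (count is an invented total; the URL layout mirrors the one above):

import time

count = 37
pages = count // 10 + (1 if count % 10 else 0)  # ceiling division
assert pages == 4

for p in range(pages):
    s = int(time.time() * 1000)
    page_url = ("http://www.touna.cn/borrow.do?method=list&borrowType=0"
                "&creditType=&timeLimit=&keyType=0&page=%d&size=10"
                "&subtime=%d&_=%d" % (p, s, s + 1))
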
Exemple #26
0
def crawl():
    company_id = 14
    url = "http://www.licaifan.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath(
            "//ul[@class='main-list tab-con2']/li[1]/table/tr")
        if len(loans) > 0:
            # note: the first row is the table header, so start from index 1
            for i in range(1, len(loans)):
                if str(loans[i].xpath("td[last()]/a/text()")[0].encode(
                        "utf-8")) == "投资满额":
                    continue
                href = str(loans[i].xpath("td[1]/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip()\
                        .replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.licaifan.com" + href
                    loan_obj.title = str(loans[i].xpath("td[1]/h3/a/text()")
                                         [0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loans[i].xpath("td[3]/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(
                            loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = str(loans[i].xpath("td[2]/text()")
                                        [0].encode("utf-8")).strip().replace(
                                            "%", "")
                    period = str(loans[i].xpath("td[4]/text()")[0].encode(
                        "utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip()\
                        .replace("%", "")

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #27
0
def crawl():
    company_id = 23
    url = "https://member.niwodai.com/xiangmu/"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm, encoding="utf-8")
        loan_size = int(str(htm_obj.xpath("//div[@class='biaoList']/table/tbody/tr[1]/th[last()]/text()")[0].encode("utf-8"))\
            .replace("共", "").replace("个标", "").strip())
        if loan_size > 0:
            page = loan_size / 10
            if loan_size % 10 > 0:
                page += 1
            for p in range(1, page + 1):
                page_url = "https://member.niwodai.com/loan/loan.do?pageNo=%d&totalCount=%d" % (
                    p, loan_size)
                page_html = download_page(page_url, request_headers)
                page_obj = parse_html(page_html, encoding="utf-8")
                loans = page_obj.xpath(
                    "//div[@class='biaoList']/table/tbody/tr")
                for loan in loans:
                    if lxml.html.tostring(loan).find("<th>") > 0:
                        continue
                    href = str(loan.xpath("td[1]/a/@href")[0])
                    original_id = href.replace(".html", "").split("/")[2]
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(
                            loan.xpath("td[5]/text()")[0].encode(
                                "utf-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = REFEREE + href
                        loan_obj.title = str(
                            loan.xpath("td[1]/a/text()")[0].encode(
                                "utf-8")).strip()
                        loan_obj.borrow_amount = str(
                            loan.xpath("td[4]/em/text()")[0].encode(
                                "utf-8")).strip().replace(",", "")
                        loan_obj.rate = str(
                            loan.xpath("td[2]/em/text()")[0].encode(
                                "utf-8")).strip().replace("%", "")
                        loan_obj.period = str(
                            loan.xpath("td[3]/em/text()")[0].encode(
                                "utf-8")).strip()
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        loan_obj.schedule = str(
                            loan.xpath("td[5]/text()")[0].encode(
                                "utf-8")).strip().replace("%", "")

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #28
0
def crawl():
    company_id = 20
    url = "http://www.xiaomabank.com/finance.do"
    request_headers = {'Referee': "http://www.xiaomabank.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='pil_main']/table[@class='pil_table']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8")).replace("width:", "").strip().replace("%;", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.xiaomabank.com/" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[5]/strong/text()")[0].encode("utf-8")).strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).replace("个月", "").strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8")).replace("width:", "").strip().replace("%;", "")

                    # note: the page comes back gzip-compressed and must be decompressed
                    resp = urllib2.urlopen(loan_obj.href)
                    respInfo = resp.info()
                    if(("Content-Encoding" in respInfo) and (respInfo['Content-Encoding'] == "gzip")):
                        respHtml = zlib.decompress(resp.read(), 16+zlib.MAX_WBITS)
                        info_htm_parse = parse_html(respHtml, encoding="utf-8")
                        loan_obj.repayment = str(info_htm_parse.xpath("//div[@id='pi_lt_bottom']/div[1]/div[1]/a/text()")[0].encode("utf-8"))

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #29
0
def crawl():
    company_id = 19
    url = "https://www.qian360.com/bq/queryProductList.html?currentPage=1&pernum=12&type=0"
    request_headers = {
        'Referee': "https://www.qian360.com/tl/select.html",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")

        if loans_json["list"]:
            for i in range(0, len(loans_json["list"])):
                if int(loans_json["list"][i]["status"]) != 1:
                    continue
                original_id = str(loans_json["list"][i]["borrowId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(
                        int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.qian360.com/ti/detail.html?borrowId=%s" % original_id
                    loan_obj.title = loans_json["list"][i]["name"]
                    loan_obj.rate = str(loans_json["list"][i]["apr"])
                    loan_obj.period = str(loans_json["list"][i]["totalPeriod"])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.borrow_amount = str(
                        int(loans_json["list"][i]["account"]))
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(
                        int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #30
0
def crawl():
    company_id = 25
    url = "http://www.tuandai.com/pages/ajax/invest_list.ashx?Cmd=GetInvest_List" \
          "&RepaymentTypeId=0&pagesize=5&pageindex=%s&type=3&status=1&DeadLine=0" \
          "&beginDeadLine=0&endDeadLine=0&rate=0&beginRate=0&endRate=0&strkey=&orderby=0"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page((url % 1), request_headers)
        obj = loads(htm, encoding="utf-8")
        total = int(obj["total"])
        if total > 0:
            page = total / 5
            if total % 5 > 0:
                page += 1
            for p in range(1, page+1):
                htm = download_page((url % p), request_headers)
                htm_obj = loads(htm, encoding="utf-8")
                loans = htm_obj["projectList"]
                for loan in loans:
                    original_id = loan["ID"]
                    href = "http://www.tuandai.com/pages/invest/jing_detail.aspx?id=" + original_id
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = href
                        loan_obj.title = loan["Title"]
                        loan_obj.borrow_amount = loan["TotalAmount"]
                        loan_obj.rate = loan["YearRate"]
                        loan_obj.period = loan["Deadline"]
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.repayment = loan["RepaymentTypeDesc"]

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Exemple #31
0
def crawl():
    company_id = 14
    url = "http://www.licaifan.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all ids currently in the db
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # newly discovered ids
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='main-list tab-con2']/li[1]/table/tr")
        if len(loans) > 0:
            # note: the first row is the table header, so start from index 1
            for i in range(1, len(loans)):
                if str(loans[i].xpath("td[last()]/a/text()")[0].encode("utf-8")) == "投资满额":
                    continue
                href = str(loans[i].xpath("td[1]/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip()\
                        .replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.licaifan.com" + href
                    loan_obj.title = str(loans[i].xpath("td[1]/h3/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loans[i].xpath("td[3]/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = str(loans[i].xpath("td[2]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    period = str(loans[i].xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip()\
                        .replace("%", "")

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
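
The borrow_amount handling above (and again in Example #33) converts a display string such as "1,500万" into plain yuan. The same rule in isolation, assuming amounts are either plain yuan or a whole number of 万 (10,000 yuan); normalize_amount is an illustrative name:

def normalize_amount(text):
    """'1,500万' -> 15000000; '120,000元' -> 120000."""
    text = text.strip().replace(",", "").replace("元", "")
    if "万" in text:
        return int(text.replace("万", "")) * 10000
    return int(text)

assert normalize_amount("1,500万") == 15000000
assert normalize_amount("120,000元") == 120000
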
Example #32
def crawl():
    company_id = 25
    url = "http://www.tuandai.com/pages/ajax/invest_list.ashx?Cmd=GetInvest_List" \
          "&RepaymentTypeId=0&pagesize=5&pageindex=%s&type=3&status=1&DeadLine=0" \
          "&beginDeadLine=0&endDeadLine=0&rate=0&beginRate=0&endRate=0&strkey=&orderby=0"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all ids seen online in this crawl
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page((url % 1), request_headers)
        obj = loads(htm, encoding="utf-8")
        total = int(obj["total"])
        if total > 0:
            page = total / 5
            if total % 5 > 0:
                page += 1
            for p in range(1, page + 1):
                htm = download_page((url % p), request_headers)
                htm_obj = loads(htm, encoding="utf-8")
                loans = htm_obj["projectList"]
                for loan in loans:
                    original_id = loan["ID"]
                    href = "http://www.tuandai.com/pages/invest/jing_detail.aspx?id=" + original_id
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = href
                        loan_obj.title = loan["Title"]
                        loan_obj.borrow_amount = loan["TotalAmount"]
                        loan_obj.rate = loan["YearRate"]
                        loan_obj.period = loan["Deadline"]
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                        loan_obj.schedule = str(loan["ProgressPercent"])
                        loan_obj.cast = loan["ProgressAmount"]
                        loan_obj.repayment = loan["RepaymentTypeDesc"]

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s parse failed: %s", url, traceback.format_exc())
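
The paging above computes the number of 5-item pages as total / 5 plus one extra page when there is a remainder. That is integer ceiling division, which can be written in one line (page_count is an illustrative name):

def page_count(total, pagesize=5):
    """Pages needed to cover `total` items, matching the loop above."""
    return (total + pagesize - 1) // pagesize

assert page_count(5) == 1
assert page_count(6) == 2
assert page_count(0) == 0
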
Example #33
def crawl():
    company_id = 15
    url = "https://www.iqianbang.com/invest"
    request_headers = {"Referee": "https://www.iqianbang.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all ids seen online in this crawl
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='item']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
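                # only loans still raising funds ("融资中") are kept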
                if str(loan.xpath("td[7]/text()")[0].encode("utf-8")).strip() != "融资中":
                    continue
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("-")[3].replace(".shtml", "")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.iqianbang.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = (
                        str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace(",", "").replace("元", "")
                    )
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = (
                        str(loan.xpath("td[2]/span/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    )
                    period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")

                    # need to go into the detail page for the repayment type
                    loan_info_htm = download_page(loan_obj.href, headers={"Referee": url, "User-Agent": DEFAULT_UA})
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="UTF-8")
                    loan_obj.repayment = str(
                        loan_info_htm_parse.xpath("//div[@class='inright']/table[@class='idetable']")[0]
                        .xpath("tr[2]/td[2]/span/text()")[0]
                        .encode("utf-8")
                    ).strip()

                    loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
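
The day/month branch that recurs in these examples splits a display string such as "30天" or "3个月" into a number and a unit. A compact sketch, assuming Loan.PERIOD_UNIT_DAY is "天" and Loan.PERIOD_UNIT_MONTH is "月" (the replace("个", "") call in the month branch suggests exactly that); split_period is an illustrative name:

# assumed values of Loan.PERIOD_UNIT_DAY / Loan.PERIOD_UNIT_MONTH
PERIOD_UNIT_DAY = "天"
PERIOD_UNIT_MONTH = "月"

def split_period(text):
    """'30天' -> ('30', '天'); '3个月' -> ('3', '月')."""
    if PERIOD_UNIT_DAY in text:
        return text.replace(PERIOD_UNIT_DAY, ""), PERIOD_UNIT_DAY
    return (text.replace("个", "").replace(PERIOD_UNIT_MONTH, ""),
            PERIOD_UNIT_MONTH)

assert split_period("30天") == ("30", "天")
assert split_period("3个月") == ("3", "月")
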
Example #34
def crawl():
    company_id = 13
    url = "http://www.yirendai.com/loan/list/1?period=12&currRate=&amt=&progress="
    request_headers = {'Referee': "http://www.yirendai.com/loan/list/1", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all ids seen online in this crawl
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='bidList']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath("div[2]/div[2]/div/div[@class='bid_empty_errortip']"):
                    continue
                href = str(loan.xpath("div[2]/div/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.yirendai.com" + href
                    loan_obj.title = str(loan.xpath("div[2]/div/h3/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("div[2]/div/div[3]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period = str(loan.xpath("div[2]/div/div[4]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
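
Example #34 has no progress column, so schedule is derived as cast / borrow_amount * 100. A guarded version of that arithmetic (illustrative; the original would raise ZeroDivisionError on a zero amount):

def schedule_percent(cast, borrow_amount):
    """Funding progress in percent, in the string form used above."""
    amount = float(borrow_amount)
    if amount == 0:
        return "0"  # guard for malformed rows; not in the original
    return str(float(cast) / amount * 100)

assert schedule_percent("2500", "10000") == "25.0"
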
Example #35
def crawl():
    company_id = 3
    url = "http://www.91wangcai.com/invest/index.html"
    request_headers = {
        'Referee': "http://www.91wangcai.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all ids seen online in this crawl
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="gb2312")
        loans = loan_htm_parse.xpath("//div[@class='proBoxNew']")

        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='hd']/a/@href")[0])
                original_id = href.split(".")[0].split("/")[2].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.91wangcai.com" + href
                    loan_obj.title = autodecode(
                        str(
                            loan.xpath("div[@class='hd']/a/text()")[0].encode(
                                "gb2312"))).encode("utf-8")

                    loan_obj.borrow_amount = autodecode(str(loan.xpath("div[@class='bd']/table/tr[1]/td[1]/em/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("¥", "")

                    loan_obj.rate = str(
                        loan.xpath(
                            "div[@class='bd']/table/tr[1]/td[2]/em/text()")
                        [0]).strip().replace("%", "")

                    loan_period_text = lxml.html.tostring(loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0]) \
                        .replace("<em>", "").replace("</em>", "")
                    html_parser = HTMLParser.HTMLParser()
                    period = html_parser.unescape(loan_period_text).encode(
                        "utf-8").strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.repayment = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[1]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("还款方式:", "")

                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
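
Example #35 is the only gb2312 site here, hence the autodecode(...).encode("utf-8") dance. autodecode appears to be a project helper; a minimal stand-in for the intended conversion (an assumption, not the project's actual implementation):

def to_utf8(raw, encoding="gb2312"):
    """Decode raw page bytes and re-encode them as UTF-8 bytes."""
    return raw.decode(encoding).encode("utf-8")

assert to_utf8(u"还款方式".encode("gb2312")) == u"还款方式".encode("utf-8")
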
Example #36
def crawl():
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    request_headers = {
        'Referee': "http://www.jimubox.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all ids seen online in this crawl
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath(
            "//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(
                    loan.xpath(
                        "div[@class='project-item']/div[@class='project-item-content']/h4/a/@href"
                    )[0])
                if not href.find("Index") > 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = str(
                        loan.xpath(
                            "div[@class='project-item']/div[@class='project-item-content']/h4/a/text()"
                        )[0].encode("utf-8"))
                    loan_obj.description = str(
                        loan.xpath(
                            "div[@class='project-item']/div[@class='project-item-content']/p[@class='project-detail']/text()"
                        )[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-sum-money']/text()")[0].encode("utf-8"))\
                        .strip() + "0000"
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")

                    rate = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-left']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    if rate.find("+") > 0:
                        rate_list = rate.split("+")
                        loan_obj.rate = str(
                            float(rate_list[0]) + float(rate_list[1]))
                    else:
                        loan_obj.rate = rate
                    loan_obj.repayment = str(
                        loan.xpath(
                            "div[@class='project-item']/div[@class='project-item-content']/h6/span/text()"
                        )[0].encode("utf-8"))
                    loan_obj.period = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-right']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
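
The rate field on jimubox can read "base+bonus" (e.g. "12+2"), which the code above sums. The same rule in isolation (combined_rate is an illustrative name):

def combined_rate(text):
    """'12+2' -> '14.0'; a plain '12' passes through unchanged."""
    if "+" in text:
        base, bonus = text.split("+")
        return str(float(base) + float(bonus))
    return text

assert combined_rate("12+2") == "14.0"
assert combined_rate("12") == "12"
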
Example #37
def crawl():
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all ids seen online in this crawl
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath(
            "//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if loan.xpath("td[last()]/a/@href")[0].encode(
                        "utf-8") == "javascript:;":
                    # skip loans that have already closed
                    continue
                href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
                original_id = href.split("id%3D")[1].encode("utf-8").strip()
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(
                            loan.xpath("td[7]/div/a/text()")[0].encode(
                                "UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.xinhehui.com" + href
                    title_1 = str(
                        loan.xpath("td[1]/p[1]/a/text()")[0].encode(
                            "utf-8")).strip()
                    if loan.xpath("td[1]/p[1]/a/em"):
                        title_2 = str(
                            loan.xpath("td[1]/p[1]/a/em/text()")[0].encode(
                                "utf-8")).strip()
                    else:
                        title_2 = str(
                            loan.xpath("td[1]/p[1]/a/span/text()")[0].encode(
                                "utf-8")).strip()
                    loan_obj.title = title_1 + title_2
                    borrow_amount = str(
                        loan.xpath("td[2]/span/text()")[0].encode(
                            "utf-8")).strip().replace(" ", "")
                    if borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = float(
                            borrow_amount.replace("万", "")) * 10000
                    else:
                        loan_obj.borrow_amount = float(
                            borrow_amount.replace("元", "").replace(",", ""))

                    if loan.xpath("td[4]/span"):
                        period = str(
                            loan.xpath("td[4]/span/@title")[0].encode(
                                "UTF-8")).strip()
                    else:
                        period = str(
                            loan.xpath("td[4]/text()")[0].encode(
                                "UTF-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.rate = str(
                        loan.xpath("td[3]/p/text()")[0]).strip().replace(
                            "%", "")
                    loan_obj.repayment = str(
                        loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(
                            loan.xpath("td[7]/div/a/text()")[0].encode(
                                "UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
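
The href on xinhehui carries a percent-encoded query, so the code above splits on the literal "id%3D" ("id=" with the "=" encoded). An equivalent that decodes first, sketched with the Python 2 stdlib used throughout these examples (id_from_href is an illustrative name):

import urllib

def id_from_href(href):
    """Decode '%3D' -> '=' and return everything after 'id='."""
    decoded = urllib.unquote(href)
    return decoded.split("id=")[1].strip()

assert id_from_href("/Financing/Invest/detail?id%3D123") == "123"
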
Example #38
def crawl():
    company_id = 24
    url = "http://www.he-pai.cn/investmentDetail/investmentDetails/ajaxInvmentList.do?pageNo=1"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # all loan ids currently in the db (status=0)
    db_ids_set = set()
    # all ids seen online in this crawl
    online_ids_set = set()
    # ids to create
    new_ids_set = set()
    # ids to update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = loads(htm, encoding="utf-8")
        loans = htm_obj["list"]
        if len(loans) > 0:
            for loan in loans:
                if str(loan["bID_SCHD"]) == "100":
                    # skip loans that have already closed
                    continue
                original_id = loan["lN_NO"]
                href = "http://www.he-pai.cn/investmentDetail/memberCenter/transferView.do?ln_no=" + original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = loan["lN_NM"]
                    loan_obj.borrow_amount = loan["lN_AMT"]
                    loan_obj.rate = loan["lN_RATE"]
                    loan_obj.period = loan["lN_TERM"]
                    loan_obj.period_unit = loan["lN_TERM_UNIT_DESC"]
                    loan_obj.schedule = str(loan["bID_SCHD"])
                    loan_obj.repayment = loan["pAY_METH_DESC"]

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s parse failed: %s", url, traceback.format_exc())
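
Example #38 filters on the JSON field bID_SCHD, the bid schedule in percent, and skips loans that already read 100. The same check in isolation (illustrative):

def is_fully_funded(loan):
    """True when the bid schedule shows 100 percent."""
    return str(loan["bID_SCHD"]) == "100"

assert is_fully_funded({"bID_SCHD": 100})
assert not is_fully_funded({"bID_SCHD": "85"})
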