Example #1
def crawler(sql):
    db = get_db_engine()
    items = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for item in items:
        shop_id = item[0]
        shop_type = item[1]
        item_id = item[2]
        url = item[3]

        try:
            htm = get_item_htm(item_id, url, db)
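            # shop_type 1 is a Taobao shop, shop_type 2 a Tmall shop (see the log
            # messages below); each branch scrapes the shop's current discount info.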
            if shop_type == 1:
                htm_obj = parse_html(htm, encoding='gb18030')
                discount_url = htm_obj.xpath("//div[@id='promote']/@data-default")
                if discount_url and len(discount_url) > 0:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(discount_url[0], item_headers)
                    if disc_content.strip():
                        disc_obj = parse_html(disc_content, encoding='gb18030')
                        content = disc_obj.xpath("//div[@id='J_MjsData']/h3/text()")[0].strip()
                        dates = disc_obj.xpath("//div[@id='J_MjsData']/h3/span[@class='tb-indate']/text()")[0].strip()
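                        # The date range looks like "2013年01月01日—2013年06月30日" (sometimes
                        # with "--" as separator); normalize the separator, split into start
                        # and end, and rewrite 年/月/日 so strptime can parse '%Y-%m-%d'.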
                        st = dates.encode('utf-8').replace("--","—").split("—")
                        start_time = datetime.datetime.strptime(st[0].strip().replace('年','-').replace("月","-").replace("日",""),'%Y-%m-%d')
                        end_time = datetime.datetime.strptime(st[1].strip().replace('年','-').replace("月","-").replace("日",""),'%Y-%m-%d')

                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, discount_url[0])
                        logger.info("taobao shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s not discount.", shop_id, url)
                else:
                    logger.warning("taobao shop %s:%s not discount.", shop_id, url)
            elif shop_type == 2:
                d_url = get_val(htm, "initApi")
                if d_url:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(d_url, item_headers)
                    cjson = loads(disc_content.decode('gb18030').encode('utf8'))
                    shop_prom = cjson['defaultModel']['itemPriceResultDO']['tmallShopProm']
                    if shop_prom:
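                        # startTime/endTime are millisecond epoch timestamps; convert to
                        # seconds before passing them to time.localtime.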
                        st = int(shop_prom['startTime'])/1000
                        et = int(shop_prom['endTime'])/1000
                        start_time = time.strftime("%Y-%m-%d", time.localtime(st))
                        end_time = time.strftime("%Y-%m-%d", time.localtime(et))
                        content = shop_prom['promPlan'][0]['msg']
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                            shop_id, content.encode('utf-8'), start_time, end_time, d_url)
                        logger.info("tmall shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s not discount.", shop_id, url)
        except:
            logger.error("shop %s:%s xpath failed:%s", shop_id, url, traceback.format_exc())
Example #2
def crawl_one_shop(shop, db):

    shop_id = shop[0]
    shop_type = shop[1]
    shop_url = shop[2]
    first_item_id = shop[3]
    item_html_id = shop[4]
    item_html = shop[5]

    urls = {'shop_url': shop_url}

    try:
        the_shop = ShopExtendInfo(db, shop_id)
        result = False

        logger.info("begin get shop extend info. shop id: %d. shop type: %d." %
                    (shop_id, shop_type))
        logger.debug("first item id: %d. item html id: %d. html length: %d" %
                     (first_item_id, item_html_id, len(item_html)))

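        # url_reg extracts the shop rate page URL embedded in the cached item HTML;
        # that page is downloaded next with the shop URL as referer.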
        url = url_reg.search(item_html).group(1).encode('utf-8')
        urls['shop_rate_url'] = url

        shop_html = download_with_referer(url, shop_url)

        if shop_html:
            logger.debug(
                "download shop extend html. shop_id: %d, item_id: %d, url: %s. length: %d"
                % (shop_id, first_item_id, url, len(shop_html)))

            shop_html_obj = parse_html(shop_html, 'gbk')
            item_html_obj = parse_html(item_html, 'gbk')
            if shop_type == 1:
                get_taobao_shop_extend_info(the_shop, shop_html_obj,
                                            item_html_obj, urls)
            else:
                get_tmall_shop_extend_info(the_shop, shop_html_obj,
                                           item_html_obj, urls)

            the_shop.save()
            result = True
        else:
            logger.error(
                "download shop extend html error. shop_id: %d, item_id: %d, url: %s."
                % (shop_id, first_item_id, url))

        if result:
            logger.info(
                "success get shop extend info. shop_id: %d. type: %d." %
                (shop_id, shop_type))
        else:
            logger.error("fail get shop extend info. shop_id: %d.  type: %d." %
                         (shop_id, shop_type))

    except:
        logger.error(
            "update_shop_extend_info failed. shop_id: %s. type: %d, error info: %s"
            % (shop_id, shop_type, traceback.format_exc()))
def get_tmall_shop_collected_count(the_shop, shop_html_obj, item_html_obj,  urls):
    """获取天猫店被关注数目"""
    try:
        is_done = False

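        # Look for the j_CollectBrandNum counter in three places, in order: the shop
        # page passed in, a freshly downloaded shop home page, and the item page.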
        if not is_done:
            collected_count = shop_html_obj.xpath(u"//em[@class='j_CollectBrandNum']/text()")
            if collected_count and collected_count[0].isdigit():
                the_shop.favorited_user_count = int(collected_count[0])
                is_done = True

        if not is_done:
            shop_home_html = download_with_referer(urls['shop_url'], None)
            shop_home_obj = parse_html(shop_home_html, 'gbk')
            collected_count = shop_home_obj.xpath(u"//em[@class='j_CollectBrandNum']/text()")
            if collected_count and collected_count[0].isdigit():
                the_shop.favorited_user_count = int(collected_count[0])
                is_done = True

        if not is_done:
            collected_count = item_html_obj.xpath(u"//em[@class='j_CollectBrandNum']/text()")
            if collected_count and collected_count[0].isdigit():
                the_shop.favorited_user_count = int(collected_count[0])
                is_done = True

        if not is_done:
            logger.error("get shop collected count failed. shop_id: %d." % the_shop.get_shop_id())
    except:
        logger.error("get shop favorite count failed. shop_id: %s. error info: %s" % (the_shop.get_shop_id(),  traceback.format_exc()))
Example #4
    def crawl_page(self, url):
        retry_count = 1
        while retry_count >= 0:
            try:
                data = self.download_page(url)
                if not data:
                    logger.warn("crawl %s %s failed", self.id, url)
                    return None, None, None, None
                if FLAGS.dump:
                    dumpf = open("%s_%s" % (self.id, url.replace('/', '_').replace(':','_').replace('&','_').replace('?','_')), 'w')
                    dumpf.write(data)
                    dumpf.close()
                if FLAGS.debug_parser:
                    import pdb; pdb.set_trace()
                if data.find(u"没有找到相应的店铺信息".encode('gbk')) > 0:
                    logger.warn("Shop %s is offline %s", self.id, self.url)
                    raise ShopOfflineException(data)

                html_obj = parse_html(data, encoding="gb18030")

                self.level_img = html_obj.xpath("//img[@class='rank']/@src")
                self.nick_url = html_obj.xpath("//a[@class='shop-name']/@href")
                if not self.nick_url:
                    self.nick_url = html_obj.xpath("//div[@id='shop-info']//a/@href")

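                # The shop search page comes in several layouts; try each known DOM
                # variant in turn until a result count, item nodes and pagination
                # links are found.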
                result = html_obj.xpath("//div[@id='anchor']//div[@class='search-result']//span/text()")
                items = html_obj.xpath("//div[@id='anchor']//div[@class='item']")
                pages = html_obj.xpath("//div[@id='anchor']//div[@class='pagination']/a[@class='J_SearchAsync']/@href")
                if not result:
                    result = ITEM_NUMBER_RE.findall(data)
                    if result and not items:
                        items = html_obj.xpath("//ul[@class='shop-list']//div[@class='item']")
                if not result:
                    result = html_obj.xpath("//div[@id='J_ShopSearchResult']//div[@class='search-result']//span/text()")
                    items = html_obj.xpath("//div[@id='J_ShopSearchResult']//dl[contains(@class, 'item')]")
                    pages = html_obj.xpath("//div[@id='J_ShopSearchResult']//div[@class='pagination']/a[@class='J_SearchAsync']/@href")
                if not result:
                    # pageLen = ['1/107']
                    pageLen = html_obj.xpath("//p[@class='ui-page-s']//b[@class='ui-page-s-len']/text()")
                    items = html_obj.xpath("//div[@class='J_TItems']//dl[contains(@class, 'item')]")
                    c = 0
                    if "/" in pageLen[0]:
                        c = int(pageLen[0].split("/")[1].strip()) * len(items)
                    else:
                        c = int(pageLen[0].strip()) * len(items)
                    result.append(str(c))
                    pages = html_obj.xpath("//div[@class='J_TItems']//div[@class='pagination']/a[@class='J_SearchAsync']/@href")

                if not result and not items and not pages:
                    logger.warn("crawl %s %s -- 0 items found, page len %s", self.id, url, len(data))
                    if retry_count > 0 and len(data) < 1024:
                        retry_count -= 1
                        time.sleep(1.0)
                        continue
                return result, items, pages, data
            except ShopOfflineException, e:
                raise e
            except BannedException, e:
                raise e
def crawl_one_shop(shop, db):

    shop_id = shop[0]
    shop_type = shop[1]
    shop_url = shop[2]
    first_item_id = shop[3]
    item_html_id = shop[4]
    item_html = shop[5]

    urls = {'shop_url': shop_url}

    try:
        the_shop = ShopExtendInfo(db, shop_id)
        result = False

        logger.info("begin get shop extend info. shop id: %d. shop type: %d." % (shop_id, shop_type))
        logger.debug("first item id: %d. item html id: %d. html length: %d" % (first_item_id, item_html_id, len(item_html)))

        url = url_reg.search(item_html).group(1).encode('utf-8')
        urls['shop_rate_url'] = url

        shop_html = download_with_referer(url, shop_url)

        if shop_html:
            logger.debug("download shop extend html. shop_id: %d, item_id: %d, url: %s. length: %d"
                % (shop_id, first_item_id, url, len(shop_html)))

            shop_html_obj = parse_html(shop_html, 'gbk')
            item_html_obj = parse_html(item_html, 'gbk')
            if shop_type == 1:
                get_taobao_shop_extend_info(the_shop, shop_html_obj, item_html_obj, urls)
            else:
                get_tmall_shop_extend_info(the_shop, shop_html_obj, item_html_obj, urls)

            the_shop.save()
            result = True
        else:
            logger.error("download shop extend html error. shop_id: %d, item_id: %d, url: %s." % (shop_id, first_item_id, url))

        if result:
            logger.info("success get shop extend info. shop_id: %d. type: %d." % (shop_id, shop_type))
        else:
            logger.error("fail get shop extend info. shop_id: %d.  type: %d." % (shop_id, shop_type))

    except:
        logger.error("update_shop_extend_info failed. shop_id: %s. type: %d, error info: %s" % (shop_id, shop_type, traceback.format_exc()))
Example #6
def crawl_tao123(shops):
    base_url = "http://dianpu.tao123.com/nvzhuang/%s.php"
    end = 22
    for i in range(1, end+1):
        url = base_url % i
        html = download(url)
        html_obj = parse_html(html)
        shops.update(html_obj.xpath("//div[@class='cg_shop_info']//a/@href"))
Example #7
def crawl_dirtbshop(shops):
    base_url = "http://dirtbshop.com/list_shop_%s_1_1.html"
    end = 251
    for i in range(1, end+1):
        url = base_url % i
        html = download(url)
        html_obj = parse_html(html)
        # import pdb; pdb.set_trace()
        urls = html_obj.xpath("//span[@class='grebtn_in']/a/@href")
Example #8
def crawl_tao123(shops):
    base_url = "http://www.meilishuo.com/shop/top/0/%s"
    end = 203
    for i in range(end):
        logger.debug("processing %s", i)
        url = base_url % i
        html = download(url)
        html_obj = parse_html(html)
        shops.update(html_obj.xpath("//div[@class='shop_item']//a/@href"))
Example #9
def crawl_tao123(shops):
    for line in open(FLAGS.path):
        try:
            line = line.strip()
            url = "http://www.meilishuo.com%s" % line
            html = download(url)
            html_obj = parse_html(html)
            shop_url = html_obj.xpath("//div[@class='shop_summary']/a/@href")
            logger.debug("processing %s -> %s", line, shop_url)
            shops.update(shop_url)
        except:
            logger.error("processing %s failed", line)
Example #11
    def crawl_title(self):
        try:
            self.data = self.crawl_page(self.url)
            if not self.data:
                logger.warn("download %s %s page failed, possible network connection failure", self.item_id, self.num_id)
                return

            # check tmall
            if not self.is_tmall and len(self.data) < 256 and self.url.find('item.taobao.com') > 0 and self.data.find("window.location.href='http://detail.tmall.com/item.htm'+window.location.search") > 0:
                self.data = self.crawl_page(self.url.replace('item.taobao.com', 'detail.tmall.com'))

            if self.check_offline():
                self.is_offline = True

            self.html_obj = parse_html(self.data, encoding="gb18030")

            title = self.html_obj.xpath("//html/head/title/text()")
            if title and title[0].find(u"转卖") > 0:
                self.is_offline = True
            if title:
                self.title = title[0].encode('utf8').replace("-淘宝网", "").replace("-tmall.com天猫", "")

            #tblogo = self.html_obj.xpath("//*[@id='shop-logo']")
            tmalllogo = self.html_obj.xpath("//*[@id='mallLogo']")
            if not tmalllogo:
                tmalllogo = self.html_obj.xpath("//*[@id='simple-logo']")
            if not self.is_tmall and tmalllogo:
                self.is_tmall = True

            self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
            if not len(self.thumbImages):
                try:
                    # try to load thumb images for a Tmall page
                    self.thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]

                    # taobao @src to @data-src
                    if not len(self.thumbImages):
                        self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")
                except:
                    logger.warn("No thumbs found %s", self.item_id)
            if self.is_tmall:
                self.cid = get_val(self.data, "categoryId").split('&')[0]
            else:
                self.cid = get_val(self.data, "cid")

            logger.info("Got %s %s html success", self.item_id, self.num_id)
        except:
            logger.error("crawling %s %s unknown exception %s", self.item_id, self.num_id, traceback.format_exc(), extra={'tags':['crawlItemException',]})
            raise
def crawl_main():
    for host in open(FLAGS.path):
        url = "http://%s" % (host.strip())
        try:
            html = download(url)
            #import pdb; pdb.set_trace()
            html_obj = parse_html(html, 'gbk')
            if url.find('tmall.com') > 0:
                shop_url = html_obj.xpath("//h3[@class='shop-title']/a/@href")[0]
                shop_name = html_obj.xpath("//h3[@class='shop-title']/a/text()")[0]
                print shop_url, shop_name.encode('utf8')
            else:
                shop_url = html_obj.xpath("//div[@class='shop-info-simple']/a/@href")[0]
                shop_name = html_obj.xpath("//div[@class='shop-info-simple']/a/text()")[0]
                shop_rank = html_obj.xpath("//span[@class='shop-rank']//img/@src")[0]
                #good_rate = html_obj.xpath("//li[@class='goodrate']/text()")[0]
                print shop_url, shop_name.encode('utf8'), shop_rank
        except KeyboardInterrupt:
            raise
        except:
            logger.warn("processing %s failed, %s", url, traceback.format_exc())
Example #13
def get_tmall_shop_collected_count(the_shop, shop_html_obj, item_html_obj,
                                   urls):
    """获取天猫店被关注数目"""
    try:
        is_done = False

        if not is_done:
            collected_count = shop_html_obj.xpath(
                u"//em[@class='j_CollectBrandNum']/text()")
            if collected_count and collected_count[0].isdigit():
                the_shop.favorited_user_count = int(collected_count[0])
                is_done = True

        if not is_done:
            shop_home_html = download_with_referer(urls['shop_url'], None)
            shop_home_obj = parse_html(shop_home_html, 'gbk')
            collected_count = shop_home_obj.xpath(
                u"//em[@class='j_CollectBrandNum']/text()")
            if collected_count and collected_count[0].isdigit():
                the_shop.favorited_user_count = int(collected_count[0])
                is_done = True

        if not is_done:
            collected_count = item_html_obj.xpath(
                u"//em[@class='j_CollectBrandNum']/text()")
            if collected_count and collected_count[0].isdigit():
                the_shop.favorited_user_count = int(collected_count[0])
                is_done = True

        if not is_done:
            logger.error("get shop collected count failed. shop_id: %d." %
                         the_shop.get_shop_id())
    except:
        logger.error(
            "get shop favorite count failed. shop_id: %s. error info: %s" %
            (the_shop.get_shop_id(), traceback.format_exc()))
Example #14
def crawl():
    company_id = 18
    url = "https://www.my089.com/Loan/default.aspx?pid=1"
    request_headers = {'Referee': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
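        # The pager text reads like "共N页"; strip the surrounding characters to get
        # the total number of pages.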
        page = str(htm_obj.xpath("//div[@class='yema rt']/span[@class='z_page']/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")
        for p in range(1, int(page) + 1):
            url = "https://www.my089.com/Loan/default.aspx?pid=" + str(p)
            logger.info("page url: %s", url)

            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath("//div[@class='Loan_box']/dl[@class='LoanList']")
            if len(loans) > 0:
                for loan in loans:
                    if str(loan.xpath("dd[last()]/p/span/text()")[0]) == "100%":
                        continue
                    href = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@href")[0])
                    original_id = href.split("=")[1].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "https://www.my089.com/Loan/" + href
                        loan_obj.title = str(loan.xpath("dd[2]/div[@class='txt_tou']/a/@title")[0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("dd[4]/span/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(loan.xpath("dd[3]/span/text()")[0].encode("UTF-8")).strip().replace("%/年", "")
                        loan_obj.period = str(loan.xpath("dd[5]/span/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                        s = str(loan.xpath("dd[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("个", "")
                        loan_obj.period_unit = s.split("/")[0].strip()
                        loan_obj.repayment = s.split("/")[1].strip()
                        loan_obj.schedule = str(loan.xpath("dd[last()]/p/span/text()")[0].encode("UTF-8")).strip().replace("%", "")
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Example #15
def crawl_item2(kwargs):
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']

    item_id = item[0]
    num_id = item[1]
    is_success = False
    crawl_result = ((item_id, {
        'suc1': 0,
        'count1': 0,
        'suc': 0,
        'count': 0
    }), )
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            items = conn.execute(
                "select html, desc_content from crawl_html where crawl_html.item_id=%s;"
                % item_id)
            result = list(items)
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1]

                html_obj = parse_html(html)
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [
                        IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in
                        html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")
                    ]
                    # taobao @src to @data-src
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath(
                            "//ul[@id='J_UlThumb']//img/@data-src")

                if len(thumbImages) == 0:
                    logger.error(
                        "crawl item %s %s not found thumb images html size %s",
                        item_id,
                        num_id,
                        len(html),
                        extra={'tags': [
                            'crawl_thumb_empty',
                        ]})
                    return crawl_result

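                # 'r' strips the JavaScript "var desc='...';" wrapper around the
                # description HTML; 'tr' matches image URLs ending in _WxH.jpg so the
                # size suffix can be stripped to recover the original image.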
                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M | re.S)
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                desc_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        desc_thumbs = desc_html_obj.xpath(
                            "//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath(
                            "//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!",
                                item_id,
                                num_id,
                                extra={'tags': [
                                    'crawl_nodesc',
                                ]})

                images = []
                pos = 1
                for url in thumbImages:
                    images.append((tr.sub(r'\1', url), pos, 1))
                    pos += 1
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                        pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                        pos += 1

                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path,
                                           server_path, org_server_path,
                                           kwargs['statshost'],
                                           kwargs['statsport'])
                item_crawler.crawl(images, ((710, 10000), ), is_commit, conn,
                                   is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary), )
        except Exception, e:
            logger.error("crawl item %s %s got exception %s",
                         item_id,
                         num_id,
                         traceback.format_exc(),
                         extra={'tags': [
                             'crawl_exception',
                         ]})
        finally:
            conn.close()
        Statsd.update_stats("guang.crawl.downimgcount",
                            crawl_result[0][1]['suc1'] +
                            crawl_result[0][1]['suc'],
                            host=kwargs['statshost'],
                            port=kwargs['statsport'])
        if is_success:
            logger.info("crawl item %s %s success %s", item_id, num_id,
                        crawl_result)
            Statsd.increment('guang.crawl.itemimg.succ',
                             host=kwargs['statshost'],
                             port=kwargs['statsport'])
        else:
            logger.warn("crawl item %s %s failed %s",
                        item_id,
                        num_id,
                        crawl_result,
                        extra={'tags': [
                            'crawl_failed',
                        ]})
            Statsd.increment('guang.crawl.itemimg.failed',
                             host=kwargs['statshost'],
                             port=kwargs['statsport'])
Example #16
def crawl_wzdai():
    url = "https://www.wzdai.com/invest/index.html?status=1&page=1&order=-3"
    request_headers = {
        'Referee': "https://www.wzdai.com",
        'User-Agent': DEFAULT_UA
    }

    company_id = 3

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        pages_obj = htm_obj.xpath(
            "//div[@class='page']/div[@align='center']/span/text()")[0]
        page = int(str(pages_obj.encode("utf-8")).split("条")[1].split("页")[0])
        for p in range(1, page + 1):
            url = "https://www.wzdai.com/invest/index.html?status=1&page=" + str(
                p) + "&order=-3"

            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath("//div[@class='invest_box']")
            if len(loans) > 0:
                for loan in loans:
                    href = "https://www.wzdai.com" + str(
                        loan.xpath("h1/a[@class='del']/@href")[0])
                    title = loan.xpath(
                        "h1/a[@class='del']/text()")[0].strip().encode("UTF-8")
                    borrow_amount = str(
                        loan.xpath(
                            "div[@class='invest_box_Info']/div[@class='prize']/span/b/text()"
                        )[0])
                    rate = str(
                        loan.xpath(
                            "div[@class='invest_box_Info']/div[@class='prize']/font/b/text()"
                        )[0])
                    text = loan.xpath(
                        "div[@class='invest_box_Info']/div[@class='text']")
                    loan_period = ""
                    repayment = ""
                    for lp in text:
                        p = lxml.html.tostring(lp).strip().replace(
                            "\r\n", "").split("<br>")
                        html_parser = HTMLParser.HTMLParser()
                        loan_period = html_parser.unescape(p[0].replace(
                            '<div class="text">',
                            "").strip()).encode("UTF-8").replace("借款期限:", "")
                        repayment = html_parser.unescape(
                            p[1].strip()).encode("UTF-8").replace("还款方式:", "")

                    cast = loan.xpath("div[@class='invest_box_Info']/div[@class='text2']/text()")[0].strip()\
                        .encode("UTF-8").replace("已投:¥", "").replace("元","")
                    schedule = str(
                        loan.xpath(
                            "div[@class='invest_box_Info']/div[@class='percent_big']/div[@class='percent_small']/font/text()"
                        )[0])

                    logger.info("%s %s %s %s %s %s %s %s", href, title,
                                borrow_amount, rate, cast, schedule,
                                loan_period, repayment)

                    db = get_db_engine()
                    db.execute(
                        "insert into loan (company_id,url,title,borrow_amount,rate,loan_period,"
                        "repayment,cast,schedule,crawl_status,status,create_time,update_time) "
                        "values (1,%s,%s,%s,%s,%s,%s,%s,%s,0,0,now(),now())",
                        href, title, borrow_amount, rate, loan_period,
                        repayment, cast, schedule)

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Example #17
def crawl():
    company_id = 26
    url = "http://www.htyd50.com/trade/borrow/bidding.htm"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath(
            "//div[@class='page_block']/div[@class='page_block_content']/div[@class='min_height_300 mb_30']/div[@class='w980 clearfix']"
        )
        print len(loans)
        if len(loans) > 0:
            for loan in loans:
                href = str(
                    loan.xpath("div[2]/div[1]/div[1]/a/@href")[0].encode(
                        "utf-8"))
                original_id = href.replace(".html", "").split("/")[5].strip()
                print href, original_id
        #        if original_id:
        #            online_ids_set.add(original_id)
        #
        #        if original_id in db_ids_set:
        #            update_ids_set.add(original_id)
        #
        #            loan_obj = Loan(company_id, original_id)
        #            if loan.xpath("td[7]/div/a"):
        #                loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
        #            else:
        #                loan_obj.schedule = "0"
        #            loan_obj.db_update(db)
        #        else:
        #            new_ids_set.add(original_id)
        #
        #            loan_obj = Loan(company_id, original_id)
        #            loan_obj.href = "https://www.xinhehui.com" + href
        #            title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
        #            if loan.xpath("td[1]/p[1]/a/em"):
        #                title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
        #            else:
        #                title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
        #            loan_obj.title = title_1 + title_2
        #            borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "")
        #            if borrow_amount.find("万") > 0:
        #                loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
        #            else:
        #                loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", ""))
        #
        #            if loan.xpath("td[4]/span"):
        #                period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
        #            else:
        #                period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
        #            if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
        #                loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
        #                loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
        #            else:
        #                loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
        #                loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
        #
        #            loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
        #            loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
        #            if loan.xpath("td[7]/div/a"):
        #                loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
        #            else:
        #                loan_obj.schedule = "0"
        #
        #            loan_obj.db_create(db)
        #
        #logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))
        #
        ## db ids minus freshly crawled ids = loans to take offline
        #off_ids_set = db_ids_set - online_ids_set
        #if off_ids_set:
        #    loan_obj = Loan(company_id)
        #    loan_obj.db_offline(db, off_ids_set)
        #    logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Example #18
def crawl():
    company_id = 20
    url = "http://www.xiaomabank.com/finance.do"
    request_headers = {"Referee": "http://www.xiaomabank.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='pil_main']/table[@class='pil_table']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = (
                        str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))
                        .replace("width:", "")
                        .strip()
                        .replace("%;", "")
                    )
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.xiaomabank.com/" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = (
                        str(loan.xpath("td[5]/strong/text()")[0].encode("utf-8")).strip().replace(",", "")
                    )
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).replace("个月", "").strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = (
                        str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8"))
                        .replace("width:", "")
                        .strip()
                        .replace("%;", "")
                    )

                    # Note: the detail page comes back gzip-compressed and must be decompressed first
                    resp = urllib2.urlopen(loan_obj.href)
                    respInfo = resp.info()
                    if ("Content-Encoding" in respInfo) and (respInfo["Content-Encoding"] == "gzip"):
                        respHtml = zlib.decompress(resp.read(), 16 + zlib.MAX_WBITS)
                        info_htm_parse = parse_html(respHtml, encoding="utf-8")
                        loan_obj.repayment = str(
                            info_htm_parse.xpath("//div[@id='pi_lt_bottom']/div[1]/div[1]/a/text()")[0].encode("utf-8")
                        )

                    loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Example #19
def crawl():
    company_id = 8
    url = "http://www.eloancn.com/new/loadAllTender.action?page=3&sidx=progress&sord=desc"
    request_headers = {'Referee': "http://www.eloancn.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        for p in range(1, 4):
            url = "http://www.eloancn.com/new/loadAllTender.action?page=%s" % p
            logger.info("page url:%s", url)
            # This page is awkward: a single loan's attributes are not contained in one div
            loan_htm = download_page(url, request_headers)
            loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
            htm_1 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd300 pdl10 fl']")
            htm_2 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd140 fl']")
            htm_3 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd130 fl pdl10']")
            htm_4 = loan_htm_parse.xpath("//div[@class='lendtable']/dl/dd[@class='wd130 fl']")

            loan_list = []
            for h1 in htm_1:
                loan_obj = Loan(company_id)
                loan_obj.title = str(h1.xpath("h3/a[@class='fl']/text()")[0].encode("utf-8"))
                loan_obj.href = str(h1.xpath("h3/a[@class='fl']/@href")[0]).replace(":80", "")
                loan_obj.original_id = loan_obj.href.split("=")[1]
                loan_list.append(loan_obj)
            for index, h2 in enumerate(htm_2):
                loan_list[index].borrow_amount = str(h2.xpath("p[@class='colorCb mt10']/text()")[0].encode("utf-8")).replace("¥","").replace(",","")
                loan_list[index].rate = str(h2.xpath("p[@class='colorE6']/span/text()")[0]).replace("%", "")
            for index, h3 in enumerate(htm_3):
                loan_list[index].period = str(h3.xpath("p/span/text()")[0].encode("utf-8"))
                loan_list[index].period_unit = loan_obj.PERIOD_UNIT_MONTH
                loan_list[index].repayment = str(h3.xpath("p[@class='']/text()")[0].encode("utf-8"))
            for index, h4 in enumerate(htm_4):
                loan_list[index].schedule = str(h4.xpath("p/span/em/text()")[0]).strip().replace("%", "")

            # Drop loans that are already fully funded
            new_list = [i for i in loan_list if i.schedule != "100"]

            for loan in new_list:
                online_ids_set.add(loan.original_id)
                if loan.original_id in db_ids_set:
                    update_ids_set.add(loan.original_id)

                    loan.db_update(db)
                else:
                    new_ids_set.add(loan.original_id)

                    loan.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

            time.sleep(5)

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj_off = Loan(company_id)
            loan_obj_off.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))


    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Example #20
def crawl():
    company_id = 13
    url = "http://www.yirendai.com/loan/list/1?period=12&currRate=&amt=&progress="
    request_headers = {'Referee': "http://www.yirendai.com/loan/list/1", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='bidList']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath("div[2]/div[2]/div/div[@class='bid_empty_errortip']"):
                    continue
                href = str(loan.xpath("div[2]/div/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.yirendai.com" + href
                    loan_obj.title = str(loan.xpath("div[2]/div/h3/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("div[2]/div/div[3]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period = str(loan.xpath("div[2]/div/div[4]/h4/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Example #21
def crawl():
    company_id = 3
    url = "http://www.91wangcai.com/invest/index.html"
    request_headers = {
        'Referee': "http://www.91wangcai.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="gb2312")
        loans = loan_htm_parse.xpath("//div[@class='proBoxNew']")

        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='hd']/a/@href")[0])
                original_id = href.split(".")[0].split("/")[2].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.91wangcai.com" + href
                    loan_obj.title = autodecode(
                        str(
                            loan.xpath("div[@class='hd']/a/text()")[0].encode(
                                "gb2312"))).encode("utf-8")

                    loan_obj.borrow_amount = autodecode(str(loan.xpath("div[@class='bd']/table/tr[1]/td[1]/em/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("¥", "")

                    loan_obj.rate = str(
                        loan.xpath(
                            "div[@class='bd']/table/tr[1]/td[2]/em/text()")
                        [0]).strip().replace("%", "")

                    loan_period_text = lxml.html.tostring(loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0]) \
                        .replace("<em>", "").replace("</em>", "")
                    html_parser = HTMLParser.HTMLParser()
                    period = html_parser.unescape(loan_period_text).encode(
                        "utf-8").strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.repayment = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[1]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("还款方式:", "")

                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids minus freshly crawled ids = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def get_taobao_shop_favorite_count(the_shop, shop_html_obj, urls):
    """获取淘宝店被收藏数目"""
    try:
        favorite_count_success = False

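        # The favorite counter sits behind a JSONP endpoint; its URL and lookup key
        # are packed into the span's data-info attribute and recovered by splitting
        # on '&'.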
        favorite_param = shop_html_obj.xpath(u"//div[@class='item collect-num']/span[contains(@data-info,'SCCP')]/@data-info")
        if favorite_param:
            the_param = favorite_param[0].split('&')
            favorite_url = "%s?callback=jsonp%d&t=%s&keys=%s" % (
                the_param[1].split('=')[1], random.randint(1000, 9999), str(int(time.time() * 1000)),
                the_param[0].split('=')[1]
            )
            urls['favorite_url'] = favorite_url
            favorite_html = download_with_referer(favorite_url, urls['shop_rate_url'])
            if favorite_html:
                logger.debug("download shop favorite html. shop_id: %d, url: %s. html length: %d." % (
                    the_shop.get_shop_id(), favorite_url, len(favorite_html))
                )
                the_shop.favorited_user_count = int(favorite_num_reg.search(favorite_html).group(1))
                favorite_count_success = True
            else:
                logger.error(
                    "download shop favorite html error. shop_id: %d, url: %s." % (the_shop.get_shop_id(), favorite_url)
                )

        if not favorite_count_success:
            logger.debug("use pattern left edition to get favorite data ")
            favorite_param = shop_html_obj.xpath(u"//li[@id='J_SCollCount'][@data-info]/@data-info")
            if favorite_param:
                the_param = favorite_param[0].split('&')
                favorite_url = "%s?t=%s&keys=%s&callback=setShopStat" % (
                    the_param[1].split('=')[1], str(int(time.time() * 1000)),
                    the_param[0].split('=')[1]
                )
                favorite_html = download_with_referer(favorite_url, urls['shop_rate_url'])
                if favorite_html:
                    the_shop.favorited_user_count = int(favorite_num_reg.search(favorite_html).group(1))
                    favorite_count_success = True

        if not favorite_count_success:
            logger.debug("use pattern for old edition to get favorite data ")

            shop_description_url = shop_html_obj.xpath(u"//a[@title='店铺介绍']/@href")
            if shop_description_url:
                shop_description_html = download_with_referer(shop_description_url[0], urls['shop_rate_url'])
                if shop_description_html:
                    shop_description_html_obj = parse_html(shop_description_html, 'gbk')
                    favorite_param = shop_description_html_obj.xpath(u"//li[@id='J_SCollCount'][@data-info]/@data-info")
                    if favorite_param:
                        the_param = favorite_param[0].split('&')
                        favorite_url = "%s?t=%s&keys=%s&callback=setShopStat" % (
                            the_param[1].split('=')[1], str(int(time.time() * 1000)),
                            the_param[0].split('=')[1]
                        )
                        favorite_html = download_with_referer(favorite_url, shop_description_url)
                        if favorite_html:
                            the_shop.favorited_user_count = int(favorite_num_reg.search(favorite_html).group(1))
                            favorite_count_success = True



        if not favorite_count_success:
            logger.error("get shop favorite count failed. shop_id: %d." % the_shop.get_shop_id())
    except:
        logger.error("get shop favorite count failed. shop_id: %s. error info: %s" % (the_shop.get_shop_id(),  traceback.format_exc()))
Example #23
def crawler(sql):
    db = get_db_engine()
    shops = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for shop in shops:
        shop_id = shop[0]
        url = str(shop[1])
        type = shop[2]
        if url[-1] != '/':
            url += "/"
        try:
            shop_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
            dongtai_url = url + "dongtai.htm"
            dongtai_data = download(dongtai_url, shop_headers)
            if dongtai_data:
                dongtai_obj = parse_html(dongtai_data, encoding="gb18030")
                dongtai_title = dongtai_obj.xpath("//title/text()")[0].encode(
                    'utf-8')
                if '店铺动态' in dongtai_title:
                    microscope_data = dongtai_obj.xpath(
                        "//*[@name='microscope-data']/@content")
                    userId = get_val(str(microscope_data), "userId")

                    if userId:
                        dongtai_headers = {
                            'Referer': dongtai_url,
                            'User-Agent': DEFAULT_UA
                        }
                        promotion_url = "http://shuo.taobao.com/feed/v_front_feeds.htm?_input_charset=utf-8&page=1" \
                                        "&userId=%s&vfeedTabId=115" % userId
                        promotion_data = download(promotion_url,
                                                  dongtai_headers)

                        if promotion_data:
                            promotion_obj = parse_html(promotion_data,
                                                       encoding="gb18030")
                            i = 0
                            while i < 10:
                                feedInfo = promotion_obj.xpath(
                                    "//div[@class='fd-item show-detail']//div[@class='fd-text']//span[@class='J_FeedInfo']/text()"
                                )[i].encode('utf-8')
                                if '店铺促销中' in feedInfo or '限时折扣' in feedInfo or '折扣限时' in feedInfo:
                                    #title = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dt//a/text()")[i]
                                    link = promotion_obj.xpath(
                                        "//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//a[@class='fd-view-detail']/@href"
                                    )[i]
                                    promotion_price = promotion_obj.xpath(
                                        "//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price']/strong/text()"
                                    )[i]
                                    price = promotion_obj.xpath(
                                        "//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price g_price-original']/strong/text()"
                                    )[i]
                                    promotion_time = promotion_obj.xpath(
                                        u"//div[@class='fd-item show-detail']//div[@class='fd-container']//dd[contains(text(),'起止日期')]/text()"
                                    )[i]
                                    pt = promotion_time.encode(
                                        'utf-8').replace("起止日期:",
                                                         "").split(" - ")
                                    start_time = pt[0].replace(".", "-")
                                    end_time = pt[1].replace(".", "-")
                                    # prepend the year when the end date carries no year at all
                                    if '2013' not in pt[1] and '2014' not in pt[1]:
                                        end_time = '2013-' + end_time

                                    if start_time > end_time:
                                        end_time = end_time.replace(
                                            "2013", "2014")

                                    num_id = get_numiid(link, dongtai_headers)
                                    if num_id:
                                        sql = "select id from shop_promotion where shop_id=%s and num_id=%s" % (
                                            shop_id, num_id)
                                        re = list(db.execute(sql))
                                        if not re:
                                            db.execute(
                                                "insert into shop_promotion (shop_id, num_id, price, "
                                                "promotion_price, start_time, end_time, create_time, "
                                                "last_update_time) values (%s,'%s',%s,%s,'%s','%s',now(),now())"
                                                % (shop_id, num_id,
                                                   price.replace(',', ''),
                                                   promotion_price.replace(
                                                       ',', ''), start_time,
                                                   end_time))
                                    else:
                                        logger.error(
                                            "shop %s:%s crawler num_id failed",
                                            shop_id, url)

                                i += 1
                                logger.info(
                                    "shop %s:%s crawler promotion item num=%s",
                                    shop_id, url, i)

                        else:
                            logger.warning("shop %s:%s not promotion info",
                                           shop_id, url)
                    else:
                        logger.error("shop %s:%s crawler userId failed",
                                     shop_id, url)
                else:
                    logger.error("shop %s:%s not dongtai page", shop_id, url)
        except:
            logger.error("shop %s:%s crawler failed %s", shop_id, url,
                         traceback.format_exc())
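The date handling in the loop above is easy to lose in the XPath noise, so here it is as a standalone helper: strip the 起止日期 prefix, split on " - ", fill in a missing year on the end date, and roll the end date into the next year when the range crosses New Year. The input format is an assumption inferred from the string operations above.

# -*- coding: utf-8 -*-

def parse_promotion_period(promotion_time, default_year='2013'):
    pt = promotion_time.replace("起止日期:", "").split(" - ")
    start_time = pt[0].replace(".", "-")
    end_time = pt[1].replace(".", "-")
    # the end date sometimes omits the year; fill it in
    if '2013' not in pt[1] and '2014' not in pt[1]:
        end_time = default_year + '-' + end_time
    # a promotion crossing New Year ends in the following year
    if start_time > end_time:
        end_time = end_time.replace("2013", "2014")
    return start_time, end_time

# e.g. parse_promotion_period("起止日期:2013.12.28 - 01.05") -> ('2013-12-28', '2014-01-05')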
Beispiel #24
0
def crawl():
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    request_headers = {'Referer': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if loan.xpath("td[last()]/a/@href")[0].encode("utf-8") == "javascript:;":
                    # skip loans that have already ended
                    continue
                href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
                original_id = href.split("id%3D")[1].encode("utf-8").strip()
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.xinhehui.com" + href
                    title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
                    if loan.xpath("td[1]/p[1]/a/em"):
                        title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
                    else:
                        title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.title = title_1 + title_2
                    borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "")
                    if borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
                    else:
                        loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", ""))

                    if loan.xpath("td[4]/span"):
                        period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
                    else:
                        period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
                    loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = the ones to take offline (see the set-difference sketch after this example)
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #25
0
def crawl():
    company_id = 13
    url = "http://www.yirendai.com/loan/list/1?period=12&currRate=&amt=&progress="
    request_headers = {
        'Referee': "http://www.yirendai.com/loan/list/1",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='bidList']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath(
                        "div[2]/div[2]/div/div[@class='bid_empty_errortip']"):
                    continue
                href = str(loan.xpath("div[2]/div/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(
                        float(loan_obj.cast) / float(loan_obj.borrow_amount) *
                        100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.yirendai.com" + href
                    loan_obj.title = str(
                        loan.xpath("div[2]/div/h3/a/text()")[0].encode(
                            "utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[2]/div/div[2]/h4/span/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    loan_obj.rate = str(
                        loan.xpath("div[2]/div/div[3]/h4/span/text()")
                        [0].encode("utf-8")).strip()
                    loan_obj.period = str(
                        loan.xpath("div[2]/div/div[4]/h4/span/text()")
                        [0].encode("utf-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.cast = str(loan.xpath("div[2]/div/div[1]/p/text()")[0].encode("utf-8")).strip()\
                        .replace("元", "").split("已投")[1]
                    loan_obj.schedule = str(
                        float(loan_obj.cast) / float(loan_obj.borrow_amount) *
                        100)

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #26
0
def crawl():
    company_id = 20
    url = "http://www.xiaomabank.com/finance.do"
    request_headers = {'Referee': "http://www.xiaomabank.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='pil_main']/table[@class='pil_table']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8")).replace("width:", "").strip().replace("%;", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.xiaomabank.com/" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[5]/strong/text()")[0].encode("utf-8")).strip().replace(",", "")
                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).replace("个月", "").strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/div[1]/div/@style")[0].encode("utf-8")).replace("width:", "").strip().replace("%;", "")

                    # note: the response here comes back gzip-compressed and must be decompressed; see the sketch after this example
                    resp = urllib2.urlopen(loan_obj.href)
                    respInfo = resp.info()
                    if(("Content-Encoding" in respInfo) and (respInfo['Content-Encoding'] == "gzip")):
                        respHtml = zlib.decompress(resp.read(), 16+zlib.MAX_WBITS)
                        info_htm_parse = parse_html(respHtml, encoding="utf-8")
                        loan_obj.repayment = str(info_htm_parse.xpath("//div[@id='pi_lt_bottom']/div[1]/div[1]/a/text()")[0].encode("utf-8"))

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #27
0
def crawl():
    company_id = 15
    url = "https://www.iqianbang.com/invest"
    request_headers = {"Referee": "https://www.iqianbang.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='item']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if str(loan.xpath("td[7]/text()")[0].encode("utf-8")).strip() != "融资中":
                    continue
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("-")[3].replace(".shtml", "")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.iqianbang.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = (
                        str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip().replace(",", "").replace("元", "")
                    )
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = (
                        str(loan.xpath("td[2]/span/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    )
                    period = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")

                    # need to open the loan detail page here
                    loan_info_htm = download_page(loan_obj.href, headers={"Referer": url, "User-Agent": DEFAULT_UA})
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="UTF-8")
                    loan_obj.repayment = str(
                        loan_info_htm_parse.xpath("//div[@class='inright']/table[@class='idetable']")[0]
                        .xpath("tr[2]/td[2]/span/text()")[0]
                        .encode("utf-8")
                    ).strip()

                    loan_obj.db_create(db)

            logger.info(
                "company %s crawler loan: new size %s, update size %s",
                company_id,
                len(new_ids_set),
                len(update_ids_set),
            )

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #28
0
def crawl():
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    request_headers = {
        'Referee': "http://www.jimubox.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath(
            "//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(
                    loan.xpath(
                        "div[@class='project-item']/div[@class='project-item-content']/h4/a/@href"
                    )[0])
                if not href.find("Index") > 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = str(
                        loan.xpath(
                            "div[@class='project-item']/div[@class='project-item-content']/h4/a/text()"
                        )[0].encode("utf-8"))
                    loan_obj.description = str(
                        loan.xpath(
                            "div[@class='project-item']/div[@class='project-item-content']/p[@class='project-detail']/text()"
                        )[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-sum-money']/text()")[0].encode("utf-8"))\
                        .strip() + "0000"
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")

                    rate = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-left']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    if rate.find("+") > 0:
                        rate_list = rate.split("+")
                        loan_obj.rate = str(
                            float(rate_list[0]) + float(rate_list[1]))
                    else:
                        loan_obj.rate = rate
                    loan_obj.repayment = str(
                        loan.xpath(
                            "div[@class='project-item']/div[@class='project-item-content']/h6/span/text()"
                        )[0].encode("utf-8"))
                    loan_obj.period = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-right']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #29
0
def crawl():
    company_id = 16
    # url = "http://www.itouzi.com/dinvest/invest/index"
    url = "http://www.itouzi.com/dinvest/debt/index"
    request_headers = {"Referee": "http://www.itouzi.com", "User-Agent": DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        # note: the ul class attribute ends with a trailing space
        loans = loan_htm_parse.xpath("//ul[@class='invest-product-case-list mtn btn clearfix ']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath("div[@class='i-p-c-subscription']/ul[@class='i-p-c-s-detail']"):
                    continue
                href = str(loan.xpath("h2/a[@class='fl']/@href")[0])
                original_id = href.split("id=")[1]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    # loan_obj = Loan(company_id, original_id)
                    # loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    # loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.itouzi.com" + href
                    loan_obj.title = str(loan.xpath("h2/a[@class='fl']/text()")[0].encode("utf-8")).strip()
                    loan_obj.repayment = (
                        str(loan.xpath("p/span[2]/text()")[0].encode("utf-8")).strip().replace("还款方式:", "")
                    )
                    loan_obj.borrow_amount = int(loan.xpath("p/span[3]/strong/text()")[0]) * 10000

                    loan_obj.rate = (
                        str(loan.xpath("p/span[5]/em[1]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    )
                    period = str(loan.xpath("p/span[4]/strong/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    # the schedule handling here is still unverified; re-check once a live loan is available
                    if loan.xpath("div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']"):
                        loan_obj.schedule = (
                            str(
                                loan.xpath(
                                    "div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']/span[1]/span[last()]/text()"
                                )[0].encode("utf-8")
                            )
                            .strip()
                            .replace("%", "")
                        )
                        print loan_obj.schedule
                    # loan_obj.db_create(db)
        #
        #    logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))
        #
        ## db ids - freshly crawled ids = the ones to take offline
        # off_ids_set = db_ids_set - online_ids_set
        # if off_ids_set:
        #    loan_obj = Loan(company_id)
        #    loan_obj.db_offline(db, off_ids_set)
        #    logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #30
0
    def crawl_title(self):
        try:
            self.data = self.crawl_page(self.url)
            if not self.data:
                logger.warn(
                    "download %s %s page failed, possible network connection failure",
                    self.item_id, self.num_id)
                return

            # check tmall
            if not self.is_tmall and len(self.data) < 256 and self.url.find(
                    'item.taobao.com'
            ) > 0 and self.data.find(
                    "window.location.href='http://detail.tmall.com/item.htm'+window.location.search"
            ) > 0:
                self.data = self.crawl_page(
                    self.url.replace('item.taobao.com', 'detail.tmall.com'))

            if self.check_offline():
                self.is_offline = True

            self.html_obj = parse_html(self.data, encoding="gb18030")

            title = self.html_obj.xpath("//html/head/title/text()")
            if title and title[0].find(u"转卖") > 0:
                self.is_offline = True
            if title:
                self.title = title[0].encode('utf8').replace(
                    "-淘宝网", "").replace("-tmall.com天猫", "")

            #tblogo = self.html_obj.xpath("//*[@id='shop-logo']")
            tmalllogo = self.html_obj.xpath("//*[@id='mallLogo']")
            if not tmalllogo:
                tmalllogo = self.html_obj.xpath("//*[@id='simple-logo']")
            if not self.is_tmall and tmalllogo:
                self.is_tmall = True

            self.thumbImages = self.html_obj.xpath(
                "//ul[@id='J_UlThumb']//img/@src")
            if not len(self.thumbImages):
                try:
                    # try load thumb images for tmall page
                    self.thumbImages = [
                        IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in
                        self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")
                    ]

                    # taobao @src to @data-src
                    if not len(self.thumbImages):
                        self.thumbImages = self.html_obj.xpath(
                            "//ul[@id='J_UlThumb']//img/@data-src")
                except:
                    logger.warn("No thumbs found %s", self.item_id)
            if self.is_tmall:
                self.cid = get_val(self.data, "categoryId").split('&')[0]
            else:
                self.cid = get_val(self.data, "cid")

            logger.info("Got %s %s html success", self.item_id, self.num_id)
        except:
            logger.error("crawling %s %s unknown exception %s",
                         self.item_id,
                         self.num_id,
                         traceback.format_exc(),
                         extra={'tags': [
                             'crawlItemException',
                         ]})
            raise
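crawl_title() falls back through three places to find thumbnails: img/@src, the li/@style background URL on Tmall pages, and img/@data-src on lazy-loading Taobao pages. The sketch below isolates that fallback; the IMAGESTYLE_RE pattern shown here is only an assumed shape of the module-level regex used above.

import re
import lxml.html

# assumed shape of the IMAGESTYLE_RE used above
IMAGESTYLE_RE = re.compile(r'background:url\((.*?)\).*')

def extract_thumb_urls(html):
    obj = lxml.html.fromstring(html)
    thumbs = obj.xpath("//ul[@id='J_UlThumb']//img/@src")
    if not thumbs:
        # tmall keeps the image inside the li style attribute
        thumbs = [IMAGESTYLE_RE.subn(r'\g<1>', style)[0]
                  for style in obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
    if not thumbs:
        # lazy-loaded taobao pages move @src to @data-src
        thumbs = obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")
    return thumbs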
def crawl_item2(kwargs):
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']

    item_id = item[0]
    num_id = item[1]
    is_success = False
    crawl_result = ((item_id, {'suc1': 0, 'count1': 0, 'suc': 0, 'count': 0}),)
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            items = conn.execute("select html, desc_content from crawl_html where crawl_html.item_id=%s;" % item_id)
            result = list(items)
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1] 
                html_obj = parse_html(html)
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                    # taobao @src to @data-src
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")

                if len(thumbImages) == 0:
                    logger.error("crawl item %s %s not found thumb images html size %s", item_id, num_id, len(html), extra={'tags':['crawl_thumb_empty',]})
                    return crawl_result

                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M|re.S)
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                tr_new = re.compile("(.+\.(jpg|png|gif))[^$]*.jpg$")
                desc_thumbs = desc_table_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        desc_table_thumbs = desc_html_obj.xpath("//table/@background")
                        desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!", item_id, num_id, extra={'tags':['crawl_nodesc',]})

                images = []
                pos = 1
                for url in thumbImages:
                    ori_url = None
                    if tr.match(url):
                        ori_url = tr.sub(r'\1', url)
                    else:
                        if tr_new.match(url):
                            ori_url = tr_new.sub(r'\1', url)
                        else:
                            logger.error("crawl item %s %s thumb image urls can not be parsed!", item_id, num_id, extra={'tags':['crawl_exception',]})

                    images.append((ori_url, pos, 1))
                    pos += 1
                for url in desc_table_thumbs:
                    images.append((url, pos, 2))
                    pos += 1
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                        pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                        pos += 1

                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path, server_path, org_server_path, kwargs['statshost'], kwargs['statsport'])
                item_crawler.crawl(images, ((710,10000),), is_commit, conn, is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary),)
        except Exception, e:
            logger.error("crawl item %s %s got exception %s", item_id, num_id, traceback.format_exc(), extra={'tags':['crawl_exception',]})
        finally:
            conn.close()
        Statsd.update_stats("guang.crawl.downimgcount", crawl_result[0][1]['suc1'] + crawl_result[0][1]['suc'],
            host = kwargs['statshost'], port = kwargs['statsport'])
        if is_success:
            logger.info("crawl item %s %s success %s", item_id, num_id, crawl_result)
            Statsd.increment('guang.crawl.itemimg.succ', host = kwargs['statshost'], port = kwargs['statsport'])
        else:
            logger.warn("crawl item %s %s failed %s", item_id, num_id, crawl_result, extra={'tags':['crawl_failed',]})
            Statsd.increment('guang.crawl.itemimg.failed', host = kwargs['statshost'], port = kwargs['statsport'])
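The thumbnail-to-original URL step in crawl_item2() relies on two patterns: one strips a trailing _WxH.jpg size suffix, the other keeps the first .jpg/.png/.gif and drops anything appended after it. Isolated below; the example URL is invented.

import re

tr = re.compile(r"(.*)_\d+x\d+\.jpg$")                 # "...photo.jpg_60x60.jpg" -> "...photo.jpg"
tr_new = re.compile(r"(.+\.(jpg|png|gif))[^$]*.jpg$")  # "...photo.png_q90.jpg"   -> "...photo.png"

def original_image_url(url):
    if tr.match(url):
        return tr.sub(r'\1', url)
    if tr_new.match(url):
        return tr_new.sub(r'\1', url)
    return None  # thumbnail URL in an unexpected format

# e.g. original_image_url("http://img.example.com/photo.jpg_60x60.jpg")
#      -> "http://img.example.com/photo.jpg"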
Beispiel #32
0
def crawl():
    company_id = 12
    url = "http://www.renrendai.com/lend/loanList.action"
    request_headers = {'Referer': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans_script = htm_obj.xpath("//script[@id='loan-list-rsp']/text()")[0].encode("utf-8")
        loans_json = loads(loans_script, encoding="UTF-8")
        loan_size = len(loans_json["data"]["loans"])
        if loan_size > 0:
            for i in range(0, loan_size):
                if loans_json["data"]["loans"][i]["status"] != "OPEN":
                    # skip loans that have already ended
                    continue
                original_id = str(int(loans_json["data"]["loans"][i]["loanId"]))
                href = "http://www.renrendai.com/lend/detailPage.action?loanId=%s" % original_id
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(int(loans_json["data"]["loans"][i]["finishedRatio"])).split(".")[0]
                    loan_obj.cast = str(float(loans_json["data"]["loans"][i]["amount"]) - float(loans_json["data"]["loans"][i]["surplusAmount"]))
                    loan_obj.db_update(db)
                else:
                    pass
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = href
                    loan_obj.title = str(loans_json["data"]["loans"][i]["title"].encode("utf-8"))
                    loan_obj.borrow_amount = str(loans_json["data"]["loans"][i]["amount"])
                    loan_obj.period = str(int(loans_json["data"]["loans"][i]["months"]))
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.rate = str(loans_json["data"]["loans"][i]["interest"])
                    loan_obj.cast = str(float(loans_json["data"]["loans"][i]["amount"]) - float(loans_json["data"]["loans"][i]["surplusAmount"]))
                    loan_obj.schedule = str(int(loans_json["data"]["loans"][i]["finishedRatio"])).split(".")[0]
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #33
0
def crawl():
    company_id = 23
    url = "https://member.niwodai.com/xiangmu/"
    request_headers = {'Referer': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm, encoding="utf-8")
        loan_size = int(str(htm_obj.xpath("//div[@class='biaoList']/table/tbody/tr[1]/th[last()]/text()")[0].encode("utf-8"))\
            .replace("共", "").replace("个标", "").strip())
        if loan_size > 0:
            page = loan_size / 10
            if loan_size % 10 > 0:
                page += 1
            for p in range(1, page + 1):
                page_url = "https://member.niwodai.com/loan/loan.do?pageNo=%d&totalCount=%d" % (
                    p, loan_size)
                page_html = download_page(page_url, request_headers)
                page_obj = parse_html(page_html, encoding="utf-8")
                loans = page_obj.xpath(
                    "//div[@class='biaoList']/table/tbody/tr")
                for loan in loans:
                    if lxml.html.tostring(loan).find("<th>") > 0:
                        continue
                    href = str(loan.xpath("td[1]/a/@href")[0])
                    original_id = href.replace(".html", "").split("/")[2]
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(
                            loan.xpath("td[5]/text()")[0].encode(
                                "utf-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = REFEREE + href
                        loan_obj.title = str(
                            loan.xpath("td[1]/a/text()")[0].encode(
                                "utf-8")).strip()
                        loan_obj.borrow_amount = str(
                            loan.xpath("td[4]/em/text()")[0].encode(
                                "utf-8")).strip().replace(",", "")
                        loan_obj.rate = str(
                            loan.xpath("td[2]/em/text()")[0].encode(
                                "utf-8")).strip().replace("%", "")
                        loan_obj.period = str(
                            loan.xpath("td[3]/em/text()")[0].encode(
                                "utf-8")).strip()
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        loan_obj.schedule = str(
                            loan.xpath("td[5]/text()")[0].encode(
                                "utf-8")).strip().replace("%", "")

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #34
0
def crawl():
    company_id = 21
    url = "http://www.id68.cn/invest/index/borrow_status/9/p/1.html"
    request_headers = {'Referer': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//ul[@class='ideal_con']/li")
        if len(loans) > 0:
            for loan in loans:
                if str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]) == "100.00%":
                    # skip loans that have already ended
                    continue
                href = str(loan.xpath("table/tr[1]/td[1]/a/@href")[0].encode("utf-8"))
                original_id = href.replace(".html", "").split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = REFEREE + href
                    loan_obj.title = str(loan.xpath("table/tr[1]/td[1]/a/@title")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("table/tr[2]/td[last()]/span/text()")[0].encode("utf-8"))\
                        .strip().replace(" ", "").replace(",", "")
                    loan_obj.repayment = str(loan.xpath("table/tr[2]/td[4]/span/text()")[0].encode("utf-8")).strip()
                    loan_obj.rate = str(loan.xpath("table/tr[2]/td[2]/span/text()")[0].encode("utf-8")).strip().replace("%", "")
                    loan_obj.period = str(loan.xpath("table/tr[2]/td[3]/span/text()")[0].encode("utf-8")).strip()\
                        .replace(" ", "").replace("个月", "")
                    # the unit text stripped above is "个月" (months), so record the period in months
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("table/tr[last()]/td[2]/div/span/text()")[0]).strip().replace("%", "")

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #35
0
def crawl():
    company_id = 8
    url = "http://www.eloancn.com/new/loadAllTender.action?page=3&sidx=progress&sord=desc"
    request_headers = {
        'Referee': "http://www.eloancn.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        for p in range(1, 4):
            url = "http://www.eloancn.com/new/loadAllTender.action?page=%s" % p
            logger.info("page url:%s", url)
            # this page is messy: a single loan's attributes are spread across separate divs; see the sketch after this example
            loan_htm = download_page(url, request_headers)
            loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
            htm_1 = loan_htm_parse.xpath(
                "//div[@class='lendtable']/dl/dd[@class='wd300 pdl10 fl']")
            htm_2 = loan_htm_parse.xpath(
                "//div[@class='lendtable']/dl/dd[@class='wd140 fl']")
            htm_3 = loan_htm_parse.xpath(
                "//div[@class='lendtable']/dl/dd[@class='wd130 fl pdl10']")
            htm_4 = loan_htm_parse.xpath(
                "//div[@class='lendtable']/dl/dd[@class='wd130 fl']")

            loan_list = []
            for h1 in htm_1:
                loan_obj = Loan(company_id)
                loan_obj.title = str(
                    h1.xpath("h3/a[@class='fl']/text()")[0].encode("utf-8"))
                loan_obj.href = str(
                    h1.xpath("h3/a[@class='fl']/@href")[0]).replace(":80", "")
                loan_obj.original_id = loan_obj.href.split("=")[1]
                loan_list.append(loan_obj)
            for index, h2 in enumerate(htm_2):
                loan_list[index].borrow_amount = str(
                    h2.xpath("p[@class='colorCb mt10']/text()")[0].encode(
                        "utf-8")).replace("¥", "").replace(",", "")
                loan_list[index].rate = str(
                    h2.xpath("p[@class='colorE6']/span/text()")[0]).replace(
                        "%", "")
            for index, h3 in enumerate(htm_3):
                loan_list[index].period = str(
                    h3.xpath("p/span/text()")[0].encode("utf-8"))
                loan_list[index].period_unit = loan_obj.PERIOD_UNIT_MONTH
                loan_list[index].repayment = str(
                    h3.xpath("p[@class='']/text()")[0].encode("utf-8"))
            for index, h4 in enumerate(htm_4):
                loan_list[index].schedule = str(
                    h4.xpath("p/span/em/text()")[0]).strip().replace("%", "")

            # drop loans that are already fully funded
            new_list = [i for i in loan_list if i.schedule != "100"]

            for loan in new_list:
                online_ids_set.add(loan.original_id)
                if loan.original_id in db_ids_set:
                    update_ids_set.add(loan.original_id)

                    loan.db_update(db)
                else:
                    new_ids_set.add(loan.original_id)

                    loan.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

            time.sleep(5)

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj_off = Loan(company_id)
            loan_obj_off.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #36
0
def crawl():
    company_id = 3
    url = "http://www.91wangcai.com/invest/index.html"
    request_headers = {'Referee': "http://www.91wangcai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="gb2312")
        loans = loan_htm_parse.xpath("//div[@class='proBoxNew']")

        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='hd']/a/@href")[0])
                original_id = href.split(".")[0].split("/")[2].encode("utf-8")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.91wangcai.com" + href
                    loan_obj.title = autodecode(str(loan.xpath("div[@class='hd']/a/text()")[0].encode("gb2312"))).encode("utf-8")

                    loan_obj.borrow_amount = autodecode(str(loan.xpath("div[@class='bd']/table/tr[1]/td[1]/em/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("¥", "")

                    loan_obj.rate = str(loan.xpath("div[@class='bd']/table/tr[1]/td[2]/em/text()")[0]).strip().replace("%", "")

                    loan_period_text = lxml.html.tostring(loan.xpath("div[@class='bd']/table/tr[1]/td[3]/*")[0]) \
                        .replace("<em>", "").replace("</em>", "")
                    html_parser = HTMLParser.HTMLParser()
                    period = html_parser.unescape(loan_period_text).encode("utf-8").strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.repayment = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[1]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("还款方式:", "")

                    loan_obj.schedule = autodecode(str(loan.xpath("div[@class='bd']/table/tr[2]/td[2]/text()")[0].encode("gb2312"))) \
                        .encode("utf-8").replace("融资进度:", "").replace("借款进度:", "").strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - freshly crawled ids = the ones to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #37
0
def crawl():
    company_id = 26
    url = "http://www.htyd50.com/trade/borrow/bidding.htm"
    request_headers = {'Referer': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//div[@class='page_block']/div[@class='page_block_content']/div[@class='min_height_300 mb_30']/div[@class='w980 clearfix']")
        print len(loans)
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[2]/div[1]/div[1]/a/@href")[0].encode("utf-8"))
                original_id = href.replace(".html", "").split("/")[5].strip()
                print href, original_id
        #        if original_id:
        #            online_ids_set.add(original_id)
        #
        #        if original_id in db_ids_set:
        #            update_ids_set.add(original_id)
        #
        #            loan_obj = Loan(company_id, original_id)
        #            if loan.xpath("td[7]/div/a"):
        #                loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
        #            else:
        #                loan_obj.schedule = "0"
        #            loan_obj.db_update(db)
        #        else:
        #            new_ids_set.add(original_id)
        #
        #            loan_obj = Loan(company_id, original_id)
        #            loan_obj.href = "https://www.xinhehui.com" + href
        #            title_1 = str(loan.xpath("td[1]/p[1]/a/text()")[0].encode("utf-8")).strip()
        #            if loan.xpath("td[1]/p[1]/a/em"):
        #                title_2 = str(loan.xpath("td[1]/p[1]/a/em/text()")[0].encode("utf-8")).strip()
        #            else:
        #                title_2 = str(loan.xpath("td[1]/p[1]/a/span/text()")[0].encode("utf-8")).strip()
        #            loan_obj.title = title_1 + title_2
        #            borrow_amount = str(loan.xpath("td[2]/span/text()")[0].encode("utf-8")).strip().replace(" ", "")
        #            if borrow_amount.find("万") > 0:
        #                loan_obj.borrow_amount = float(borrow_amount.replace("万", "")) * 10000
        #            else:
        #                loan_obj.borrow_amount = float(borrow_amount.replace("元", "").replace(",", ""))
        #
        #            if loan.xpath("td[4]/span"):
        #                period = str(loan.xpath("td[4]/span/@title")[0].encode("UTF-8")).strip()
        #            else:
        #                period = str(loan.xpath("td[4]/text()")[0].encode("UTF-8")).strip()
        #            if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
        #                loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
        #                loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
        #            else:
        #                loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
        #                loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
        #
        #            loan_obj.rate = str(loan.xpath("td[3]/p/text()")[0]).strip().replace("%", "")
        #            loan_obj.repayment = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
        #            if loan.xpath("td[7]/div/a"):
        #                loan_obj.schedule = str(loan.xpath("td[7]/div/a/text()")[0].encode("UTF-8")).strip().replace("%", "")
        #            else:
        #                loan_obj.schedule = "0"
        #
        #            loan_obj.db_create(db)
        #
        #logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))
        #
        ## db ids - newly crawled ids = ids to take offline
        #off_ids_set = db_ids_set - online_ids_set
        #if off_ids_set:
        #    loan_obj = Loan(company_id)
        #    loan_obj.db_offline(db, off_ids_set)
        #    logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #38
0
def crawl():
    company_id = 7
    url = "http://www.jimubox.com/Project/List?status=1"
    request_headers = {'Referee': "http://www.jimubox.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='row']/div[@class='span3 project-card']")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/@href")[0])
                if not href.find("Index") > 0:
                    continue
                original_id = href.split("/")[3].encode("utf-8")

                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.jimubox.com" + href
                    loan_obj.title = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h4/a/text()")[0].encode("utf-8"))
                    loan_obj.description = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-detail']/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-sum-money']/text()")[0].encode("utf-8"))\
                        .strip() + "0000"
                    loan_obj.cast = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/p[@class='project-info']/span[@class='project-current-money']/text()")[0].encode("utf-8"))\
                        .strip().replace("/", "").replace(",", "")

                    rate = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-left']/span/text()")[0].encode("utf-8"))\
                        .strip()
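                    # some rates are shown as "base+bonus" (e.g. "12+2"); sum the two parts for the effective rate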
                    if rate.find("+") > 0:
                        rate_list = rate.split("+")
                        loan_obj.rate = str(float(rate_list[0]) + float(rate_list[1]))
                    else:
                        loan_obj.rate = rate
                    loan_obj.repayment = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/h6/span/text()")[0].encode("utf-8"))
                    loan_obj.period = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='project-other']/div[@class='project-other-right']/span/text()")[0].encode("utf-8"))\
                        .strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loan.xpath("div[@class='project-item']/div[@class='project-item-content']/div[@class='progress project-progress']/div/@style")[0])\
                        .replace("width:", "").strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #39
0
def crawl():
    company_id = 23
    url = "https://member.niwodai.com/xiangmu/"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm, encoding="utf-8")
        loan_size = int(str(htm_obj.xpath("//div[@class='biaoList']/table/tbody/tr[1]/th[last()]/text()")[0].encode("utf-8"))\
            .replace("共", "").replace("个标", "").strip())
        if loan_size > 0:
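            # 10 loans per listing page: integer-divide, then add one extra page for any remainder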
            page = loan_size / 10
            if loan_size % 10 > 0:
                page += 1
            for p in range(1, page+1):
                page_url = "https://member.niwodai.com/loan/loan.do?pageNo=%d&totalCount=%d" % (p, loan_size)
                page_html = download_page(page_url, request_headers)
                page_obj = parse_html(page_html, encoding="utf-8")
                loans = page_obj.xpath("//div[@class='biaoList']/table/tbody/tr")
                for loan in loans:
                    if lxml.html.tostring(loan).find("<th>") > 0:
                        continue
                    href = str(loan.xpath("td[1]/a/@href")[0])
                    original_id = href.replace(".html", "").split("/")[2]
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = REFEREE + href
                        loan_obj.title = str(loan.xpath("td[1]/a/text()")[0].encode("utf-8")).strip()
                        loan_obj.borrow_amount = str(loan.xpath("td[4]/em/text()")[0].encode("utf-8")).strip().replace(",", "")
                        loan_obj.rate = str(loan.xpath("td[2]/em/text()")[0].encode("utf-8")).strip().replace("%", "")
                        loan_obj.period = str(loan.xpath("td[3]/em/text()")[0].encode("utf-8")).strip()
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        loan_obj.schedule = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip().replace("%", "")

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #40
0
def crawl_wzdai():
    url = "https://www.wzdai.com/invest/index.html?status=1&page=1&order=-3"
    request_headers = {'Referee': "https://www.wzdai.com", 'User-Agent': DEFAULT_UA}

    company_id = 3

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        pages_obj = htm_obj.xpath("//div[@class='page']/div[@align='center']/span/text()")[0]
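        # the pager text looks like "共N条M页"; take the page count between "条" and "页"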
        page = int(str(pages_obj.encode("utf-8")).split("条")[1].split("页")[0])
        for p in range(1, page + 1):
            url = "https://www.wzdai.com/invest/index.html?status=1&page=" + str(p) + "&order=-3"

            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath("//div[@class='invest_box']")
            if len(loans) > 0:
                for loan in loans:
                    href = "https://www.wzdai.com" + str(loan.xpath("h1/a[@class='del']/@href")[0])
                    title = loan.xpath("h1/a[@class='del']/text()")[0].strip().encode("UTF-8")
                    borrow_amount = str(loan.xpath("div[@class='invest_box_Info']/div[@class='prize']/span/b/text()")[0])
                    rate = str(loan.xpath("div[@class='invest_box_Info']/div[@class='prize']/font/b/text()")[0])
                    text = loan.xpath("div[@class='invest_box_Info']/div[@class='text']")
                    loan_period = ""
                    repayment = ""
                    for lp in text:
                        p = lxml.html.tostring(lp).strip().replace("\r\n", "").split("<br>")
                        html_parser = HTMLParser.HTMLParser()
                        loan_period = html_parser.unescape(p[0].replace('<div class="text">', "").strip()).encode("UTF-8").replace("借款期限:", "")
                        repayment = html_parser.unescape(p[1].strip()).encode("UTF-8").replace("还款方式:", "")

                    cast = loan.xpath("div[@class='invest_box_Info']/div[@class='text2']/text()")[0].strip()\
                        .encode("UTF-8").replace("已投:¥", "").replace("元","")
                    schedule = str(loan.xpath("div[@class='invest_box_Info']/div[@class='percent_big']/div[@class='percent_small']/font/text()")[0])

                    logger.info("%s %s %s %s %s %s %s %s", href, title, borrow_amount, rate, cast, schedule, loan_period, repayment)

                    db = get_db_engine()
                    db.execute("insert into loan (company_id,url,title,borrow_amount,rate,loan_period,"
                              "repayment,cast,schedule,crawl_status,status,create_time,update_time) "
                               "values (1,%s,%s,%s,%s,%s,%s,%s,%s,0,0,now(),now())", href, title, borrow_amount,
                               rate,loan_period,repayment,cast,schedule)

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #41
0
def crawl():
    company_id = 14
    url = "http://www.licaifan.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//ul[@class='main-list tab-con2']/li[1]/table/tr")
        if len(loans) > 0:
            # note: the first row is the table header and not needed, so start from index 1
            for i in range(1, len(loans)):
                if str(loans[i].xpath("td[last()]/a/text()")[0].encode("utf-8")) == "投资满额":
                    continue
                href = str(loans[i].xpath("td[1]/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip()\
                        .replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.licaifan.com" + href
                    loan_obj.title = str(loans[i].xpath("td[1]/h3/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loans[i].xpath("td[3]/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = str(loans[i].xpath("td[2]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    period = str(loans[i].xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip()\
                        .replace("%", "")

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #42
0
def get_taobao_shop_favorite_count(the_shop, shop_html_obj, urls):
    """获取淘宝店被收藏数目"""
    try:
        favorite_count_success = False

        favorite_param = shop_html_obj.xpath(
            u"//div[@class='item collect-num']/span[contains(@data-info,'SCCP')]/@data-info"
        )
        if favorite_param:
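            # data-info appears to hold "keys=<key>&<name>=<count-service-url>"; split it and assemble the JSONP favorites-count URL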
            the_param = favorite_param[0].split('&')
            favorite_url = "%s?callback=jsonp%d&t=%s&keys=%s" % (
                the_param[1].split('=')[1], random.randint(1000, 9999),
                str(int(time.time() * 1000)), the_param[0].split('=')[1])
            urls['favorite_url'] = favorite_url
            favorite_html = download_with_referer(favorite_url,
                                                  urls['shop_rate_url'])
            if favorite_html:
                logger.debug(
                    "download shop favorite html. shop_id: %d, url: %s. html length: %d."
                    %
                    (the_shop.get_shop_id(), favorite_url, len(favorite_html)))
                the_shop.favorited_user_count = int(
                    favorite_num_reg.search(favorite_html).group(1))
                favorite_count_success = True
            else:
                logger.error(
                    "download shop favorite html error. shop_id: %d, url: %s."
                    % (the_shop.get_shop_id(), favorite_url))

        if not favorite_count_success:
            logger.debug("use pattern left edition to get favorite data ")
            favorite_param = shop_html_obj.xpath(
                u"//li[@id='J_SCollCount'][@data-info]/@data-info")
            if favorite_param:
                the_param = favorite_param[0].split('&')
                favorite_url = "%s?t=%s&keys=%s&callback=setShopStat" % (
                    the_param[1].split('=')[1], str(int(
                        time.time() * 1000)), the_param[0].split('=')[1])
                favorite_html = download_with_referer(favorite_url,
                                                      urls['shop_rate_url'])
                if favorite_html:
                    the_shop.favorited_user_count = int(
                        favorite_num_reg.search(favorite_html).group(1))
                    favorite_count_success = True

        if not favorite_count_success:
            logger.debug("use pattern for old edition to get favorite data ")

            shop_description_url = shop_html_obj.xpath(
                u"//a[@title='店铺介绍']/@href")
            if shop_description_url:
                shop_description_html = download_with_referer(
                    shop_description_url[0], urls['shop_rate_url'])
                if shop_description_html:
                    shop_description_html_obj = parse_html(
                        shop_description_html, 'gbk')
                    favorite_param = shop_description_html_obj.xpath(
                        u"//li[@id='J_SCollCount'][@data-info]/@data-info")
                    if favorite_param:
                        the_param = favorite_param[0].split('&')
                        favorite_url = "%s?t=%s&keys=%s&callback=setShopStat" % (
                            the_param[1].split('=')[1],
                            str(int(time.time() * 1000)),
                            the_param[0].split('=')[1])
                        favorite_html = download_with_referer(
                            favorite_url, shop_description_url[0])
                        if favorite_html:
                            the_shop.favorited_user_count = int(
                                favorite_num_reg.search(favorite_html).group(
                                    1))
                            favorite_count_success = True

        if not favorite_count_success:
            logger.error("get shop favorite count failed. shop_id: %d." %
                         the_shop.get_shop_id())
    except:
        logger.error(
            "get shop favorite count failed. shop_id: %s. error info: %s" %
            (the_shop.get_shop_id(), traceback.format_exc()))
Beispiel #43
0
def crawl():
    company_id = 11
    url = "https://www.tzydb.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath("//div[@id='proList']/ul[@class='item_li']")
        if len(loans) > 0:
            for loan in loans:
                schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip()
                if schedule == "100%" or schedule == "100.0%":
                    # skip loans that have already closed
                    continue
                # link = https://www.tzydb.com/boot/lookup/971,1017
                a_script = str(loan.xpath("li/div[1]/div[1]/div/a/@href")[0].encode("utf-8"))
                o_id = ID_RE.findall(a_script)[0]
                original_id = o_id.replace(",", "-")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)
                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.tzydb.com/boot/lookup/" + o_id
                    loan_obj.title = str(loan.xpath("li/div[1]/div[1]/div/a/text()")[0].encode("utf-8"))
                    loan_obj.borrow_amount = str(loan.xpath("li/div[2]/div[1]/span/text()")[0].encode("utf-8")).strip()\
                        .replace(" ", "").replace(",", "")
                    loan_obj.period = str(loan.xpath("li/div[2]/div[3]/span/text()")[0].encode("UTF-8")).strip()
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.rate = str(loan.xpath("li/div[2]/div[2]/span/text()")[0]).strip().replace("%", "")
                    loan_obj.schedule = str(loan.xpath("li/div[last()]/div[1]/span[2]/strong/text()")[0].encode("UTF-8")).strip().replace("%", "")

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #44
0
def crawl():
    company_id = 26
    url = "http://www.longlongweb.com/invests"
    request_headers = {'Referee': "http://www.longlongweb.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="utf-8")
        loans = loan_htm_parse.xpath("//div[@class='main01']/span/dl")
        if len(loans) > 0:
            for loan in loans:
                if not lxml.html.tostring(loan.xpath("dd/p[1]")[0]).find("href") > 0:
                    continue

                href = str(loan.xpath("dd/table/tr[1]/td[1]/a/@href")[0])
                original_id = href.split("/")[2]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))\
                        .replace(",", "").replace("¥", "").strip()

                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0]
                                        .encode("utf-8")).replace(",", "").replace("¥", "").strip()
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.longlongweb.com" + href
                    loan_obj.title = str(loan.xpath("dd/table/tr[1]/td[1]/a/@title")[0].encode("utf-8"))
                    loan_obj.rate = str(loan.xpath("dd/table/tr[2]/td[2]/span/text()")[0].encode("utf-8"))\
                        .replace("%", "")
                    loan_obj.borrow_amount = str(loan.xpath("dd/table/tr[2]/td[1]/span/text()")[0].encode("utf-8"))\
                        .replace(",", "").replace("¥", "").strip()
                    loan_obj.period = str(loan.xpath("dd/table/tr[2]/td[3]/span/text()")[0])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_info_htm = download_page(loan_obj.href, request_headers)
                    loan_info_htm_parse = parse_html(loan_info_htm, encoding="utf-8")
                    loan_obj.cast = str(loan_info_htm_parse.xpath("//div[@class='zrll']/span[1]/text()")[0]
                                        .encode("utf-8")).replace(",", "").replace("¥", "").strip()
                    loan_obj.schedule = str(float(loan_obj.cast) / float(loan_obj.borrow_amount) * 100)
                    loan_obj.repayment = str(loan_info_htm_parse.xpath("//div[@class='enterprise-botton']/span[2]/text()")[0]
                                             .encode("utf-8")).strip().replace("还款方式:", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))
    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
def crawler(sql):
    db = get_db_engine()
    shops = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for shop in shops:
        shop_id = shop[0]
        url = str(shop[1])
        type = shop[2]
        if url[-1] != '/':
            url += "/"
        try:
            shop_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
            dongtai_url = url + "dongtai.htm"
            dongtai_data = download(dongtai_url, shop_headers)
            if dongtai_data:
                dongtai_obj = parse_html(dongtai_data, encoding="gb18030")
                dongtai_title = dongtai_obj.xpath("//title/text()")[0].encode('utf-8')
                if '店铺动态' in dongtai_title:
                    microscope_data = dongtai_obj.xpath("//*[@name='microscope-data']/@content")
                    userId = get_val(str(microscope_data), "userId")

                    if userId:
                        dongtai_headers = {'Referer': dongtai_url, 'User-Agent': DEFAULT_UA}
                        promotion_url = "http://shuo.taobao.com/feed/v_front_feeds.htm?_input_charset=utf-8&page=1" \
                                        "&userId=%s&vfeedTabId=115" % userId
                        promotion_data = download(promotion_url, dongtai_headers)

                        if promotion_data:
                            promotion_obj = parse_html(promotion_data, encoding="gb18030")
                            i = 0
                            while i < 10:
                                feedInfo = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-text']//span[@class='J_FeedInfo']/text()")[i].encode('utf-8')
                                if '店铺促销中' in feedInfo or '限时折扣' in feedInfo or '折扣限时' in feedInfo:
                                    #title = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dt//a/text()")[i]
                                    link = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//a[@class='fd-view-detail']/@href")[i]
                                    promotion_price = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price']/strong/text()")[i]
                                    price = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price g_price-original']/strong/text()")[i]
                                    promotion_time = promotion_obj.xpath(u"//div[@class='fd-item show-detail']//div[@class='fd-container']//dd[contains(text(),'起止日期')]/text()")[i]
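                                    # promotion_time reads like "起止日期:<start> - <end>" with "."-separated dates; strip the label, split, and normalise "." to "-"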
                                    pt = promotion_time.encode('utf-8').replace("起止日期:","").split(" - ")
                                    start_time = pt[0].replace(".", "-")
                                    end_time = pt[1].replace(".", "-")
                                    # assume year 2013 when the end date omits the year entirely
                                    if '2013' not in pt[1] and '2014' not in pt[1]:
                                        end_time = '2013-' + end_time

                                    if start_time > end_time:
                                        end_time = end_time.replace("2013", "2014")

                                    num_id = get_numiid(link, dongtai_headers)
                                    if num_id:
                                        sql = "select id from shop_promotion where shop_id=%s and num_id=%s" % (shop_id, num_id)
                                        re = list(db.execute(sql))
                                        if not re:
                                            db.execute("insert into shop_promotion (shop_id, num_id, price, "
                                                       "promotion_price, start_time, end_time, create_time, "
                                                       "last_update_time) values (%s,'%s',%s,%s,'%s','%s',now(),now())"
                                                       % (shop_id, num_id, price.replace(',', ''), promotion_price.replace(',', ''), start_time, end_time))
                                    else:
                                        logger.error("shop %s:%s crawler num_id failed", shop_id, url)

                                i += 1
                                logger.info("shop %s:%s crawler promotiom item num=%s", shop_id, url, i)

                        else:
                            logger.warning("shop %s:%s not promotion info", shop_id, url)
                    else:
                        logger.error("shop %s:%s crawler userId failed", shop_id, url)
                else:
                    logger.error("shop %s:%s not dongtai page", shop_id, url)
        except:
            logger.error("shop %s:%s crawler failed %s", shop_id, url, traceback.format_exc())
Beispiel #46
0
def crawl():
    company_id = 16
    #url = "http://www.itouzi.com/dinvest/invest/index"
    url = "http://www.itouzi.com/dinvest/debt/index"
    request_headers = {
        'Referee': "http://www.itouzi.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        # note: the ul class attribute has a trailing space
        loans = loan_htm_parse.xpath(
            "//ul[@class='invest-product-case-list mtn btn clearfix ']/li")
        if len(loans) > 0:
            for loan in loans:
                if not loan.xpath(
                        "div[@class='i-p-c-subscription']/ul[@class='i-p-c-s-detail']"
                ):
                    continue
                href = str(loan.xpath("h2/a[@class='fl']/@href")[0])
                original_id = href.split("id=")[1]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    #loan_obj = Loan(company_id, original_id)
                    #loan_obj.schedule = str(loan.xpath("td[6]/text()")[0].encode("utf-8")).strip().replace("%", "")
                    #loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.itouzi.com" + href
                    loan_obj.title = str(
                        loan.xpath("h2/a[@class='fl']/text()")[0].encode(
                            "utf-8")).strip()
                    loan_obj.repayment = str(loan.xpath("p/span[2]/text()")[0].encode("utf-8"))\
                        .strip().replace("还款方式:", "")
                    loan_obj.borrow_amount = int(
                        loan.xpath("p/span[3]/strong/text()")[0]) * 10000

                    loan_obj.rate = str(
                        loan.xpath("p/span[5]/em[1]/text()")[0].encode(
                            "utf-8")).strip().replace("%", "")
                    period = str(
                        loan.xpath("p/span[4]/strong/text()")[0].encode(
                            "utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    # the schedule parsing here is still uncertain; re-check once a live loan is available
                    if loan.xpath(
                            "div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']"
                    ):
                        loan_obj.schedule = str(
                            loan.xpath(
                                "div[@class='i-p-c-subscription']/div[@class='i-p-c-s-detail']/span[1]/span[last()]/text()"
                            )[0].encode("utf-8")).strip().replace("%", "")
                        print loan_obj.schedule
                    #loan_obj.db_create(db)
        #
        #    logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))
        #
        ## db ids - newly crawled ids = ids to take offline
        #off_ids_set = db_ids_set - online_ids_set
        #if off_ids_set:
        #    loan_obj = Loan(company_id)
        #    loan_obj.db_offline(db, off_ids_set)
        #    logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #47
0
def crawl():
    company_id = 2
    url = "http://www.ppdai.com/lend/12_s1_p1"
    request_headers = {'Referee': "http://www.ppdai.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='fen_ye_nav']/table/tr/td[last()]/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")

        for p in range(1, int(page) + 1):
            url = "http://www.ppdai.com/lend/12_s1_p" + str(p)
            logger.info("page url: %s", url)

            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath("//div[@class='lend_nav']/table/tr")
            if len(loans) > 0:
                for loan in loans:
                    if lxml.html.tostring(loan).find("tit_nav") > 0:
                        continue
                    href = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@href")[0])
                    original_id = href.split("/")[2].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
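                        # the progress cell reads like "... 完成NN%"; keep only the number after "完成"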
                        loan_obj.schedule = str(loan.xpath("td[last()]/p[1]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("%", "").split("完成")[1]
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id)
                        loan_obj.original_id = original_id
                        loan_obj.href = "http://www.ppdai.com" + href
                        loan_obj.title = str(loan.xpath("td[1]/ul/li[2]/p[1]/a/@title")[0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(loan.xpath("td[4]/text()")[0]).strip().replace("%", "")
                        period = str(loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip().replace(" ", "")
                        if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                            loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                        else:
                            loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                            loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                        loan_obj.schedule = float(str(loan.xpath("td[last()]/p[1]/text()")[0].encode("UTF-8")).strip().replace(" ", "").replace("%", "").split("完成")[1])

                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #48
0
def crawl():
    company_id = 15
    url = "https://www.iqianbang.com/invest"
    request_headers = {
        'Referee': "https://www.iqianbang.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@class='item']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if str(loan.xpath("td[7]/text()")[0].encode(
                        "utf-8")).strip() != "融资中":
                    continue
                href = str(loan.xpath("td[1]/a/@href")[0])
                original_id = href.split("-")[3].replace(".shtml", "")
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(
                        loan.xpath("td[6]/text()")[0].encode(
                            "utf-8")).strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.iqianbang.com" + href
                    loan_obj.title = str(
                        loan.xpath("td[1]/a/text()")[0].encode(
                            "utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[3]/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "").replace("元", "")
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(
                            loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = str(
                        loan.xpath("td[2]/span/span/text()")[0].encode(
                            "utf-8")).strip().replace("%", "")
                    period = str(
                        loan.xpath("td[4]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(
                        loan.xpath("td[6]/text()")[0].encode(
                            "utf-8")).strip().replace("%", "")

                    # need to fetch the loan detail page here
                    loan_info_htm = download_page(loan_obj.href,
                                                  headers={
                                                      'Referee': url,
                                                      'User-Agent': DEFAULT_UA
                                                  })
                    loan_info_htm_parse = parse_html(loan_info_htm,
                                                     encoding="UTF-8")
                    loan_obj.repayment = str(
                        loan_info_htm_parse.xpath(
                            "//div[@class='inright']/table[@class='idetable']")
                        [0].xpath("tr[2]/td[2]/span/text()")[0].encode(
                            "utf-8")).strip()

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s",
                        company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #49
0
def crawl():
    company_id = 18
    url = "https://www.my089.com/Loan/default.aspx?pid=1"
    request_headers = {
        'Referee': "http://www.ppdai.com",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        page = str(htm_obj.xpath("//div[@class='yema rt']/span[@class='z_page']/text()")[0].encode("UTF-8"))\
            .replace("共", "").replace("页", "")
        for p in range(1, int(page) + 1):
            url = "https://www.my089.com/Loan/default.aspx?pid=" + str(p)
            logger.info("page url: %s", url)

            loan_htm = download_page(url, request_headers)
            loan_obj = parse_html(loan_htm)
            loans = loan_obj.xpath(
                "//div[@class='Loan_box']/dl[@class='LoanList']")
            if len(loans) > 0:
                for loan in loans:
                    if str(loan.xpath("dd[last()]/p/span/text()")
                           [0]) == "100%":
                        continue
                    href = str(
                        loan.xpath("dd[2]/div[@class='txt_tou']/a/@href")[0])
                    original_id = href.split("=")[1].encode("utf-8")
                    if original_id:
                        online_ids_set.add(original_id)

                    if original_id in db_ids_set:
                        update_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.schedule = str(
                            loan.xpath("dd[last()]/p/span/text()")[0].encode(
                                "UTF-8")).strip().replace("%", "")
                        loan_obj.db_update(db)
                    else:
                        new_ids_set.add(original_id)

                        loan_obj = Loan(company_id, original_id)
                        loan_obj.href = "https://www.my089.com/Loan/" + href
                        loan_obj.title = str(
                            loan.xpath("dd[2]/div[@class='txt_tou']/a/@title")
                            [0].encode("UTF-8"))
                        loan_obj.borrow_amount = str(loan.xpath("dd[4]/span/text()")[0].encode("UTF-8")).strip().replace("¥", "")\
                            .replace(",", "")
                        loan_obj.rate = str(
                            loan.xpath("dd[3]/span/text()")[0].encode(
                                "UTF-8")).strip().replace("%/年", "")
                        loan_obj.period = str(
                            loan.xpath("dd[5]/span/text()")[0].encode(
                                "UTF-8")).strip().replace(" ", "")
                        s = str(loan.xpath("dd[5]/text()")[0].encode(
                            "UTF-8")).strip().replace(" ",
                                                      "").replace("个", "")
                        loan_obj.period_unit = s.split("/")[0].strip()
                        loan_obj.repayment = s.split("/")[1].strip()
                        loan_obj.schedule = str(
                            loan.xpath("dd[last()]/p/span/text()")[0].encode(
                                "UTF-8")).strip().replace("%", "")
                        loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #50
0
def crawl():
    company_id = 4
    url = "https://www.yinhu.com/loan/loan_list.bl"
    request_headers = {'Referee': "https://www.yinhu.com", 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(db.execute("select original_id from loan where company_id=%s and status=0", company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()
    # offline
    off_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath("//div[@id='loan_list']/table/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                href = str(loan.xpath("td[1]/p/a/@href")[0])
                original_id = href.split("=")[1].encode("utf-8")
                try:
                    loan_status = str(loan.xpath("td[last()]/em/span/text()")[0].encode("utf-8")).strip()
                except:
                    loan_status = str(loan.xpath("td[last()]/a/span/text()")[0].encode("utf-8")).strip()

                if original_id and loan_status != "还款中":
                    online_ids_set.add(original_id)

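                # "还款中" (repaying) and "满标" (fully funded) loans are closed; queue the ones already in the db to be taken offline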
                if loan_status == "还款中" or loan_status == "满标":
                    if original_id in db_ids_set:
                        off_ids_set.add(original_id)
                    continue

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loan.xpath("td[6]/div[@class='bar_bg']/div/span/span/text()")[0].encode("utf-8"))\
                        .strip().replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.yinhu.com" + href
                    loan_obj.title = str(loan.xpath("td[1]/p/a/text()")[0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loan.xpath("td[4]/text()")[0].encode("utf-8")).strip().replace(",", "")\
                        .replace("元", "")

                    loan_obj.rate = str(loan.xpath("td[3]/text()")[0].encode("utf-8")).strip()
                    period = str(loan.xpath("td[5]/text()")[0].encode("utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.schedule = str(loan.xpath("td[6]/div[@class='bar_bg']/div/span/span/text()")[0].encode("utf-8"))\
                        .strip().replace("%", "")

                    loan_obj.db_create(db)

            logger.info("company %s crawler loan: new size %s, update size %s", company_id, len(new_ids_set), len(update_ids_set))

        # db ids - newly crawled ids = ids to take offline; merge with the closed loans queued above
        off_ids_set |= db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id, len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #51
0
def crawl():
    company_id = 10
    url = "https://www.xinhehui.com/Financing/Invest/ajaxplist"
    request_headers = {'Referee': REFEREE, 'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        htm = download_page(url, request_headers)
        htm_obj = parse_html(htm)
        loans = htm_obj.xpath(
            "//table[@class='ui-record-table percentTable mt10']/tbody/tr")
        if len(loans) > 0:
            for loan in loans:
                if loan.xpath("td[last()]/a/@href")[0].encode(
                        "utf-8") == "javascript:;":
                    # skip loans that have already ended
                    continue
                href = str(loan.xpath("td[1]/p[1]/a/@href")[0].encode("utf-8"))
                original_id = href.split("id%3D")[1].encode("utf-8").strip()
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(
                            loan.xpath("td[7]/div/a/text()")[0].encode(
                                "UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.xinhehui.com" + href
                    title_1 = str(
                        loan.xpath("td[1]/p[1]/a/text()")[0].encode(
                            "utf-8")).strip()
                    if loan.xpath("td[1]/p[1]/a/em"):
                        title_2 = str(
                            loan.xpath("td[1]/p[1]/a/em/text()")[0].encode(
                                "utf-8")).strip()
                    else:
                        title_2 = str(
                            loan.xpath("td[1]/p[1]/a/span/text()")[0].encode(
                                "utf-8")).strip()
                    loan_obj.title = title_1 + title_2
                    borrow_amount = str(
                        loan.xpath("td[2]/span/text()")[0].encode(
                            "utf-8")).strip().replace(" ", "")
                    if borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = float(
                            borrow_amount.replace("万", "")) * 10000
                    else:
                        loan_obj.borrow_amount = float(
                            borrow_amount.replace("元", "").replace(",", ""))

                    if loan.xpath("td[4]/span"):
                        period = str(
                            loan.xpath("td[4]/span/@title")[0].encode(
                                "UTF-8")).strip()
                    else:
                        period = str(
                            loan.xpath("td[4]/text()")[0].encode(
                                "UTF-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH

                    loan_obj.rate = str(
                        loan.xpath("td[3]/p/text()")[0]).strip().replace(
                            "%", "")
                    loan_obj.repayment = str(
                        loan.xpath("td[5]/text()")[0].encode("UTF-8")).strip()
                    if loan.xpath("td[7]/div/a"):
                        loan_obj.schedule = str(
                            loan.xpath("td[7]/div/a/text()")[0].encode(
                                "UTF-8")).strip().replace("%", "")
                    else:
                        loan_obj.schedule = "0"

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db minus the newly crawled ids = ids that should go offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())
Beispiel #52
0
def crawl():
    company_id = 14
    url = "http://www.licaifan.com"
    request_headers = {'User-Agent': DEFAULT_UA}

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for id in db_ids:
        db_ids_set.add(id[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loan_htm_parse = parse_html(loan_htm, encoding="UTF-8")
        loans = loan_htm_parse.xpath(
            "//ul[@class='main-list tab-con2']/li[1]/table/tr")
        if len(loans) > 0:
            # note: the first row is the table header, so start from index 1
            for i in range(1, len(loans)):
                if str(loans[i].xpath("td[last()]/a/text()")[0].encode(
                        "utf-8")) == "投资满额":
                    continue
                href = str(loans[i].xpath("td[1]/h3/a/@href")[0])
                original_id = href.split("/")[3]
                if original_id:
                    online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip()\
                        .replace("%", "")
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "http://www.licaifan.com" + href
                    loan_obj.title = str(loans[i].xpath("td[1]/h3/a/text()")
                                         [0].encode("utf-8")).strip()
                    loan_obj.borrow_amount = str(loans[i].xpath("td[3]/text()")[0].encode("utf-8"))\
                        .strip().replace(",", "")
                    if loan_obj.borrow_amount.find("万") > 0:
                        loan_obj.borrow_amount = int(
                            loan_obj.borrow_amount.replace("万", "")) * 10000
                    loan_obj.rate = str(loans[i].xpath("td[2]/text()")
                                        [0].encode("utf-8")).strip().replace(
                                            "%", "")
                    period = str(loans[i].xpath("td[4]/text()")[0].encode(
                        "utf-8")).strip()
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.schedule = str(loans[i].xpath("td[5]/span/span[2]/text()")[0].encode("utf-8")).strip()\
                        .replace("%", "")

                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in the db minus the newly crawled ids = ids that should go offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except:
        logger.error("url: %s xpath failed:%s", url, traceback.format_exc())