Example 1
    def crawl_volume(self):
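        # Parse the item's sales volume. Tmall items: fetch the init API and read
        # defaultModel.sellCountDO.sellCount, falling back to a raw "sellCount"
        # match in the response. Taobao items: fetch the item-info API and read
        # the "quanity" / "confirmGoods" counters (key spellings as served).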
        if self.is_tmall:
            apiItemInfoUrl = get_val(self.data, "initApi").replace(r'''\/''', "/")
            self.tmallInitApi = self.crawl_page(apiItemInfoUrl)
            try:
                self.tmallInitApijson = loads(self.tmallInitApi.decode('gb18030').encode('utf8'))
            except Exception:
                logger.info("parse tmall api json failed %s : %s", self.item_id, traceback.format_exc())
            if self.tmallInitApijson:
                try:
                    self.volume = self.tmallInitApijson['defaultModel']['sellCountDO']['sellCount']
                except Exception:
                    logger.warning("try to get volume from api failed %s", self.item_id)
            if self.volume < 0:
                try:
                    self.volume = int(get_val(self.tmallInitApi, "sellCount"))
                except Exception:
                    self.volume = 0
                    logger.warning("Cannot parse tmall item volume %s", self.item_id)

        else:
            apiItemInfoVal = get_val(self.data, "apiItemInfo")
            if apiItemInfoVal:
                apiItemInfoUrl = apiItemInfoVal.replace(r'''\/''', "/")
                itemInfoData = self.crawl_page(apiItemInfoUrl)
                try:
                    self.volume = int(get_num_val(itemInfoData, 'quanity'))
                    self.confirmVolume = int(get_num_val(itemInfoData, 'confirmGoods'))
                except Exception:
                    self.volume = 0
                    logger.warning("Cannot parse taobao item volume %s", self.item_id)
            else:
                self.volume = 0
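get_val and get_num_val are shared helpers defined elsewhere in these crawlers; they pull a named value out of raw page or API text. A minimal sketch of plausible implementations, assuming simple regex extraction (hypothetical, not the project's actual code):

import re

# Hypothetical helpers; the real project defines these elsewhere.
def get_val(data, key):
    # Extract a quoted value such as "key":"value" or key: "value" from page source.
    m = re.search(r'%s"?\s*:\s*"([^"]*)"' % re.escape(key), data)
    return m.group(1) if m else ""

def get_num_val(data, key):
    # Extract a bare numeric value such as "key":123 from page source.
    m = re.search(r'%s"?\s*:\s*(\d+)' % re.escape(key), data)
    return m.group(1) if m else None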
Example 2
def crawler(sql):
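    # For each shop row returned by the SQL (shop_id, shop_type, item_id, url),
    # fetch the item page and extract the current shop-wide discount:
    # shop_type 1 (taobao) scrapes the promote div via XPath, shop_type 2 (tmall)
    # reads the initApi JSON; results are upserted into shop_discount.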
    db = get_db_engine()
    items = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for item in items:
        shop_id = item[0]
        shop_type = item[1]
        item_id = item[2]
        url = item[3]

        try:
            htm = get_item_htm(item_id, url, db)
            if shop_type == 1:
                htm_obj = parse_html(htm, encoding='gb18030')
                discount_url = htm_obj.xpath("//div[@id='promote']/@data-default")
                if discount_url:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(discount_url[0], item_headers)
                    if disc_content.strip():
                        disc_obj = parse_html(disc_content, encoding='gb18030')
                        content = disc_obj.xpath("//div[@id='J_MjsData']/h3/text()")[0].strip()
                        dates = disc_obj.xpath("//div[@id='J_MjsData']/h3/span[@class='tb-indate']/text()")[0].strip()
                        st = dates.encode('utf-8').replace("--","—").split("—")
                        start_time = datetime.datetime.strptime(st[0].strip().replace('年','-').replace("月","-").replace("日",""),'%Y-%m-%d')
                        end_time = datetime.datetime.strptime(st[1].strip().replace('年','-').replace("月","-").replace("日",""),'%Y-%m-%d')

                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, discount_url[0])
                        logger.info("taobao shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s not discount.", shop_id, url)
                else:
                    logger.warning("taobao shop %s:%s not discount.", shop_id, url)
            elif shop_type == 2:
                d_url = get_val(htm, "initApi")
                if d_url:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(d_url, item_headers)
                    cjson = loads(disc_content.decode('gb18030').encode('utf8'))
                    shop_prom = cjson['defaultModel']['itemPriceResultDO']['tmallShopProm']
                    if shop_prom:
                        st = int(shop_prom['startTime'])/1000
                        et = int(shop_prom['endTime'])/1000
                        start_time = time.strftime("%Y-%m-%d", time.localtime(st))
                        end_time = time.strftime("%Y-%m-%d", time.localtime(et))
                        content = shop_prom['promPlan'][0]['msg']
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                            shop_id, content.encode('utf-8'), start_time, end_time, d_url)
                        logger.info("tmall shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s not discount.", shop_id, url)
        except Exception:
            logger.error("shop %s:%s xpath failed:%s", shop_id, url, traceback.format_exc())
Example 3
    def crawl_tmall_rate_page(self, url, page):
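        # Substitute the requested page number into the rate-list URL, fetch it,
        # re-add the outer braces the response omits, and parse it as JSON.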
        if url:
            url = CURPAGE_RE.subn(r"\g<1>%s\g<3>" % page, url)[0]
            rate1 = self.crawl_page(url)
            if rate1:
                rate1 = "{" + rate1 + "}"
                return loads(rate1.decode('gb18030').encode('utf8'))
        return None
Example 4
    def crawl_taobao_rate_page(self, rateListUrlBase, page):
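        # Build the paged rate-list URL with the jsonp_reviews_list callback,
        # fetch it, and parse the JSON body extracted from the jsonp wrapper.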
        if rateListUrlBase:
            rateListUrl = rateListUrlBase + '&currentPageNum=%s&rateType=&orderType=feedbackdate&showContent=1&attribute=&callback=jsonp_reviews_list' % page
            rate1 = self.crawl_page(rateListUrl)
            m = JSON_RE.match(rate1)
            if m:
                return loads(m.group(1).decode('gb18030').encode('utf8'))
        return None
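CURPAGE_RE and JSON_RE are module-level patterns defined elsewhere in the project. A minimal sketch of plausible definitions, reconstructed from how they are used above (hypothetical; the parameter name and wrapper format are assumptions):

import re

# Hypothetical: three groups so that subn(r"\g<1>%s\g<3>" % page, url) swaps the page number.
CURPAGE_RE = re.compile(r"(currentPage=)(\d+)(&|$)")

# Hypothetical: group(1) captures the JSON body inside the jsonp_reviews_list(...) wrapper.
JSON_RE = re.compile(r"\s*jsonp_reviews_list\((.*)\)\s*$", re.DOTALL)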
Example 5
def crawl():
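    # Crawl the CMB China UCS (efinance.cmbchinaucs.com) loan list: load the known
    # loan ids for this company from the db, POST to the GetProjectList_Index
    # handler, and parse the JSON response (this version only prints it).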
    company_id = 19
    url = "https://efinance.cmbchinaucs.com/Handler/ActionPage.aspx?targetAction=GetProjectList_Index"
    headers = {
        'Host': "efinance.cmbchinaucs.com",
        'Connection': "keep-alive",
        'Content-Length': "33",
        'Cache-Control': "max-age=0",
        'Accept': "text/plain, */*",
        'Origin': "https://efinance.cmbchinaucs.com",
        'X-Requested-With': "XMLHttpRequest",
        'User-Agent':
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36",
        'Content-Type': "application/x-www-form-urlencoded",
        'Referer': "https://efinance.cmbchinaucs.com/",
        'Accept-Encoding': "gzip,deflate",
        'Accept-Language': "zh-CN,zh;q=0.8,en;q=0.6",
        'Cookie': "ASP.NET_SessionId=woqbxpemqp3kk4syvfbkxtzw"
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = post(url,
                        data={"targetAction": "GetProjectList_Index"},
                        headers=headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        print loans_json

    except Exception:
        logger.error("url: %s crawl failed:%s", url, traceback.format_exc())
Example 6
def crawl():
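    # Crawl the Lufax fuying product listing: diff the online product ids against
    # the db, update schedule/cast for known loans, create new loans with full
    # details, and take loans that left the listing offline.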
    company_id = 9
    url = "https://list.lufax.com/list/service/product/fuying-product-list/listing/1"
    request_headers = {
        'Referer': "https://list.lufax.com/list/listing/fuying",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb

        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")
        loan_num = loans_json["totalCount"]
        if loan_num:
            for i in range(0, loan_num):
                original_id = str(loans_json["data"][i]["productId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(
                        float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(
                        int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://list.lufax.com/list/productDetail?productId=%s" % original_id
                    loan_obj.title = loans_json["data"][i][
                        "productNameDisplay"]
                    loan_obj.rate = str(
                        float(loans_json["data"][i]["interestRate"]) * 100)
                    period = str(loans_json["data"][i]
                                 ["investPeriodDisplay"].encode("utf-8"))
                    if period.find(loan_obj.PERIOD_UNIT_DAY) > 0:
                        loan_obj.period = period.replace(
                            loan_obj.PERIOD_UNIT_DAY, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_DAY
                    else:
                        loan_obj.period = period.replace("个", "").replace(
                            loan_obj.PERIOD_UNIT_MONTH, "")
                        loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.repayment = loans_json["data"][i][
                        "collectionModeDisplay"]
                    loan_obj.borrow_amount = str(
                        int(loans_json["data"][i]["price"]))
                    loan_obj.schedule = str(
                        float(loans_json["data"][i]["progress"]) * 100)
                    loan_obj.cast = str(
                        int(loans_json["data"][i]["raisedAmount"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids seen online = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        logger.error("url: %s crawl failed:%s", url, traceback.format_exc())
Example 7
def crawl():
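    # Crawl the qian360 product listing: skip products whose status != 1, update
    # schedule/cast for known loans, create new ones, and offline delisted loans.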
    company_id = 19
    url = "https://www.qian360.com/bq/queryProductList.html?currentPage=1&pernum=12&type=0"
    request_headers = {
        'Referer': "https://www.qian360.com/tl/select.html",
        'User-Agent': DEFAULT_UA
    }

    db = get_db_engine()
    db_ids = list(
        db.execute(
            "select original_id from loan where company_id=%s and status=0",
            company_id))
    # db all
    db_ids_set = set()
    # all ids currently online
    online_ids_set = set()
    # new
    new_ids_set = set()
    # update
    update_ids_set = set()

    for row in db_ids:
        db_ids_set.add(row[0].encode("utf-8"))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    try:
        loan_htm = download_page(url, request_headers)
        loans_json = loads(loan_htm, encoding="UTF-8")

        if loans_json["list"]:
            for i in range(0, len(loans_json["list"])):
                if int(loans_json["list"][i]["status"]) != 1:
                    continue
                original_id = str(loans_json["list"][i]["borrowId"])
                online_ids_set.add(original_id)

                if original_id in db_ids_set:
                    update_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(
                        int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_update(db)
                else:
                    new_ids_set.add(original_id)

                    loan_obj = Loan(company_id, original_id)
                    loan_obj.href = "https://www.qian360.com/ti/detail.html?borrowId=%s" % original_id
                    loan_obj.title = loans_json["list"][i]["name"]
                    loan_obj.rate = str(loans_json["list"][i]["apr"])
                    loan_obj.period = str(loans_json["list"][i]["totalPeriod"])
                    loan_obj.period_unit = loan_obj.PERIOD_UNIT_MONTH
                    loan_obj.borrow_amount = str(
                        int(loans_json["list"][i]["account"]))
                    loan_obj.schedule = str(loans_json["list"][i]["percent"])
                    loan_obj.cast = str(
                        int(loans_json["list"][i]["accountYes"]))
                    loan_obj.db_create(db)

        logger.info("company %s crawler loan: new size %s, update size %s",
                    company_id, len(new_ids_set), len(update_ids_set))

        # ids in db minus ids seen online = loans to take offline
        off_ids_set = db_ids_set - online_ids_set
        if off_ids_set:
            loan_obj = Loan(company_id)
            loan_obj.db_offline(db, off_ids_set)
            logger.info("company %s crawler loan: offline %s", company_id,
                        len(off_ids_set))

    except Exception:
        logger.error("url: %s crawl failed:%s", url, traceback.format_exc())
Example 8
    def crawl(self):
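        # Fetch and parse a Taobao/Tmall item page: follow the Tmall redirect for
        # mis-hosted items, detect offline/resale items, extract price and thumbnail
        # nodes via XPath, then read the category id and sales volume from the
        # matching item-info API.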
        try:
            self.data = self.crawl_page(self.url)
            if FLAGS.debug_parser:
                import pdb
                pdb.set_trace()

            # check tmall
            if not self.is_tmall and len(self.data) < 256 and self.url.find(
                    'item.taobao.com'
            ) > 0 and self.data.find(
                    "window.location.href='http://detail.tmall.com/item.htm'+window.location.search"
            ) > 0:
                self.data = self.crawl_page(
                    self.url.replace('item.taobao.com', 'detail.tmall.com'))

            if self.check_offline():
                self.is_offline = True

            self.html_obj = parse_html(self.data, encoding="gb18030")

            title = self.html_obj.xpath("//html/head/title/text()")
            if title and title[0].find(u"转卖") > 0:
                self.is_offline = True

            self.detailDiv = self.html_obj.xpath("//div[@id='detail']")
            self.buyButton = self.html_obj.xpath("//a[@id='J_LinkBuy']")
            self.originPrice = self.html_obj.xpath(
                "//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()")
            if not self.originPrice:
                self.originPrice = self.html_obj.xpath(
                    "//strong[@class='J_originalPrice']/text()")
            #self.bidPrice = self.html_obj.xpath("//li[contains(concat(' ',normalize-space(@class),' '),' detail-price ')]/strong/text()")
            self.bidPrice = self.html_obj.xpath(
                "//input[@name='current_price']/@value")
            self.thumbImages = self.html_obj.xpath(
                "//ul[@id='J_UlThumb']//img/@src")
            if not len(self.thumbImages):
                try:
                    # try load thumb images for tmall page
                    self.thumbImages = [
                        IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in
                        self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")
                    ]

                    # taobao @src to @data-src
                    if not len(self.thumbImages):
                        self.thumbImages = self.html_obj.xpath(
                            "//ul[@id='J_UlThumb']//img/@data-src")
                except Exception:
                    logger.warning("No thumbs found %s", self.item_id)

            tblogo = self.html_obj.xpath("//*[@id='shop-logo']")
            tmalllogo = self.html_obj.xpath("//*[@id='mallLogo']")
            if not self.is_tmall and tmalllogo:
                self.is_tmall = True

            if self.is_tmall:
                self.cid = get_val(self.data, "categoryId").split('&')[0]

                apiItemInfoUrl = get_val(self.data,
                                         "initApi").replace(r'''\/''', "/")
                self.tmallInitApi = self.crawl_page(apiItemInfoUrl)
                try:
                    self.tmallInitApijson = loads(
                        self.tmallInitApi.decode('gb18030').encode('utf8'))
                except Exception:
                    logger.info("parse tmall api json failed %s : %s",
                                self.item_id, traceback.format_exc())
                if self.tmallInitApijson:
                    try:
                        self.volume = self.tmallInitApijson['defaultModel'][
                            'sellCountDO']['sellCount']
                    except Exception:
                        logger.warning("try to get volume from api failed %s", self.item_id)
                if self.volume < 0:
                    try:
                        self.volume = int(
                            get_val(self.tmallInitApi, "sellCount"))
                    except Exception:
                        logger.warning("Cannot parse item volume %s", self.item_id)

                # stock: icTotalQuantity
                """
                reviewInfoUrl = get_val(self.data, "apiMallReviews").replace(r'''\/''', "/")
                reviewInfoData = self.crawl_page(reviewInfoUrl)
                m = RATECOUNT_RE.match(reviewInfoData)
                if m:
                    self.reviewCount = m.group(1)
                else:
                    self.reviewCount = None
                """
            else:
                self.cid = get_val(self.data, "cid")

                apiItemInfoVal = get_val(self.data, "apiItemInfo")
                if apiItemInfoVal:
                    apiItemInfoUrl = apiItemInfoVal.replace(r'''\/''', "/")
                    itemInfoData = self.crawl_page(apiItemInfoUrl)
                    try:
                        self.volume = int(get_num_val(itemInfoData, 'quanity'))
                    except Exception:
                        self.volume = -1
                else:
                    self.volume = -1

                #interval = get_val(data2, 'interval')
                # stock: skudata = get_val(self.data, 'valItemInfo').replace(r'''\/''', "/")
                """
                reviewInfoUrl = get_val(self.data, "data-commonApi").replace(r'''\/''', "/")
                reviewInfoData = self.crawl_page(reviewInfoUrl)
                self.reviewCount = get_val(reviewInfoData, 'total')
                """
        except:
            logger.error("crawling %s unknown exception %s",
                         self.item_id,
                         traceback.format_exc(),
                         extra={'tags': [
                             'crawlItemException',
                         ]})
            raise