Python TaobaoShop Beispiele

Programmiersprache: Python

Namespace / Paketname: tb_items

Klasse / Typ: TaobaoShop

Beispiele auf hotexamples.com: 2

Python TaobaoShop - 2 Beispiele gefunden. Dies sind die am besten bewerteten Python Beispiele für die tb_items.TaobaoShop, die aus Open Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

output(1)

Beispiel #1

Datei anzeigen

Datei: tb_spider.py Projekt: spartacus429496/mj_tb_spy1

    def shop_basic_info_parse(self):
        """Parse the shop landing page currently loaded in ``self.driver``.

        Creates ``self.shop`` and fills in its classification, name and
        detail URL from the page.

        Returns:
            A ``Request`` for the shop's detail page (regular Taobao shop),
            a ``Request`` for the next shop id when the page is invalid or
            cannot be parsed, or ``None`` for a Tmall shop.
        """
        resp_url = self.driver.current_url

        # The current URL is invalid (missing shop / error page / redirect):
        # fetch the next task instead.
        invalid_markers = ("noshop.htm", "error1.html", "guang.taobao")
        if any(marker in resp_url for marker in invalid_markers):
            print(u"无此店铺")
            return Request(self.next_task_url(), self.shop_basic_info_parse)

        titles = self.driver.find_elements(By.XPATH, '//title')
        if not titles:
            print("页面错误")
            return Request(self.next_task_url(), self.shop_basic_info_parse)

        print(u"解析店铺主页面 %s" % resp_url)

        title = titles[0].get_attribute("innerHTML")

        self.shop = TaobaoShop(shop_number=self.current_shopid, shop_url=resp_url)

        if not title:
            print("解析主页面错误")
            return Request(self.next_task_url(), self.shop_basic_info_parse)

        if title.find(u"Tmall.com") >= 0:
            # Tmall shop.
            self.shop.shop_classify = 1

            # find_elements returns an empty list instead of raising, so a
            # missing element reaches the fallback below rather than
            # aborting the crawl with NoSuchElementException.
            name_els = self.driver.find_elements(
                By.XPATH, "//a[@class='slogo-shopname']/*")
            link_els = self.driver.find_elements(
                By.XPATH, "//*[@id='dsr-ratelink']")
            name = name_els[0].get_attribute("innerHTML") if name_els else None
            link = link_els[0].get_attribute("value") if link_els else None

            if (not name) or (not link):
                print("解析天猫店铺基本信息错误")
                return Request(self.next_task_url(), self.shop_basic_info_parse)

            # get_attribute returns a string, so use it whole; indexing [0]
            # would keep only its first character.
            self.shop.shop_name = name
            # presumably the link is protocol-relative ("//..."), hence the
            # "https:" prefix -- TODO confirm against a live page
            self.shop.shop_detail_url = "https:" + link
            # NOTE(review): returning None makes the `while request:` loop in
            # start_crawl_task stop; Tmall detail parsing is not wired up.
            return

        # Regular Taobao shop.
        rank_links = self.driver.find_elements(
            By.XPATH, '//span[@class="shop-rank"]//a')
        if not rank_links:
            return Request(self.next_task_url(), self.shop_basic_info_parse)

        detail_url = rank_links[0].get_attribute("href")

        self.shop.shop_classify = 0
        self.shop.shop_detail_url = detail_url

        name_els = self.driver.find_elements(By.XPATH, "//a[@class='shop-name']/*")
        if not name_els:
            return Request(self.next_task_url(), self.shop_basic_info_parse)

        self.shop.shop_name = name_els[0].get_attribute("innerHTML")
        return Request(detail_url, self.parse_tbshop_detail)

Beispiel #2

Datei anzeigen

Datei: tb_spider.py Projekt: spartacus429496/mj_tb_spy1

class TbSpider():
    """Selenium-driven crawler that logs in to Taobao and walks shop pages.

    Shops are visited by consecutive shop id. Every parsing callback returns
    the next ``Request`` to load, or ``None`` to stop the crawl loop in
    ``start_crawl_task``.
    """
    login_URL = 'https://login.taobao.com/member/login.jhtml'
    # Shop id counter; next_task_url() increments it before building a URL.
    current_shopid = 36326929
    # Which webdriver to start: 1 = Firefox, 2 = PhantomJS.
    drive_choose = 2
    last_url = ''

    def __init__(self):
        """Start the webdriver selected by ``drive_choose``.

        Raises:
            ValueError: if ``drive_choose`` is neither 1 nor 2.
        """
        if self.drive_choose == 1:
            self.driver = webdriver.Firefox()
        elif self.drive_choose == 2:
            self.driver = webdriver.PhantomJS()
        else:
            # Fail here with a clear message instead of an AttributeError on
            # self.driver a few lines further down.
            raise ValueError("unsupported drive_choose: %r" % (self.drive_choose,))

        self.driver.set_window_size(1024, 768)
        self.htmlparser = HTMLParser.HTMLParser()

    def run(self):
        """Log in and, on success, start crawling from the next shop id."""
        success = self.login()
        if success:
            self.start_crawl_task(Request(self.next_task_url(), self.shop_basic_info_parse))

    def get_checkcode(self):
        """Ask the user for the captcha text if the login page shows one.

        Returns:
            The text typed by the user, or ``''`` when no captcha URL could
            be read from the page.
        """
        checkcode = ''
        # Pre-bind both names so a failed lookup cannot cause a NameError in
        # the check below.
        checkcode_url = None
        checkcode_url_1 = None
        try:
            image = self.driver.find_element_by_id("J_StandardCode_m")
            checkcode_url = image.get_attribute('data-src')
            checkcode_url_1 = image.get_attribute('src')
        except Exception:
            print("验证码链接提取错误")

        if checkcode_url:
            print("src %s" % checkcode_url_1)
            print(checkcode_url)
            checkcode = raw_input("请输入验证码:")

        return checkcode

    def submit_login_form(self, username, password):
        """Fill in and submit the login form, then wait for a redirect.

        Args:
            username: account name typed into the form.
            password: password typed into the form.

        Returns:
            True if the browser left the login URL within about ten seconds
            after submitting, False otherwise.
        """
        username_el = self.driver.find_element_by_id("TPL_username_1")
        password_el = self.driver.find_element_by_id("TPL_password_1")

        username_el.clear()
        username_el.send_keys(username)

        password_el.clear()
        password_el.send_keys(password)

        time.sleep(1)

        # Saved so the user can read the captcha from the screenshot.
        self.driver.save_screenshot('checkcode.png')
        checkcode = self.get_checkcode()

        if checkcode:
            self.driver.find_element_by_id("J_CodeInput_i").send_keys(checkcode)

        self.driver.find_element_by_id("J_SubmitStatic").submit()

        # Poll once per second, printing a countdown, until redirected.
        for remaining in range(10, 0, -1):
            time.sleep(1)
            print(remaining)
            if self.login_URL not in self.driver.current_url:
                return True

        return False

    def login(self, username='******', password='******'):
        """Log in interactively, retrying for as long as the user agrees.

        Args:
            username: account name to log in with.
            password: password to log in with.

        Returns:
            True when the browser was redirected away from the login page,
            False when the user gave up.
        """
        print('开始登录....')

        self.driver.get(self.login_URL)

        success = self.submit_login_form(username, password)

        while not success:
            try:
                err_msg = self.driver.find_element_by_xpath(
                    "//div[@id='J_Message']//p[@class='error']").text
                if err_msg:
                    print(u"页面错误信息:")
                    print(err_msg)
            except Exception:
                # No error element on the page: the redirect may simply have
                # happened after the polling window ended.
                if self.login_URL not in self.driver.current_url:
                    success = True
                    break

            answer = raw_input("本次登录不成功，是否继续? N/y:")
            if answer not in ('Y', 'y'):
                break

            success = self.submit_login_form(username, password)

        if not success:
            print('登录失败')
            return False

        print("登录成功")
        print(u"重定向到: %s" % self.driver.current_url)
        return True

    def start_crawl_task(self, request):
        """Run the crawl loop: load each request's URL, then call its callback.

        The loop ends when a callback returns a falsy value.

        Args:
            request: first request to process; must expose ``url`` and
                ``callback``.
        """
        while request:
            try:
                self.driver.get(request.url)
            except Exception:
                # The callback still runs on whatever the driver managed to
                # load before the failure.
                print("获取页面超时")
                time.sleep(2)
            request = request.callback()

    def next_task_url(self):
        """Advance to the next shop id and return that shop's URL."""
        self.current_shopid += 1
        url = "https://shop%d.taobao.com" % self.current_shopid
        print("下一个店铺的地址 %s" % url)
        return url

    def current_task_url(self):
        """Return the URL of the current shop id without advancing it."""
        url = "https://shop%d.taobao.com" % self.current_shopid
        print("当前店铺的地址 %s" % url)
        return url

    def _first_attribute(self, xpath, name):
        """Return attribute ``name`` of the first element matching ``xpath``.

        Returns None when nothing matches, so callers can treat a missing
        element like an empty value instead of handling an exception.
        """
        matches = self.driver.find_elements(By.XPATH, xpath)
        if not matches:
            return None
        return matches[0].get_attribute(name)

    def shop_basic_info_parse(self):
        """Parse the shop landing page currently loaded in ``self.driver``.

        Creates ``self.shop`` and fills in its classification, name and
        detail URL from the page.

        Returns:
            A ``Request`` for the shop's detail page (regular Taobao shop),
            a ``Request`` for the next shop id when the page is invalid or
            cannot be parsed, or ``None`` for a Tmall shop.
        """
        resp_url = self.driver.current_url

        # The current URL is invalid (missing shop / error page / redirect):
        # fetch the next task instead.
        invalid_markers = ("noshop.htm", "error1.html", "guang.taobao")
        if any(marker in resp_url for marker in invalid_markers):
            print(u"无此店铺")
            return Request(self.next_task_url(), self.shop_basic_info_parse)

        titles = self.driver.find_elements(By.XPATH, '//title')
        if not titles:
            print("页面错误")
            return Request(self.next_task_url(), self.shop_basic_info_parse)

        print(u"解析店铺主页面 %s" % resp_url)

        title = titles[0].get_attribute("innerHTML")

        self.shop = TaobaoShop(shop_number=self.current_shopid, shop_url=resp_url)

        if not title:
            print("解析主页面错误")
            return Request(self.next_task_url(), self.shop_basic_info_parse)

        if title.find(u"Tmall.com") >= 0:
            # Tmall shop.
            self.shop.shop_classify = 1
            name = self._first_attribute("//a[@class='slogo-shopname']/*", "innerHTML")
            link = self._first_attribute("//*[@id='dsr-ratelink']", "value")

            if (not name) or (not link):
                print("解析天猫店铺基本信息错误")
                return Request(self.next_task_url(), self.shop_basic_info_parse)

            # get_attribute returns a string, so use it whole; indexing [0]
            # would keep only its first character.
            self.shop.shop_name = name
            # presumably the link is protocol-relative ("//..."), hence the
            # "https:" prefix -- TODO confirm against a live page
            self.shop.shop_detail_url = "https:" + link
            # NOTE(review): returning None stops the crawl loop in
            # start_crawl_task; Tmall detail parsing is not wired up.
            return

        # Regular Taobao shop.
        rank_links = self.driver.find_elements(
            By.XPATH, '//span[@class="shop-rank"]//a')
        if not rank_links:
            return Request(self.next_task_url(), self.shop_basic_info_parse)

        detail_url = rank_links[0].get_attribute("href")

        self.shop.shop_classify = 0
        self.shop.shop_detail_url = detail_url

        name_els = self.driver.find_elements(By.XPATH, "//a[@class='shop-name']/*")
        if not name_els:
            return Request(self.next_task_url(), self.shop_basic_info_parse)

        self.shop.shop_name = name_els[0].get_attribute("innerHTML")
        return Request(detail_url, self.parse_tbshop_detail)

    def parse_tbshop_detail(self):
        """Parse the shop detail page and fill in the rest of ``self.shop``.

        Returns:
            A ``Request`` for the owner's item list when no location was
            found on the page, a ``Request`` for the next shop id otherwise,
            or ``None`` when the anti-spider page was hit.
        """
        print(u"解析店铺详细信息 %s" % self.driver.current_url)

        if "anti_Spider" in self.driver.current_url:
            # NOTE(review): returning None stops the crawl loop, whereas
            # parse_location_by_shipping_addr moves on to the next shop.
            print("Oohs!")
            return

        # Good-rating percentage.
        rate = self._first_attribute("//*[@class='tb-rate-ico-bg ico-seller']/em", "innerHTML")
        if rate:
            parts = rate.split(u"：")
            if len(parts) == 2:
                try:
                    # Drop the trailing character (presumably "%") before
                    # converting.
                    value = float(parts[1][:-1])
                except ValueError:
                    print("malform string: %s" % parts)
                else:
                    # NOTE(review): the seller credit parsed further down is
                    # written to this same attribute and overwrites the rate.
                    self.shop.shop_credit = value
        else:
            print("解析好评率错误")

        # Shop creation date.
        startdate = self._first_attribute("//input[@id='J_showShopStartDate']", "value")
        if startdate:
            self.shop.shop_create_time = startdate
        else:
            print("解析建店时间错误")

        # Shop owner name.
        owner = self._first_attribute("//div[@class='title']/a", "innerHTML")
        if owner:
            self.shop.shop_owner = owner
        else:
            print("解析店主名称错误")

        # Labels of the "main business" and "location" list entries.
        trade_key = u"当前主营"
        location_key = u"所在地区"

        # Read the page source once; each access asks the driver again.
        page = self.driver.page_source

        for item in re.findall("<li>(.*)</li>", page):
            if trade_key in item:
                links = re.findall("<a href=\".*\">(.*)</a>", item)
                if len(links) == 1:
                    # Decode HTML entities.
                    self.shop.shop_trade_range = self.htmlparser.unescape(links[0]).strip()
                else:
                    print("解析当前主营错误")

            elif location_key in item:
                fields = item.split(u"：")
                if len(fields) == 2:
                    self.shop.shop_location = self.htmlparser.unescape(fields[1]).strip()
                else:
                    print("解析所在地区错误")
                # Scanning stops at the location entry.
                break

        # Seller credit: the text that follows the label, up to the first
        # CR, LF, backslash, "<" or space.
        credit_key = u"卖家信用："
        start = page.find(credit_key)
        if start >= 0:
            start += len(credit_key)
            end = start
            terminators = u"\r\n\\< "
            # Bounded by the page length so a missing terminator cannot run
            # off the end of the string.
            while end < len(page) and page[end] not in terminators:
                end += 1
            try:
                self.shop.shop_credit = int(page[start:end].strip())
            except ValueError:
                print(u"解析信用错误")

        if self.shop.shop_location == "":
            # No address was parsed above (the owner may not have filled one
            # in): take the shop address from the shipping location of one
            # of the shop's items.
            url = "https://list.taobao.com/itemlist/default.htm?nick=%s&_input_charset=utf-8&json=on&callback=jsonp161" \
                  % self.shop.shop_owner.encode("utf8")
            return Request(url, self.parse_location_by_shipping_addr)

        self.shop.output()
        return Request(self.next_task_url(), self.shop_basic_info_parse)

    def parse_location_by_shipping_addr(self):
        """Derive the shop location from the shipping address of its items.

        Reads the JSONP item list loaded in the driver, stores the first
        item's ``loc`` value as the shop location, and outputs the shop.

        Returns:
            A ``Request`` for the next shop id.
        """
        print("parse_location_by_shipping_addr")

        if "anti_Spider" in self.driver.current_url:
            print("Oohs ！！！遭遇反爬虫")
            return Request(self.next_task_url(), self.shop_basic_info_parse)

        json_set = self.decode_tb_json(self.driver.page_source)
        if "itemList" in json_set:
            itemlist = json_set['itemList']
            print("===========================")
            if itemlist:
                # .get keeps an item without a "loc" key from aborting the
                # crawl.
                location = itemlist[0].get('loc')
                if location:
                    self.shop.shop_location = location.strip()

        self.shop.output()
        return Request(self.next_task_url(), self.shop_basic_info_parse)

    def decode_tb_json(self, data):
        """Extract and decode the JSON payload of a JSONP response.

        The payload is the text between the first "(" and the last ")".

        Args:
            data: response text such as ``callback({...})``, possibly with
                surrounding markup.

        Returns:
            The decoded JSON value, or an empty dict when no parenthesised
            payload is found or it is not valid JSON.
        """
        start = data.find("(")
        end = data.rfind(")")
        if start < 0 or end < start:
            print("无法识别json文件")
            return {}

        try:
            return json.loads(data[start + 1:end])
        except ValueError:
            print("json 字符集错误")
            return {}