コード例 #1
0
 def parse_smzdm_list_page(self, response):
     """Parse a smzdm category list page.

     Yields a WebdriverRequest per unseen item URL (callback:
     parse_smzdm_item_page) and one per "next page" link (callback:
     this method again); 'category' is propagated through meta.
     """
     try:
         category = response.meta["category"]
         sel = WebdriverXPathSelector(response)
         item_url_sel_list = sel.select(
             "/html/body/section//div[@class='listTitle']/h3[@class='itemName']/a/@href"
         )
         for item_url_sel in item_url_sel_list:
             item_url = item_url_sel.extract()
             # De-duplicate against URLs already scheduled elsewhere.
             if item_url not in self.urls_seen:
                 yield WebdriverRequest(item_url,
                                        meta={'category': category},
                                        callback=self.parse_smzdm_item_page)
         # Follow pagination with the same category.
         next_page_xpath = "//li[@class='pagedown']/a/@href"
         for next_page_url_sel in sel.select(next_page_xpath):
             yield WebdriverRequest(next_page_url_sel.extract(),
                                    meta={'category': category},
                                    callback=self.parse_smzdm_list_page)
     except Exception:
         # Narrowed from a bare except. 'raise StopIteration' inside a
         # generator is a RuntimeError under PEP 479 (Python 3.7+);
         # a plain return ends iteration the same way.
         log.msg("Smzdm list page parse failed:\t[%s]" % (response.url),
                 level=log.ERROR,
                 spider=SmzdmSpider)
         return
コード例 #2
0
 def parse_smzdm_item_page(self, response):
     """Parse a smzdm item detail page.

     Yields a WebdriverRequest for the item's outbound shopping link
     (when present) followed by a populated items.SmzdmItem carrying
     title, price, description, images, vote and comment counters.
     """
     try:
         category = response.meta["category"]
         sel = WebdriverXPathSelector(response)

         def _count(xpath):
             # Best-effort counter extraction: a missing node or
             # non-numeric text defaults to 0, preserving the old
             # silent-fallback behavior of the four copy-pasted blocks.
             try:
                 return int(self.get_text_by_xpath(sel, xpath))
             except Exception:
                 return 0

         title_sel_list = sel.select('/html/body/section/div[1]/article/h1')
         attachment_sel_list = sel.select(
             '/html/body/section/div[1]/article/h1/span')
         if not len(title_sel_list):
             log.msg("Smzdm title parse failed:\t[%s]" % (response.url),
                     level=log.ERROR,
                     spider=SmzdmSpider)
             # Was 'raise StopIteration' (broken under PEP 479, and it
             # also triggered a second, redundant error log below).
             return
         title = self.normalize_text(title_sel_list[0].extract())
         item_name = title
         # The <span> attachments inside <h1> hold price/currency text;
         # strip them from the item name and parse their joined text.
         all_attachment = ''
         for attachment_sel in attachment_sel_list:
             attachment = attachment_sel.extract()
             item_name = item_name.replace(attachment, '')
             all_attachment += attachment
         price, currency = self.parse_price(all_attachment)
         item_shopping_url_sel_list = sel.select(
             "/html/body/section/div[1]/article/div[2]/div/div/a/@href")
         if len(item_shopping_url_sel_list):
             item_shopping_url = item_shopping_url_sel_list[0].extract()
             yield WebdriverRequest(item_shopping_url,
                                    meta={'referer': response.url},
                                    callback=self.parse_shopping_item_page)
         description_sel_list = sel.select(
             '/html/body/section/div[1]/article/div[2]/p[@itemprop="description"]')
         description = ''
         img_src_list = []
         for description_sel in description_sel_list:
             description += self.normalize_text(description_sel.extract())
             for img_src_sel in description_sel.select(".//img/@src"):
                 img_src_list.append(img_src_sel.extract())
         worthy_vote = _count("//span[@id='rating_worthy_num']/text()")
         unworthy_vote = _count("//span[@id='rating_unworthy_num']/text()")
         favorite_count = _count("//a[@class='fav']/em/text()")
         comment_count = _count("//a[@class='comment']/em/text()")
         yield items.SmzdmItem(title=item_name, price=price, url=response.url,
                               description=description,
                               image_urls=img_src_list,
                               worthy_vote=worthy_vote,
                               unworthy_vote=unworthy_vote,
                               favorite_count=favorite_count,
                               comment_count=comment_count,
                               category=category, currency=currency)
     except Exception:
         # Narrowed from a bare except; return ends the generator
         # cleanly (PEP 479 forbids raising StopIteration here).
         log.msg("Smzdm item page parse failed:\t[%s]" % (response.url),
                 level=log.ERROR,
                 spider=SmzdmSpider)
         return
コード例 #3
0
 def submit_login_info(self, response):
     """Fill the sign-in form with the stored credentials, submit it,
     then run the post-login check and pause briefly for the page.
     """
     page = WebdriverXPathSelector(response)
     email_field = page.select('//*[@id="ap_email"]')[0]
     email_field.send_keys(self.formdata["email"])
     password_field = page.select('//*[@id="ap_password"]')[0]
     password_field.send_keys(self.formdata["password"])
     submit_button = page.select('//*[@id="signInSubmit"]')[0]
     submit_button.click()
     self.check_login(response)
     time.sleep(1)
コード例 #4
0
 def order_item(self, response):
     """Place a one-click order on the current shopping page.

     Clicks the one-click button when present; otherwise clicks the
     element that enables one-click ordering.
     """
     sel = WebdriverXPathSelector(response)
     one_click_button_list = sel.select('//*[@id="one-click-button"]')
     if one_click_button_list:
         log.msg("One click order!", level=log.INFO, spider=SmzdmSpider)
         one_click_button_list[0].click()
         return
     log.msg("Need to enable one click order!", level=log.DEBUG, spider=SmzdmSpider)
     enable_one_click_url_sel = sel.select('//*[@id="oneClickSignIn"]/a/span')
     if enable_one_click_url_sel:
         log.msg("Enable one click order", level=log.DEBUG, spider=SmzdmSpider)
         # The live click below performs the action. The original also
         # constructed a WebdriverActionRequest here but never yielded
         # or returned it, so that dead (side-effect-free) construction
         # and the unused 'referer' local have been removed.
         enable_one_click_url_sel[0].click()
コード例 #5
0
 def submit_login_info(self, response):
     """Type the saved email and password into the sign-in form,
     submit, verify the login and wait a second for the page to settle.
     """
     selector = WebdriverXPathSelector(response)
     # Fill both credential fields from self.formdata.
     for field_xpath, form_key in ((('//*[@id="ap_email"]'), "email"),
                                   (('//*[@id="ap_password"]'), "password")):
         selector.select(field_xpath)[0].send_keys(self.formdata[form_key])
     selector.select('//*[@id="signInSubmit"]')[0].click()
     self.check_login(response)
     time.sleep(1)
コード例 #6
0
 def parse_smzdm_list_page(self, response):
     """Parse a smzdm category list page.

     Yields item-page requests for unseen item URLs and a request for
     the next list page; 'category' is carried through meta.
     """
     try:
         category = response.meta["category"]
         sel = WebdriverXPathSelector(response)
         item_url_sel_list = sel.select("/html/body/section//div[@class='listTitle']/h3[@class='itemName']/a/@href")
         for item_url_sel in item_url_sel_list:
             item_url = item_url_sel.extract()
             # Skip URLs that were already scheduled.
             if item_url not in self.urls_seen:
                 yield WebdriverRequest(item_url, meta={'category': category}, callback=self.parse_smzdm_item_page)
         next_page_xpath = "//li[@class='pagedown']/a/@href"
         for next_page_url_sel in sel.select(next_page_xpath):
             yield WebdriverRequest(next_page_url_sel.extract(), meta={'category': category}, callback=self.parse_smzdm_list_page)
     except Exception:
         # Narrowed from a bare except. 'raise StopIteration' in a
         # generator is a RuntimeError under PEP 479; return instead.
         log.msg("Smzdm list page parse failed:\t[%s]" % (response.url) , level=log.ERROR, spider=SmzdmSpider)
         return
コード例 #7
0
 def order_item(self, response):
     """Place a one-click order on the current shopping page.

     Clicks the one-click button when present; otherwise clicks the
     element that enables one-click ordering.
     """
     sel = WebdriverXPathSelector(response)
     one_click_button_list = sel.select('//*[@id="one-click-button"]')
     if one_click_button_list:
         log.msg("One click order!", level=log.INFO, spider=SmzdmSpider)
         one_click_button_list[0].click()
         return
     log.msg("Need to enable one click order!",
             level=log.DEBUG,
             spider=SmzdmSpider)
     enable_one_click_url_sel = sel.select(
         '//*[@id="oneClickSignIn"]/a/span')
     if enable_one_click_url_sel:
         log.msg("Enable one click order",
                 level=log.DEBUG,
                 spider=SmzdmSpider)
         # The live click below performs the action. The original also
         # constructed a WebdriverActionRequest here but never yielded
         # or returned it, so that dead (side-effect-free) construction
         # and the unused 'referer' local have been removed.
         enable_one_click_url_sel[0].click()
コード例 #8
0
    def parse_shopping_item_page(self, response):
        """Parse a merchant (shop) item page into an items.ShoppingItem.

        A smzdm JD jump page is followed transparently; otherwise the
        webdriver's real (post-redirect) URL is matched against the
        per-shop xpath table to locate title, price and title image.
        """
        try:
            sel = WebdriverXPathSelector(response)
            referer = response.meta["referer"]
            jd_jump_url_sel = sel.select(
                "/html/body/div[5]/div/div/div[1]/div[2]/div[3]/a/@href")
            if jd_jump_url_sel:
                # Intermediate JD redirect page: follow the real URL.
                log.msg("JD jump url:\t[%s]" % (jd_jump_url_sel[0].extract()),
                        level=log.DEBUG,
                        spider=SmzdmSpider)
                yield WebdriverRequest(jd_jump_url_sel[0].extract(),
                                       meta={'referer': referer},
                                       callback=self.parse_shopping_item_page)
            else:
                img_src_list = []
                description = ""
                title = ""
                price = -1.0
                log.msg("Shopping url: %s" % (response.url),
                        level=log.DEBUG,
                        spider=SmzdmSpider)
                log.msg("Real shopping url: %s" %
                        (response.webdriver.current_url),
                        level=log.DEBUG,
                        spider=SmzdmSpider)
                # The browser may have been redirected; match against
                # the URL it actually ended up on.
                url = response.webdriver.current_url
                # description_xpath / description_img_xpath are unused
                # here (their consumers were dead, commented-out code,
                # now removed); kept in the unpack for the tuple shape.
                for url_pattern, (title_xpath, price_xpath,
                                  price_redundant_pattern, description_xpath,
                                  description_img_xpath, currency,
                                  title_img_xpath_list
                                  ) in self.__url_pattern_xpath_dict.items():
                    if not url_pattern.match(url):
                        continue
                    log.msg("Shopping url pattern is found",
                            level=log.DEBUG,
                            spider=SmzdmSpider)
                    title_sel_list = sel.select(title_xpath)
                    if len(title_sel_list):
                        title = self.normalize_text(
                            title_sel_list[0].extract())
                    else:
                        log.msg("Shopping page error:\ttitle is not found",
                                level=log.ERROR,
                                spider=SmzdmSpider)
                        # Was 'raise StopIteration' plus an unreachable
                        # 'continue'; PEP 479 makes the raise a
                        # RuntimeError, so end the generator directly.
                        return
                    price_sel_list = sel.select(price_xpath)
                    if len(price_sel_list):
                        # Strip shop-specific decoration before parsing.
                        price_text = price_redundant_pattern.sub(
                            '', price_sel_list[0].extract())
                        try:
                            price = float(price_text)
                            # kiddies24 renders prices in cents.
                            if url.startswith("http://www.kiddies24.de"):
                                price /= 100
                        except (TypeError, ValueError):
                            log.msg(
                                "Shopping page error:\tThis item is sold out, the price is %s"
                                % (price),
                                level=log.WARNING,
                                spider=SmzdmSpider)
                    else:
                        log.msg("Shopping page error:\tprice is not found",
                                level=log.WARNING,
                                spider=SmzdmSpider)
                    # The first non-empty candidate becomes the title image.
                    title_img_sel_list = []
                    for title_img_xpath in title_img_xpath_list:
                        title_img_sel_list += sel.select(title_img_xpath)
                    title_img_src = ""
                    for title_img_sel in title_img_sel_list:
                        title_img_src = title_img_sel.extract()
                        if title_img_src:
                            img_src_list.append(title_img_src)
                            break
                    log.msg("Shopping item: [%s] [%s] [%s] [%s] [%s]" %
                            (title, description, price, url, referer),
                            level=log.DEBUG,
                            spider=SmzdmSpider)
                    yield items.ShoppingItem(title=title,
                                             price=price,
                                             url=url,
                                             referer=referer,
                                             image_urls=img_src_list,
                                             title_image_url=title_img_src,
                                             description=description,
                                             currency=currency)
        except Exception:
            # Narrowed from a bare except; return ends the generator.
            log.msg("Shopping item page parse failed:\t[%s]" % (response.url),
                    level=log.ERROR,
                    spider=SmzdmSpider)
            return
コード例 #9
0
 def parse_smzdm_item_page(self, response):
     """Parse a smzdm item detail page.

     Yields a WebdriverRequest for the item's outbound shopping link
     (when present) followed by a populated items.SmzdmItem carrying
     title, price, description, images, vote and comment counters.
     """
     try:
         category = response.meta["category"]
         sel = WebdriverXPathSelector(response)

         def _count(xpath):
             # Best-effort counter extraction: a missing node or
             # non-numeric text defaults to 0, preserving the old
             # silent-fallback behavior of the four copy-pasted blocks.
             try:
                 return int(self.get_text_by_xpath(sel, xpath))
             except Exception:
                 return 0

         title_sel_list = sel.select('/html/body/section/div[1]/article/h1')
         attachment_sel_list = sel.select(
             '/html/body/section/div[1]/article/h1/span')
         if not len(title_sel_list):
             log.msg("Smzdm title parse failed:\t[%s]" % (response.url),
                     level=log.ERROR,
                     spider=SmzdmSpider)
             # Was 'raise StopIteration' (broken under PEP 479, and it
             # also triggered a second, redundant error log below).
             return
         title = self.normalize_text(title_sel_list[0].extract())
         item_name = title
         # The <span> attachments inside <h1> hold price/currency text;
         # strip them from the item name and parse their joined text.
         all_attachment = ''
         for attachment_sel in attachment_sel_list:
             attachment = attachment_sel.extract()
             item_name = item_name.replace(attachment, '')
             all_attachment += attachment
         price, currency = self.parse_price(all_attachment)
         item_shopping_url_sel_list = sel.select(
             "/html/body/section/div[1]/article/div[2]/div/div/a/@href")
         if len(item_shopping_url_sel_list):
             item_shopping_url = item_shopping_url_sel_list[0].extract()
             yield WebdriverRequest(item_shopping_url,
                                    meta={'referer': response.url},
                                    callback=self.parse_shopping_item_page)
         description_sel_list = sel.select(
             '/html/body/section/div[1]/article/div[2]/p[@itemprop="description"]'
         )
         description = ''
         img_src_list = []
         for description_sel in description_sel_list:
             description += self.normalize_text(description_sel.extract())
             for img_src_sel in description_sel.select(".//img/@src"):
                 img_src_list.append(img_src_sel.extract())
         worthy_vote = _count("//span[@id='rating_worthy_num']/text()")
         unworthy_vote = _count("//span[@id='rating_unworthy_num']/text()")
         favorite_count = _count("//a[@class='fav']/em/text()")
         comment_count = _count("//a[@class='comment']/em/text()")
         yield items.SmzdmItem(title=item_name, price=price, url=response.url,
                               description=description,
                               image_urls=img_src_list,
                               worthy_vote=worthy_vote,
                               unworthy_vote=unworthy_vote,
                               favorite_count=favorite_count,
                               comment_count=comment_count,
                               category=category, currency=currency)
     except Exception:
         # Narrowed from a bare except; return ends the generator
         # cleanly (PEP 479 forbids raising StopIteration here).
         log.msg("Smzdm item page parse failed:\t[%s]" % (response.url),
                 level=log.ERROR,
                 spider=SmzdmSpider)
         return
コード例 #10
0
    def parse_shopping_item_page(self, response):
        """Parse an amazon.cn shopping page for the auto-order robot.

        Follows JD jump pages, bails out for unsupported shops or for
        prices above the target from meta, yields an
        items.ShoppingItem, then attempts a one-click order.
        """
        try:
            sel = WebdriverXPathSelector(response)
            referer = response.meta["referer"]
            target_price = response.meta["target_price"]
            jd_jump_url_sel = sel.select("/html/body/div[5]/div/div/div[1]/div[2]/div[3]/a/@href")
            if jd_jump_url_sel:
                # Intermediate JD redirect page: follow the real URL.
                log.msg("JD jump url:\t[%s]" % (jd_jump_url_sel[0].extract()) , level=log.DEBUG, spider=SmzdmSpider)
                yield WebdriverRequest(jd_jump_url_sel[0].extract(), meta={'referer': referer}, callback=self.parse_shopping_item_page)
            else:
                img_src_list = []
                comment_list = []
                description = ""
                title = ""
                vote_count = ""
                vote_score = ""
                price = -1.0
                log.msg("Shopping url: %s" % (response.url), level=log.DEBUG, spider=SmzdmSpider)
                log.msg("Real shopping url: %s" % (response.webdriver.current_url), level=log.DEBUG, spider=SmzdmSpider)
                # Match against the URL the browser actually ended up on.
                url = response.webdriver.current_url
                hostname = urlparse(url).hostname
                # The ordering robot only knows how to drive amazon.cn.
                if hostname != "www.amazon.cn":
                    log.msg("Shopping robot does not support this site", level=log.INFO, spider=SmzdmSpider)
                    return
                for url_pattern, (title_xpath, price_xpath, price_redundant_pattern, description_xpath, description_img_xpath, currency, title_img_xpath_list, comment_xpath, vote_count_xpath, vote_score_xpath) in self.__url_pattern_xpath_dict.items():
                    if not url_pattern.match(url):
                        continue
                    log.msg("Shopping url pattern is found", level=log.DEBUG, spider=SmzdmSpider)
                    title_sel_list = sel.select(title_xpath)
                    if len(title_sel_list):
                        title = self.normalize_text(title_sel_list[0].extract())
                    else:
                        log.msg("Shopping page error:\ttitle is not found", level=log.ERROR, spider=SmzdmSpider)
                        # Was 'raise StopIteration' plus an unreachable
                        # 'continue'; PEP 479 makes the raise a
                        # RuntimeError, so end the generator directly.
                        return
                    price_sel_list = sel.select(price_xpath)
                    if len(price_sel_list):
                        # Strip shop-specific decoration before parsing.
                        price_text = price_redundant_pattern.sub('', price_sel_list[0].extract())
                        try:
                            price = float(price_text)
                            # kiddies24 renders prices in cents.
                            if url.startswith("http://www.kiddies24.de"):
                                price /= 100
                            # Skip items more than 5% above the target.
                            if (price - target_price) / target_price > 0.05:
                                log.msg("Price is not ideal. (current price: %f, target price: %f)" % (price, target_price), level=log.INFO, spider=SmzdmSpider)
                                return
                        except Exception:
                            traceback.print_exc()
                            log.msg("Shopping page error:\tThis item is sold out, the price is %s" % (price), level=log.WARNING, spider=SmzdmSpider)
                    else:
                        log.msg("Shopping page error:\tprice is not found", level=log.WARNING, spider=SmzdmSpider)
                    # The first non-empty candidate becomes the title image.
                    title_img_sel_list = []
                    for title_img_xpath in title_img_xpath_list:
                        title_img_sel_list += sel.select(title_img_xpath)
                    title_img_src = ""
                    for title_img_sel in title_img_sel_list:
                        title_img_src = title_img_sel.extract()
                        if title_img_src:
                            img_src_list.append(title_img_src)
                            break
                    # NOTE(review): unreachable while the amazon.cn-only
                    # guard above is active — confirm before relying on it.
                    if hostname == "item.jd.com":
                        # JD loads comments lazily: open the comment tab
                        # and give the page time to render them.
                        sel.webdriver.find_element_by_xpath("//li[@id='detail-tab-comm']/a").click()
                        time.sleep(2)
                    for comment_sel in sel.select(comment_xpath):
                        comment_list.append(comment_sel.extract())
                    vote_count_sel_list = sel.select(vote_count_xpath)
                    if len(vote_count_sel_list):
                        vote_count = vote_count_sel_list[0].extract()
                    else:
                        log.msg("Shopping page error:\tvote count is not found", level=log.ERROR, spider=SmzdmSpider)
                    vote_score_sel_list = sel.select(vote_score_xpath)
                    if len(vote_score_sel_list):
                        vote_score = vote_score_sel_list[0].extract()
                    else:
                        log.msg("Shopping page error:\tvote score is not found", level=log.ERROR, spider=SmzdmSpider)
                    log.msg("Shopping item: [%s] [%s] [%s] [%s] [%s]" % (title, description, price, url, referer) , level=log.DEBUG, spider=SmzdmSpider)
                    yield items.ShoppingItem(title=title, price=price, url=url, referer=referer, image_urls=img_src_list,
                                             title_image_url=title_img_src, description=description, currency=currency,
                                             comment_list=comment_list, vote_count=vote_count, vote_score=vote_score)
                    log.msg("Place the order!", level=log.INFO, spider=SmzdmSpider)
                    # One-click ordering: click the button when present,
                    # otherwise enable one-click via an action request
                    # and re-enter this callback.
                    sel = WebdriverXPathSelector(response)
                    one_click_button_list = sel.select('//*[@id="one-click-button"]')
                    if not one_click_button_list:
                        log.msg("Need to enable one click order!", level=log.DEBUG, spider=SmzdmSpider)
                        referer = response.meta["referer"]
                        enable_one_click_url_sel = response.webdriver.find_elements_by_xpath('//*[@id="oneClickSignIn"]')
                        if enable_one_click_url_sel:
                            log.msg("Enable one click order", level=log.DEBUG, spider=SmzdmSpider)
                            yield WebdriverActionRequest(response,
                                    actions=ActionChains(response.webdriver).click(enable_one_click_url_sel[0]),
                                    meta={'referer': referer},
                                    callback=self.parse_shopping_item_page)
                    else:
                        log.msg("One click order!", level=log.INFO, spider=SmzdmSpider)
                        one_click_button_list[0].click()
        except Exception:
            # Narrowed from a bare except; return ends the generator.
            traceback.print_exc()
            log.msg("Shopping item page parse failed:\t[%s]" % (response.url) , level=log.ERROR, spider=SmzdmSpider)
            return
コード例 #11
0
    def parse_shopping_item_page(self, response):
        """Parse a merchant (shop) item page into an items.ShoppingItem.

        A smzdm JD jump page is followed transparently; otherwise the
        webdriver's real (post-redirect) URL is matched against the
        per-shop xpath table to locate title, price and title image.
        """
        try:
            sel = WebdriverXPathSelector(response)
            referer = response.meta["referer"]
            jd_jump_url_sel = sel.select("/html/body/div[5]/div/div/div[1]/div[2]/div[3]/a/@href")
            if jd_jump_url_sel:
                # Intermediate JD redirect page: follow the real URL.
                log.msg("JD jump url:\t[%s]" % (jd_jump_url_sel[0].extract()) , level=log.DEBUG, spider=SmzdmSpider)
                yield WebdriverRequest(jd_jump_url_sel[0].extract(), meta={'referer': referer}, callback=self.parse_shopping_item_page)
            else:
                img_src_list = []
                description = ""
                title = ""
                price = -1.0
                log.msg("Shopping url: %s" % (response.url), level=log.DEBUG, spider=SmzdmSpider)
                log.msg("Real shopping url: %s" % (response.webdriver.current_url), level=log.DEBUG, spider=SmzdmSpider)
                # Match against the URL the browser actually ended up on.
                url = response.webdriver.current_url
                # description_xpath / description_img_xpath are unused
                # here (their consumers were dead, commented-out code,
                # now removed); kept in the unpack for the tuple shape.
                for url_pattern, (title_xpath, price_xpath, price_redundant_pattern, description_xpath, description_img_xpath, currency, title_img_xpath_list) in self.__url_pattern_xpath_dict.items():
                    if not url_pattern.match(url):
                        continue
                    log.msg("Shopping url pattern is found", level=log.DEBUG, spider=SmzdmSpider)
                    title_sel_list = sel.select(title_xpath)
                    if len(title_sel_list):
                        title = self.normalize_text(title_sel_list[0].extract())
                    else:
                        log.msg("Shopping page error:\ttitle is not found", level=log.ERROR, spider=SmzdmSpider)
                        # Was 'raise StopIteration' plus an unreachable
                        # 'continue'; PEP 479 makes the raise a
                        # RuntimeError, so end the generator directly.
                        return
                    price_sel_list = sel.select(price_xpath)
                    if len(price_sel_list):
                        # Strip shop-specific decoration before parsing.
                        price_text = price_redundant_pattern.sub('', price_sel_list[0].extract())
                        try:
                            price = float(price_text)
                            # kiddies24 renders prices in cents.
                            if url.startswith("http://www.kiddies24.de"):
                                price /= 100
                        except (TypeError, ValueError):
                            log.msg("Shopping page error:\tThis item is sold out, the price is %s" % (price), level=log.WARNING, spider=SmzdmSpider)
                    else:
                        log.msg("Shopping page error:\tprice is not found", level=log.WARNING, spider=SmzdmSpider)
                    # The first non-empty candidate becomes the title image.
                    title_img_sel_list = []
                    for title_img_xpath in title_img_xpath_list:
                        title_img_sel_list += sel.select(title_img_xpath)
                    title_img_src = ""
                    for title_img_sel in title_img_sel_list:
                        title_img_src = title_img_sel.extract()
                        if title_img_src:
                            img_src_list.append(title_img_src)
                            break
                    log.msg("Shopping item: [%s] [%s] [%s] [%s] [%s]" % (title, description, price, url, referer) , level=log.DEBUG, spider=SmzdmSpider)
                    yield items.ShoppingItem(title=title, price=price, url=url, referer=referer, image_urls=img_src_list, title_image_url=title_img_src, description=description, currency=currency)
        except Exception:
            # Narrowed from a bare except; return ends the generator.
            log.msg("Shopping item page parse failed:\t[%s]" % (response.url) , level=log.ERROR, spider=SmzdmSpider)
            return
コード例 #12
0
    def parse_shopping_item_page(self, response):
        """Parse a merchant product page reached from a deal entry.

        If the page is a JD jump/redirect page, re-request the real product
        URL with the same callback.  Otherwise, match the final URL against
        the per-site XPath table and extract title, price, title image,
        comments and vote data, yield an ``items.ShoppingItem``, and (on
        Amazon.cn) attempt to place a one-click order.

        :param response: webdriver response; ``response.meta`` must carry
            ``referer`` and ``target_price``.
        :yields: ``items.ShoppingItem`` and follow-up webdriver requests.
        """
        try:
            sel = WebdriverXPathSelector(response)
            referer = response.meta["referer"]
            target_price = response.meta["target_price"]
            jd_jump_url_sel = sel.select(
                "/html/body/div[5]/div/div/div[1]/div[2]/div[3]/a/@href")
            if jd_jump_url_sel:
                log.msg("JD jump url:\t[%s]" % (jd_jump_url_sel[0].extract()),
                        level=log.DEBUG,
                        spider=SmzdmSpider)
                # Bug fix: also propagate target_price -- this same callback
                # reads response.meta["target_price"], so omitting it made
                # every re-entry fail with KeyError.
                yield WebdriverRequest(jd_jump_url_sel[0].extract(),
                                       meta={'referer': referer,
                                             'target_price': target_price},
                                       callback=self.parse_shopping_item_page)
            else:
                img_src_list = []
                comment_list = []
                description = ""
                title = ""
                vote_count = ""
                vote_score = ""
                price = -1.0
                log.msg("Shopping url: %s" % (response.url),
                        level=log.DEBUG,
                        spider=SmzdmSpider)
                log.msg("Real shopping url: %s" %
                        (response.webdriver.current_url),
                        level=log.DEBUG,
                        spider=SmzdmSpider)
                # Use the browser's final URL (after any client-side
                # redirects), not the originally requested one.
                url = response.webdriver.current_url
                hostname = urlparse(url).hostname
                if hostname != "www.amazon.cn":
                    log.msg("Shopping robot does not support this site",
                            level=log.INFO,
                            spider=SmzdmSpider)
                    return
                for url_pattern, (title_xpath, price_xpath,
                                  price_redudant_pattern, description_xpath,
                                  description_img_xpath, currency,
                                  title_img_xpath_list, comment_xpath,
                                  vote_count_xpath, vote_score_xpath
                                  ) in self.__url_pattern_xpath_dict.items():
                    if url_pattern.match(url):
                        log.msg("Shopping url pattern is found",
                                level=log.DEBUG,
                                spider=SmzdmSpider)
                        title_sel_list = sel.select(title_xpath)
                        if len(title_sel_list):
                            title = self.normalize_text(
                                title_sel_list[0].extract())
                        else:
                            log.msg("Shopping page error:\ttitle is not found",
                                    level=log.ERROR,
                                    spider=SmzdmSpider)
                            # PEP 479: `raise StopIteration` inside a
                            # generator becomes RuntimeError on Python 3.7+;
                            # a plain return ends the generator safely.  The
                            # old `continue` after the raise was unreachable.
                            return
                        price_sel_list = sel.select(price_xpath)
                        if len(price_sel_list):
                            price_text = price_sel_list[0].extract()
                            # Strip site-specific noise (currency symbols,
                            # thousands separators, ...) before parsing.
                            price_text = price_redudant_pattern.sub(
                                '', price_text)
                            try:
                                price = float(price_text)
                                # kiddies24.de publishes prices in cents.
                                if url.startswith("http://www.kiddies24.de"):
                                    price /= 100
                                # Skip items more than 5% over the target.
                                # NOTE(review): divides by target_price --
                                # assumes it is never 0; confirm upstream.
                                if (price -
                                        target_price) / target_price > 0.05:
                                    log.msg(
                                        "Price is not ideal. (current price: %f, target price: %f)"
                                        % (price, target_price),
                                        level=log.INFO,
                                        spider=SmzdmSpider)
                                    return
                            except Exception:
                                traceback.print_exc()
                                log.msg(
                                    "Shopping page error:\tThis item is sold out, the price is %s"
                                    % (price),
                                    level=log.WARNING,
                                    spider=SmzdmSpider)
                        else:
                            log.msg("Shopping page error:\tprice is not found",
                                    level=log.WARNING,
                                    spider=SmzdmSpider)
                        # Take the first non-empty candidate title image.
                        title_img_sel_list = []
                        for title_img_xpath in title_img_xpath_list:
                            title_img_sel_list += sel.select(title_img_xpath)
                        title_img_src = ""
                        for title_img_sel in title_img_sel_list:
                            title_img_src = title_img_sel.extract()
                            if title_img_src:
                                img_src_list.append(title_img_src)
                                break
                        if hostname == "item.jd.com":
                            # JD renders comments lazily; click the comments
                            # tab and give the page time to load them.
                            sel.webdriver.find_element_by_xpath(
                                "//li[@id='detail-tab-comm']/a").click()
                            time.sleep(2)
                        for comment_sel in sel.select(comment_xpath):
                            comment_list.append(comment_sel.extract())
                        vote_count_sel_list = sel.select(vote_count_xpath)
                        if len(vote_count_sel_list):
                            vote_count = vote_count_sel_list[0].extract()
                        else:
                            log.msg(
                                "Shopping page error:\tvote count is not found",
                                level=log.ERROR,
                                spider=SmzdmSpider)
                        vote_score_sel_list = sel.select(vote_score_xpath)
                        if len(vote_score_sel_list):
                            vote_score = vote_score_sel_list[0].extract()
                        else:
                            log.msg(
                                "Shopping page error:\tvote score is not found",
                                level=log.ERROR,
                                spider=SmzdmSpider)
                        log.msg("Shopping item: [%s] [%s] [%s] [%s] [%s]" %
                                (title, description, price, url, referer),
                                level=log.DEBUG,
                                spider=SmzdmSpider)
                        yield items.ShoppingItem(title=title, price=price, url=url, referer=referer, image_urls=img_src_list, \
                                title_image_url=title_img_src, description=description, currency=currency, \
                                comment_list=comment_list, vote_count=vote_count, vote_score=vote_score)
                        log.msg("Place the order!",
                                level=log.INFO,
                                spider=SmzdmSpider)

                        # One-click ordering: if the button is missing we
                        # first have to enable the feature via its sign-in
                        # control, then re-enter this callback.
                        sel = WebdriverXPathSelector(response)
                        one_click_button_list = sel.select(
                            '//*[@id="one-click-button"]')
                        if not one_click_button_list:
                            log.msg("Need to enable one click order!",
                                    level=log.DEBUG,
                                    spider=SmzdmSpider)
                            referer = response.meta["referer"]
                            enable_one_click_url_sel = response.webdriver.find_elements_by_xpath(
                                '//*[@id="oneClickSignIn"]')
                            if enable_one_click_url_sel:
                                log.msg("Enable one click order",
                                        level=log.DEBUG,
                                        spider=SmzdmSpider)
                                # Bug fix: carry target_price through, the
                                # callback requires it (see JD jump above).
                                yield WebdriverActionRequest(response, \
                                        actions=ActionChains(response.webdriver).click(enable_one_click_url_sel[0]), \
                                        meta={'referer': referer,
                                              'target_price': target_price}, \
                                        callback=self.parse_shopping_item_page)
                        else:
                            log.msg("One click order!",
                                    level=log.INFO,
                                    spider=SmzdmSpider)
                            one_click_button_list[0].click()
        except Exception:
            # Narrowed from a bare `except:`; best-effort logging boundary
            # for any scrape failure on this page.
            traceback.print_exc()
            log.msg("Shopping item page parse failed:\t[%s]" % (response.url),
                    level=log.ERROR,
                    spider=SmzdmSpider)
            return