def parse_smzdm_list_page(self, response):
    """Parse a smzdm list page.

    Yields a WebdriverRequest per unseen item URL (handled by
    parse_smzdm_item_page) and follows the "next page" link back into
    this callback.
    """
    try:
        category = response.meta["category"]
        sel = WebdriverXPathSelector(response)
        item_url_sel_list = sel.select(
            "/html/body/section//div[@class='listTitle']"
            "/h3[@class='itemName']/a/@href")
        for item_url_sel in item_url_sel_list:
            item_url = item_url_sel.extract()
            # Skip URLs already scheduled in this crawl.
            if item_url not in self.urls_seen:
                yield WebdriverRequest(item_url,
                                       meta={'category': category},
                                       callback=self.parse_smzdm_item_page)
        next_page_xpath = "//li[@class='pagedown']/a/@href"
        for next_page_url_sel in sel.select(next_page_xpath):
            next_page_url = next_page_url_sel.extract()
            yield WebdriverRequest(next_page_url,
                                   meta={'category': category},
                                   callback=self.parse_smzdm_list_page)
    except Exception:
        # Was a bare `except:` which also swallows SystemExit/KeyboardInterrupt.
        log.msg("Smzdm list page parse failed:\t[%s]" % (response.url),
                level=log.ERROR, spider=SmzdmSpider)
        # `raise StopIteration` inside a generator is a RuntimeError under
        # PEP 479 (Python 3.7+); a plain return ends the generator safely.
        return
def parse_smzdm_item_page(self, response):
    """Parse a smzdm item detail page into a SmzdmItem.

    Also yields a WebdriverRequest to the linked merchant (shopping)
    page when one is present, carrying this page's URL as referer.
    """
    try:
        category = response.meta["category"]
        sel = WebdriverXPathSelector(response)
        title_sel_list = sel.select('/html/body/section/div[1]/article/h1')
        attachment_sel_list = sel.select(
            '/html/body/section/div[1]/article/h1/span')
        if len(title_sel_list):
            title = self.normalize_text(title_sel_list[0].extract())
            item_name = title
        else:
            log.msg("Smzdm title parse failed:\t[%s]" % (response.url),
                    level=log.ERROR, spider=SmzdmSpider)
            # Previously `raise StopIteration`, which the outer bare except
            # caught and logged a second, misleading error for; a plain
            # return ends the generator (and avoids PEP 479 RuntimeError).
            return
        # The <span> elements inside the title carry price text; strip
        # them from the display name and parse them for price/currency.
        all_attachment = ''
        for attachment_sel in attachment_sel_list:
            attachment = attachment_sel.extract()
            item_name = item_name.replace(attachment, '')
            all_attachment += attachment
        price, currency = self.parse_price(all_attachment)
        item_shopping_url_sel_list = sel.select(
            "/html/body/section/div[1]/article/div[2]/div/div/a/@href")
        if len(item_shopping_url_sel_list):
            item_shopping_url = item_shopping_url_sel_list[0].extract()
            yield WebdriverRequest(item_shopping_url,
                                   meta={'referer': response.url},
                                   callback=self.parse_shopping_item_page)
        description_sel_list = sel.select(
            '/html/body/section/div[1]/article/div[2]'
            '/p[@itemprop="description"]')
        description = ''
        img_src_list = []
        for description_sel in description_sel_list:
            description += self.normalize_text(description_sel.extract())
            for img_src_sel in description_sel.select(".//img/@src"):
                img_src_list.append(img_src_sel.extract())

        def _count(xpath):
            # Counters are optional page elements; default to 0 when the
            # node is missing or its text is not an integer.
            try:
                return int(self.get_text_by_xpath(sel, xpath))
            except Exception:
                return 0

        worthy_vote = _count("//span[@id='rating_worthy_num']/text()")
        unworthy_vote = _count("//span[@id='rating_unworthy_num']/text()")
        favorite_count = _count("//a[@class='fav']/em/text()")
        comment_count = _count("//a[@class='comment']/em/text()")
        yield items.SmzdmItem(title=item_name, price=price,
                              url=response.url, description=description,
                              image_urls=img_src_list,
                              worthy_vote=worthy_vote,
                              unworthy_vote=unworthy_vote,
                              favorite_count=favorite_count,
                              comment_count=comment_count,
                              category=category, currency=currency)
    except Exception:
        log.msg("Smzdm item page parse failed:\t[%s]" % (response.url),
                level=log.ERROR, spider=SmzdmSpider)
        return
def submit_login_info(self, response):
    """Fill the sign-in form with the stored credentials and submit it."""
    selector = WebdriverXPathSelector(response)
    # (field xpath, value to type) pairs for the login form.
    fields = (
        ('//*[@id="ap_email"]', self.formdata["email"]),
        ('//*[@id="ap_password"]', self.formdata["password"]),
    )
    for xpath, value in fields:
        selector.select(xpath)[0].send_keys(value)
    selector.select('//*[@id="signInSubmit"]')[0].click()
    self.check_login(response)
    # Give the post-submit navigation a moment before continuing.
    time.sleep(1)
def order_item(self, response):
    """Place a one-click order on the current merchant page.

    If the one-click button is missing, clicks the "enable one-click"
    link and returns a WebdriverActionRequest that repeats the click and
    re-enters parse_shopping_item_page.  BUG FIX: the original built
    that request and silently discarded it, so it was never scheduled —
    it is now returned for the caller to schedule.  Returns None when
    the order button was clicked directly.
    """
    sel = WebdriverXPathSelector(response)
    one_click_button_list = sel.select('//*[@id="one-click-button"]')
    if not one_click_button_list:
        log.msg("Need to enable one click order!", level=log.DEBUG,
                spider=SmzdmSpider)
        enable_one_click_url_sel = sel.select(
            '//*[@id="oneClickSignIn"]/a/span')
        if enable_one_click_url_sel:
            log.msg("Enable one click order", level=log.DEBUG,
                    spider=SmzdmSpider)
            enable_one_click_url_sel[0].click()
            return WebdriverActionRequest(
                response,
                actions=ActionChains(response.webdriver).click(
                    enable_one_click_url_sel[0]),
                callback=self.parse_shopping_item_page)
    else:
        log.msg("One click order!", level=log.INFO, spider=SmzdmSpider)
        one_click_button_list[0].click()
def submit_login_info(self, response):
    """Type the configured email/password into the sign-in form and submit."""
    page = WebdriverXPathSelector(response)
    email_field = page.select('//*[@id="ap_email"]')[0]
    email_field.send_keys(self.formdata["email"])
    password_field = page.select('//*[@id="ap_password"]')[0]
    password_field.send_keys(self.formdata["password"])
    submit_button = page.select('//*[@id="signInSubmit"]')[0]
    submit_button.click()
    self.check_login(response)
    # Brief pause so the submission can take effect.
    time.sleep(1)
def parse_smzdm_list_page(self, response):
    """Parse a smzdm list page: request each unseen item and the next page.

    Item URLs are handed to parse_smzdm_item_page; the pagination link
    loops back into this callback.
    """
    try:
        category = response.meta["category"]
        sel = WebdriverXPathSelector(response)
        item_url_sel_list = sel.select(
            "/html/body/section//div[@class='listTitle']"
            "/h3[@class='itemName']/a/@href")
        for item_url_sel in item_url_sel_list:
            item_url = item_url_sel.extract()
            # De-duplicate against URLs already seen this run.
            if item_url not in self.urls_seen:
                yield WebdriverRequest(item_url,
                                       meta={'category': category},
                                       callback=self.parse_smzdm_item_page)
        next_page_xpath = "//li[@class='pagedown']/a/@href"
        for next_page_url_sel in sel.select(next_page_xpath):
            yield WebdriverRequest(next_page_url_sel.extract(),
                                   meta={'category': category},
                                   callback=self.parse_smzdm_list_page)
    except Exception:
        # Narrowed from a bare `except:`.
        log.msg("Smzdm list page parse failed:\t[%s]" % (response.url),
                level=log.ERROR, spider=SmzdmSpider)
        # `raise StopIteration` inside a generator is a RuntimeError under
        # PEP 479; `return` terminates the generator cleanly.
        return
def order_item(self, response):
    """Trigger a one-click order, enabling one-click mode if required.

    BUG FIX: the WebdriverActionRequest for enabling one-click was
    created and dropped in the original, so the follow-up never ran; it
    is now returned so the caller can schedule it.  Returns None when
    the order button existed and was clicked.
    """
    sel = WebdriverXPathSelector(response)
    one_click_button_list = sel.select('//*[@id="one-click-button"]')
    if not one_click_button_list:
        log.msg("Need to enable one click order!", level=log.DEBUG,
                spider=SmzdmSpider)
        enable_one_click_url_sel = sel.select(
            '//*[@id="oneClickSignIn"]/a/span')
        if enable_one_click_url_sel:
            log.msg("Enable one click order", level=log.DEBUG,
                    spider=SmzdmSpider)
            enable_one_click_url_sel[0].click()
            return WebdriverActionRequest(
                response,
                actions=ActionChains(response.webdriver).click(
                    enable_one_click_url_sel[0]),
                callback=self.parse_shopping_item_page)
    else:
        log.msg("One click order!", level=log.INFO, spider=SmzdmSpider)
        one_click_button_list[0].click()
def parse_shopping_item_page(self, response):
    """Parse a merchant (shopping) page into a ShoppingItem.

    A JD "jump" page is followed to the real item page first.  For a
    direct item page, the final (post-redirect) URL is matched against
    self.__url_pattern_xpath_dict to pick the site-specific xpaths for
    title, price and images.
    """
    try:
        sel = WebdriverXPathSelector(response)
        referer = response.meta["referer"]
        jd_jump_url_sel = sel.select(
            "/html/body/div[5]/div/div/div[1]/div[2]/div[3]/a/@href")
        if jd_jump_url_sel:
            log.msg("JD jump url:\t[%s]" % (jd_jump_url_sel[0].extract()),
                    level=log.DEBUG, spider=SmzdmSpider)
            yield WebdriverRequest(jd_jump_url_sel[0].extract(),
                                   meta={'referer': referer},
                                   callback=self.parse_shopping_item_page)
        else:
            img_src_list = []
            description = ""
            title = ""
            price = -1.0
            log.msg("Shopping url: %s" % (response.url),
                    level=log.DEBUG, spider=SmzdmSpider)
            log.msg("Real shopping url: %s" % (response.webdriver.current_url),
                    level=log.DEBUG, spider=SmzdmSpider)
            # The browser may have been redirected; match on the final URL.
            url = response.webdriver.current_url
            for url_pattern, (title_xpath, price_xpath,
                              price_redudant_pattern, description_xpath,
                              description_img_xpath, currency,
                              title_img_xpath_list
                              ) in self.__url_pattern_xpath_dict.items():
                if not url_pattern.match(url):
                    continue
                log.msg("Shopping url pattern is found", level=log.DEBUG,
                        spider=SmzdmSpider)
                title_sel_list = sel.select(title_xpath)
                if len(title_sel_list):
                    title = self.normalize_text(title_sel_list[0].extract())
                else:
                    log.msg("Shopping page error:\ttitle is not found",
                            level=log.ERROR, spider=SmzdmSpider)
                    # Originally `raise StopIteration` (caught by the bare
                    # except below, and a RuntimeError under PEP 479)
                    # followed by an unreachable `continue`; end the
                    # generator instead.
                    return
                price_sel_list = sel.select(price_xpath)
                if len(price_sel_list):
                    price_text = price_redudant_pattern.sub(
                        '', price_sel_list[0].extract())
                    try:
                        price = float(price_text)
                        # kiddies24.de lists prices in cents.
                        if url.startswith("http://www.kiddies24.de"):
                            price /= 100
                    except (ValueError, TypeError):
                        log.msg("Shopping page error:\tThis item is sold out, the price is %s" % (price),
                                level=log.WARNING, spider=SmzdmSpider)
                else:
                    log.msg("Shopping page error:\tprice is not found",
                            level=log.WARNING, spider=SmzdmSpider)
                title_img_sel_list = []
                for title_img_xpath in title_img_xpath_list:
                    title_img_sel_list += sel.select(title_img_xpath)
                title_img_src = ""
                # Keep the first non-empty title image only.
                for title_img_sel in title_img_sel_list:
                    title_img_src = title_img_sel.extract()
                    if title_img_src:
                        img_src_list.append(title_img_src)
                        break
            log.msg("Shopping item: [%s] [%s] [%s] [%s] [%s]" %
                    (title, description, price, url, referer),
                    level=log.DEBUG, spider=SmzdmSpider)
            # NOTE(review): when no pattern matched, `currency` and
            # `title_img_src` are unbound here; the NameError is reported
            # by the handler below — confirm this is the intended way to
            # reject unsupported sites.
            yield items.ShoppingItem(title=title, price=price, url=url,
                                     referer=referer,
                                     image_urls=img_src_list,
                                     title_image_url=title_img_src,
                                     description=description,
                                     currency=currency)
    except Exception:
        log.msg("Shopping item page parse failed:\t[%s]" % (response.url),
                level=log.ERROR, spider=SmzdmSpider)
        return
def parse_smzdm_item_page(self, response):
    """Parse a smzdm item detail page and emit a SmzdmItem.

    When the article links to a merchant page, a WebdriverRequest for it
    is yielded as well (callback parse_shopping_item_page).
    """
    try:
        category = response.meta["category"]
        sel = WebdriverXPathSelector(response)
        title_sel_list = sel.select('/html/body/section/div[1]/article/h1')
        attachment_sel_list = sel.select(
            '/html/body/section/div[1]/article/h1/span')
        if len(title_sel_list):
            title = self.normalize_text(title_sel_list[0].extract())
            item_name = title
        else:
            log.msg("Smzdm title parse failed:\t[%s]" % (response.url),
                    level=log.ERROR, spider=SmzdmSpider)
            # Was `raise StopIteration`: PEP 479 hazard, and it was caught
            # by the outer bare except producing a second error log.
            return
        # Strip the inline <span> price "attachments" from the title and
        # parse them for price/currency.
        all_attachment = ''
        for attachment_sel in attachment_sel_list:
            attachment = attachment_sel.extract()
            item_name = item_name.replace(attachment, '')
            all_attachment += attachment
        price, currency = self.parse_price(all_attachment)
        item_shopping_url_sel_list = sel.select(
            "/html/body/section/div[1]/article/div[2]/div/div/a/@href")
        if len(item_shopping_url_sel_list):
            item_shopping_url = item_shopping_url_sel_list[0].extract()
            yield WebdriverRequest(item_shopping_url,
                                   meta={'referer': response.url},
                                   callback=self.parse_shopping_item_page)
        description_sel_list = sel.select(
            '/html/body/section/div[1]/article/div[2]'
            '/p[@itemprop="description"]')
        description = ''
        img_src_list = []
        for description_sel in description_sel_list:
            description += self.normalize_text(description_sel.extract())
            for img_src_sel in description_sel.select(".//img/@src"):
                img_src_list.append(img_src_sel.extract())
        # Each counter defaults to 0 when missing/non-numeric.
        try:
            worthy_vote = int(self.get_text_by_xpath(
                sel, "//span[@id='rating_worthy_num']/text()"))
        except Exception:
            worthy_vote = 0
        try:
            unworthy_vote = int(self.get_text_by_xpath(
                sel, "//span[@id='rating_unworthy_num']/text()"))
        except Exception:
            unworthy_vote = 0
        try:
            favorite_count = int(self.get_text_by_xpath(
                sel, "//a[@class='fav']/em/text()"))
        except Exception:
            favorite_count = 0
        try:
            comment_count = int(self.get_text_by_xpath(
                sel, "//a[@class='comment']/em/text()"))
        except Exception:
            comment_count = 0
        yield items.SmzdmItem(title=item_name, price=price,
                              url=response.url, description=description,
                              image_urls=img_src_list,
                              worthy_vote=worthy_vote,
                              unworthy_vote=unworthy_vote,
                              favorite_count=favorite_count,
                              comment_count=comment_count,
                              category=category, currency=currency)
    except Exception:
        log.msg("Smzdm item page parse failed:\t[%s]" % (response.url),
                level=log.ERROR, spider=SmzdmSpider)
        return
def parse_shopping_item_page(self, response):
    """Parse a merchant page, emit a ShoppingItem, and place an order.

    Follows JD jump pages; only www.amazon.cn is supported for the
    ordering flow.  The item is ordered only when the current price is
    within 5% above the target price carried in response.meta.
    """
    try:
        sel = WebdriverXPathSelector(response)
        referer = response.meta["referer"]
        target_price = response.meta["target_price"]
        jd_jump_url_sel = sel.select(
            "/html/body/div[5]/div/div/div[1]/div[2]/div[3]/a/@href")
        if jd_jump_url_sel:
            log.msg("JD jump url:\t[%s]" % (jd_jump_url_sel[0].extract()),
                    level=log.DEBUG, spider=SmzdmSpider)
            yield WebdriverRequest(jd_jump_url_sel[0].extract(),
                                   meta={'referer': referer},
                                   callback=self.parse_shopping_item_page)
        else:
            img_src_list = []
            comment_list = []
            description = ""
            title = ""
            vote_count = ""
            vote_score = ""
            price = -1.0
            log.msg("Shopping url: %s" % (response.url),
                    level=log.DEBUG, spider=SmzdmSpider)
            log.msg("Real shopping url: %s" % (response.webdriver.current_url),
                    level=log.DEBUG, spider=SmzdmSpider)
            url = response.webdriver.current_url
            hostname = urlparse(url).hostname
            # The automated ordering flow is Amazon-China only.
            if hostname != "www.amazon.cn":
                log.msg("Shopping robot does not support this site",
                        level=log.INFO, spider=SmzdmSpider)
                return
            for url_pattern, (title_xpath, price_xpath,
                              price_redudant_pattern, description_xpath,
                              description_img_xpath, currency,
                              title_img_xpath_list, comment_xpath,
                              vote_count_xpath, vote_score_xpath
                              ) in self.__url_pattern_xpath_dict.items():
                if not url_pattern.match(url):
                    continue
                log.msg("Shopping url pattern is found", level=log.DEBUG,
                        spider=SmzdmSpider)
                title_sel_list = sel.select(title_xpath)
                if len(title_sel_list):
                    title = self.normalize_text(title_sel_list[0].extract())
                else:
                    log.msg("Shopping page error:\ttitle is not found",
                            level=log.ERROR, spider=SmzdmSpider)
                    # Was `raise StopIteration` (PEP 479 hazard, swallowed
                    # by the bare except) plus an unreachable `continue`.
                    return
                price_sel_list = sel.select(price_xpath)
                if len(price_sel_list):
                    price_text = price_redudant_pattern.sub(
                        '', price_sel_list[0].extract())
                    try:
                        price = float(price_text)
                        # kiddies24.de lists prices in cents.
                        if url.startswith("http://www.kiddies24.de"):
                            price /= 100
                        # Guard against target_price == 0, which previously
                        # raised ZeroDivisionError and was mis-logged as
                        # "sold out".
                        if (target_price and
                                (price - target_price) / target_price > 0.05):
                            log.msg("Price is not ideal. (current price: %f, target price: %f)"
                                    % (price, target_price),
                                    level=log.INFO, spider=SmzdmSpider)
                            return
                    except (ValueError, TypeError):
                        traceback.print_exc()
                        log.msg("Shopping page error:\tThis item is sold out, the price is %s" % (price),
                                level=log.WARNING, spider=SmzdmSpider)
                else:
                    log.msg("Shopping page error:\tprice is not found",
                            level=log.WARNING, spider=SmzdmSpider)
                title_img_sel_list = []
                for title_img_xpath in title_img_xpath_list:
                    title_img_sel_list += sel.select(title_img_xpath)
                title_img_src = ""
                for title_img_sel in title_img_sel_list:
                    title_img_src = title_img_sel.extract()
                    if title_img_src:
                        img_src_list.append(title_img_src)
                        break
                # NOTE(review): unreachable after the www.amazon.cn guard
                # above — kept for parity with the original; confirm
                # whether JD support should be restored or removed.
                if hostname == "item.jd.com":
                    sel.webdriver.find_element_by_xpath(
                        "//li[@id='detail-tab-comm']/a").click()
                    time.sleep(2)
                for comment_sel in sel.select(comment_xpath):
                    comment_list.append(comment_sel.extract())
                vote_count_sel_list = sel.select(vote_count_xpath)
                if len(vote_count_sel_list):
                    vote_count = vote_count_sel_list[0].extract()
                else:
                    log.msg("Shopping page error:\tvote count is not found",
                            level=log.ERROR, spider=SmzdmSpider)
                vote_score_sel_list = sel.select(vote_score_xpath)
                if len(vote_score_sel_list):
                    vote_score = vote_score_sel_list[0].extract()
                else:
                    log.msg("Shopping page error:\tvote score is not found",
                            level=log.ERROR, spider=SmzdmSpider)
            log.msg("Shopping item: [%s] [%s] [%s] [%s] [%s]" %
                    (title, description, price, url, referer),
                    level=log.DEBUG, spider=SmzdmSpider)
            yield items.ShoppingItem(title=title, price=price, url=url,
                                     referer=referer,
                                     image_urls=img_src_list,
                                     title_image_url=title_img_src,
                                     description=description,
                                     currency=currency,
                                     comment_list=comment_list,
                                     vote_count=vote_count,
                                     vote_score=vote_score)
            log.msg("Place the order!", level=log.INFO, spider=SmzdmSpider)
            one_click_button_list = sel.select('//*[@id="one-click-button"]')
            if not one_click_button_list:
                log.msg("Need to enable one click order!", level=log.DEBUG,
                        spider=SmzdmSpider)
                enable_one_click_url_sel = \
                    response.webdriver.find_elements_by_xpath(
                        '//*[@id="oneClickSignIn"]')
                if enable_one_click_url_sel:
                    log.msg("Enable one click order", level=log.DEBUG,
                            spider=SmzdmSpider)
                    yield WebdriverActionRequest(
                        response,
                        actions=ActionChains(response.webdriver).click(
                            enable_one_click_url_sel[0]),
                        meta={'referer': referer},
                        callback=self.parse_shopping_item_page)
            else:
                log.msg("One click order!", level=log.INFO,
                        spider=SmzdmSpider)
                one_click_button_list[0].click()
    except Exception:
        traceback.print_exc()
        log.msg("Shopping item page parse failed:\t[%s]" % (response.url),
                level=log.ERROR, spider=SmzdmSpider)
        return
def parse_shopping_item_page(self, response):
    """Parse a merchant page into a ShoppingItem (read-only variant).

    JD jump pages are followed first; the final URL is matched against
    the per-site xpath table to extract title, price and images.
    """
    try:
        sel = WebdriverXPathSelector(response)
        referer = response.meta["referer"]
        jd_jump_url_sel = sel.select(
            "/html/body/div[5]/div/div/div[1]/div[2]/div[3]/a/@href")
        if jd_jump_url_sel:
            log.msg("JD jump url:\t[%s]" % (jd_jump_url_sel[0].extract()),
                    level=log.DEBUG, spider=SmzdmSpider)
            yield WebdriverRequest(jd_jump_url_sel[0].extract(),
                                   meta={'referer': referer},
                                   callback=self.parse_shopping_item_page)
        else:
            img_src_list = []
            description = ""
            title = ""
            price = -1.0
            log.msg("Shopping url: %s" % (response.url),
                    level=log.DEBUG, spider=SmzdmSpider)
            log.msg("Real shopping url: %s" % (response.webdriver.current_url),
                    level=log.DEBUG, spider=SmzdmSpider)
            # Match against the final, possibly redirected, URL.
            url = response.webdriver.current_url
            for url_pattern, (title_xpath, price_xpath,
                              price_redudant_pattern, description_xpath,
                              description_img_xpath, currency,
                              title_img_xpath_list
                              ) in self.__url_pattern_xpath_dict.items():
                if not url_pattern.match(url):
                    continue
                log.msg("Shopping url pattern is found", level=log.DEBUG,
                        spider=SmzdmSpider)
                title_sel_list = sel.select(title_xpath)
                if len(title_sel_list):
                    title = self.normalize_text(title_sel_list[0].extract())
                else:
                    log.msg("Shopping page error:\ttitle is not found",
                            level=log.ERROR, spider=SmzdmSpider)
                    # Was `raise StopIteration` + unreachable `continue`;
                    # `return` ends the generator without tripping PEP 479.
                    return
                price_sel_list = sel.select(price_xpath)
                if len(price_sel_list):
                    price_text = price_redudant_pattern.sub(
                        '', price_sel_list[0].extract())
                    try:
                        price = float(price_text)
                        # kiddies24.de lists prices in cents.
                        if url.startswith("http://www.kiddies24.de"):
                            price /= 100
                    except (ValueError, TypeError):
                        log.msg("Shopping page error:\tThis item is sold out, the price is %s" % (price),
                                level=log.WARNING, spider=SmzdmSpider)
                else:
                    log.msg("Shopping page error:\tprice is not found",
                            level=log.WARNING, spider=SmzdmSpider)
                title_img_sel_list = []
                for title_img_xpath in title_img_xpath_list:
                    title_img_sel_list += sel.select(title_img_xpath)
                title_img_src = ""
                for title_img_sel in title_img_sel_list:
                    title_img_src = title_img_sel.extract()
                    if title_img_src:
                        img_src_list.append(title_img_src)
                        break
            log.msg("Shopping item: [%s] [%s] [%s] [%s] [%s]" %
                    (title, description, price, url, referer),
                    level=log.DEBUG, spider=SmzdmSpider)
            # NOTE(review): `currency`/`title_img_src` are unbound when no
            # pattern matched; the NameError falls through to the handler
            # below — confirm intended.
            yield items.ShoppingItem(title=title, price=price, url=url,
                                     referer=referer,
                                     image_urls=img_src_list,
                                     title_image_url=title_img_src,
                                     description=description,
                                     currency=currency)
    except Exception:
        log.msg("Shopping item page parse failed:\t[%s]" % (response.url),
                level=log.ERROR, spider=SmzdmSpider)
        return
def parse_shopping_item_page(self, response):
    """Parse an Amazon-China merchant page, emit a ShoppingItem, order it.

    Orders only when the current price does not exceed the target price
    (from response.meta) by more than 5%.
    """
    try:
        sel = WebdriverXPathSelector(response)
        referer = response.meta["referer"]
        target_price = response.meta["target_price"]
        jd_jump_url_sel = sel.select(
            "/html/body/div[5]/div/div/div[1]/div[2]/div[3]/a/@href")
        if jd_jump_url_sel:
            log.msg("JD jump url:\t[%s]" % (jd_jump_url_sel[0].extract()),
                    level=log.DEBUG, spider=SmzdmSpider)
            yield WebdriverRequest(jd_jump_url_sel[0].extract(),
                                   meta={'referer': referer},
                                   callback=self.parse_shopping_item_page)
        else:
            img_src_list = []
            comment_list = []
            description = ""
            title = ""
            vote_count = ""
            vote_score = ""
            price = -1.0
            log.msg("Shopping url: %s" % (response.url),
                    level=log.DEBUG, spider=SmzdmSpider)
            log.msg("Real shopping url: %s" % (response.webdriver.current_url),
                    level=log.DEBUG, spider=SmzdmSpider)
            url = response.webdriver.current_url
            hostname = urlparse(url).hostname
            # Ordering automation is restricted to Amazon China.
            if hostname != "www.amazon.cn":
                log.msg("Shopping robot does not support this site",
                        level=log.INFO, spider=SmzdmSpider)
                return
            for url_pattern, (title_xpath, price_xpath,
                              price_redudant_pattern, description_xpath,
                              description_img_xpath, currency,
                              title_img_xpath_list, comment_xpath,
                              vote_count_xpath, vote_score_xpath
                              ) in self.__url_pattern_xpath_dict.items():
                if not url_pattern.match(url):
                    continue
                log.msg("Shopping url pattern is found", level=log.DEBUG,
                        spider=SmzdmSpider)
                title_sel_list = sel.select(title_xpath)
                if len(title_sel_list):
                    title = self.normalize_text(title_sel_list[0].extract())
                else:
                    log.msg("Shopping page error:\ttitle is not found",
                            level=log.ERROR, spider=SmzdmSpider)
                    # Replaces `raise StopIteration` (PEP 479 hazard,
                    # caught by the bare except) and its dead `continue`.
                    return
                price_sel_list = sel.select(price_xpath)
                if len(price_sel_list):
                    price_text = price_redudant_pattern.sub(
                        '', price_sel_list[0].extract())
                    try:
                        price = float(price_text)
                        # kiddies24.de lists prices in cents.
                        if url.startswith("http://www.kiddies24.de"):
                            price /= 100
                        # Guard target_price == 0 (previously a
                        # ZeroDivisionError mis-logged as "sold out").
                        if (target_price and
                                (price - target_price) / target_price > 0.05):
                            log.msg("Price is not ideal. (current price: %f, target price: %f)"
                                    % (price, target_price),
                                    level=log.INFO, spider=SmzdmSpider)
                            return
                    except (ValueError, TypeError):
                        traceback.print_exc()
                        log.msg("Shopping page error:\tThis item is sold out, the price is %s" % (price),
                                level=log.WARNING, spider=SmzdmSpider)
                else:
                    log.msg("Shopping page error:\tprice is not found",
                            level=log.WARNING, spider=SmzdmSpider)
                title_img_sel_list = []
                for title_img_xpath in title_img_xpath_list:
                    title_img_sel_list += sel.select(title_img_xpath)
                title_img_src = ""
                for title_img_sel in title_img_sel_list:
                    title_img_src = title_img_sel.extract()
                    if title_img_src:
                        img_src_list.append(title_img_src)
                        break
                # NOTE(review): dead branch — unreachable after the
                # www.amazon.cn guard; confirm whether JD support should
                # be restored.
                if hostname == "item.jd.com":
                    sel.webdriver.find_element_by_xpath(
                        "//li[@id='detail-tab-comm']/a").click()
                    time.sleep(2)
                for comment_sel in sel.select(comment_xpath):
                    comment_list.append(comment_sel.extract())
                vote_count_sel_list = sel.select(vote_count_xpath)
                if len(vote_count_sel_list):
                    vote_count = vote_count_sel_list[0].extract()
                else:
                    log.msg("Shopping page error:\tvote count is not found",
                            level=log.ERROR, spider=SmzdmSpider)
                vote_score_sel_list = sel.select(vote_score_xpath)
                if len(vote_score_sel_list):
                    vote_score = vote_score_sel_list[0].extract()
                else:
                    log.msg("Shopping page error:\tvote score is not found",
                            level=log.ERROR, spider=SmzdmSpider)
            log.msg("Shopping item: [%s] [%s] [%s] [%s] [%s]" %
                    (title, description, price, url, referer),
                    level=log.DEBUG, spider=SmzdmSpider)
            yield items.ShoppingItem(title=title, price=price, url=url,
                                     referer=referer,
                                     image_urls=img_src_list,
                                     title_image_url=title_img_src,
                                     description=description,
                                     currency=currency,
                                     comment_list=comment_list,
                                     vote_count=vote_count,
                                     vote_score=vote_score)
            log.msg("Place the order!", level=log.INFO, spider=SmzdmSpider)
            one_click_button_list = sel.select('//*[@id="one-click-button"]')
            if not one_click_button_list:
                log.msg("Need to enable one click order!", level=log.DEBUG,
                        spider=SmzdmSpider)
                enable_one_click_url_sel = \
                    response.webdriver.find_elements_by_xpath(
                        '//*[@id="oneClickSignIn"]')
                if enable_one_click_url_sel:
                    log.msg("Enable one click order", level=log.DEBUG,
                            spider=SmzdmSpider)
                    yield WebdriverActionRequest(
                        response,
                        actions=ActionChains(response.webdriver).click(
                            enable_one_click_url_sel[0]),
                        meta={'referer': referer},
                        callback=self.parse_shopping_item_page)
            else:
                log.msg("One click order!", level=log.INFO,
                        spider=SmzdmSpider)
                one_click_button_list[0].click()
    except Exception:
        traceback.print_exc()
        log.msg("Shopping item page parse failed:\t[%s]" % (response.url),
                level=log.ERROR, spider=SmzdmSpider)
        return