from math import floor

# Project-level dependencies (Service_Functions, Model_Scraper_Standard, the
# Model_Static_* status classes, and the Sina/Tencent scraper and processor
# classes) are assumed to be provided by the surrounding package.


class Model_Scraper_Keywords_First(Model_Scraper_Standard):

    def __init__(self, region):
        super(Model_Scraper_Keywords_First, self).__init__(region)
        self.region = region
        self.processor = Service_Functions().getProcessor('Keywords_First', region)

    def scraper(self, keywords):
        self.process = Model_Scraper_Standard(self.region)
        url = "https://www.amazon." + self.region + "/gp/search?keywords=" + keywords + "&page=1"
        print(url)
        content = None  # stays None if the download raises
        try:
            content = self.process.processkeywords(url)
        except Exception as err:
            print(err)
        try:
            if content:
                # Parse the fetched page.
                result = self.processor.process(content)
                if result:
                    return result
            elif content is None:
                return None
            else:
                return False
        except Exception:
            return False

class Model_Scraper_Product_Base(Model_Scraper_Standard):

    def __init__(self, region):
        super(Model_Scraper_Product_Base, self).__init__(region)
        self.region = region
        self.processor = Service_Functions().getProcessor('Product_Base', region)

    def scrape(self, asin):
        self.process = Model_Scraper_Standard(self.region)
        url = "https://www.amazon." + self.region + "/dp/" + asin + "?th=1&psc=1"
        print(url)
        content = None  # stays None if the download raises
        try:
            content = self.process.process(url)
        except Exception as err:
            print(err)
        try:
            if content:
                # Parse the product page.
                data = self.processor.process(content.encode('utf-8'))
                if data:
                    return data
                return False
            elif content is None:
                return None
            else:
                return False
        except Exception:
            return False

class Model_Scraper_TopReviewer(Model_Scraper_Standard):

    def __init__(self, region):
        self.region = region
        self.processor = Service_Functions().getProcessor('TopReviewer', region)

    def scrape(self, begin, end):
        # Both bounds must be digits and begin must not exceed end.
        if not str(begin).isdigit() or not str(end).isdigit() or int(begin) > int(end):
            return Model_Static_Scrape_Status.FAILED
        begin, end = int(begin), int(end)
        self.process = Model_Scraper_Standard(self.region)
        data = []
        for i in range(begin, end):
            pageUrl = ("https://www.amazon." + self.region +
                       "/review/top-reviewers?page=" + str(i))
            pageContent = self.process.processTopReviewer(pageUrl)
            if not pageContent:
                continue
            # Page i covers reviewer ranks (i - 1) * 10 + 1 through i * 10.
            rankEnd = i * 10
            rankBegin = rankEnd - 9
            pageResult = self.processor.process(pageContent, rankBegin, rankEnd + 1)
            if pageResult:
                # Collect this page's results.
                data.append(pageResult)
        if len(data):
            return data

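# For reference, the rank-window arithmetic used above: with 10 reviewers per
# page, page i covers ranks (i - 1) * 10 + 1 through i * 10. The helper below
# is a hypothetical restatement added for illustration; the scraper itself
# inlines this arithmetic.
def _rank_window(page):
    """Return the inclusive (begin, end) reviewer-rank range for a page."""
    rank_end = page * 10
    rank_begin = rank_end - 9
    return rank_begin, rank_end

# _rank_window(1) == (1, 10); _rank_window(3) == (21, 30)
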
class Model_Scraper_Product_Offer1(Model_Scraper_Standard):

    def __init__(self, region):
        self.region = region
        self.processor = Service_Functions().getProcessor('Product_Offer', region)

    def process(self, asin):
        self.processOffer = Model_Scraper_Standard(self.region)
        content = self.processOffer.processOffer(self.region, asin)
        if content:
            return content

    # def scrapeInventory(self, data):
    #     if (data == '' or data == None):
    #         return Model_Static_Scrape_Status.FAILED
    #     url = "http://www.amazon." + self.region + "/gp/aws/cart/add.html"
    #     fields = []
    #     session_id = None

    def scrape(self, asin):
        content = self.process(asin)
        if content:
            # Parse the first offer page; the parsed data then drives the
            # inventory scrape.
            print(content)
            data = self.processor.process(content.encode('utf-8'))
            if data:
                print(data)
                # Compute inventory from the parsed data.
                # Inventory = self.scrapeInventory(asin, data)
                # print (Inventory)
            pageCount = int(self.processor.getPageCount(content))
            if pageCount > 1:
                for i in range(2, pageCount + 1):
                    # startIndex is the zero-based offset of the page's first
                    # offer (10 offers per page).
                    index = str((i - 1) * 10)
                    pageUrl = ("http://www.amazon." + self.region +
                               "/gp/offer-listing/" + asin +
                               "/ref=olpOffersSuppressed?ie=UTF8&f_new=true"
                               "&overridePriceSuppression=1&startIndex=" + index)
                    pageContent = self.processOffer.processPageOffer(pageUrl)
                    if pageContent:
                        print(pageContent)
                        pageResult = self.processor.process(pageContent.encode('utf-8'))
                        if pageResult:
                            print(pageResult)

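# Offer-listing paging (sketch): each offer page holds 10 offers, so page i
# starts at zero-based offset (i - 1) * 10 -- page 2 at startIndex=10, page 3
# at startIndex=20, and so on. `_offer_start_index` is a hypothetical helper
# restating that arithmetic; the scraper inlines it in the URL above.
def _offer_start_index(page):
    """Zero-based startIndex of the first offer on an offer-listing page."""
    return (page - 1) * 10
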
class Model_Scraper_Keywords(Model_Scraper_Standard):

    def __init__(self, region):
        super(Model_Scraper_Keywords, self).__init__(region)
        self.region = region
        self.processor = Service_Functions().getProcessor('Keywords', region)

    def scraper(self, keywords):
        self.process = Model_Scraper_Standard(self.region)
        url = "https://www.amazon." + self.region + "/gp/search?keywords=" + keywords + "&page=1"
        # Run the browser headless:
        # with Display(backend="xvfb", size=(1440, 900)):
        print(url)
        content = None  # stays None if the download raises
        try:
            content = self.process.processkeywords(url)
        except Exception as err:
            print(err)
        try:
            if content:
                # Parse the result pages.
                data = []
                result = self.processor.process(content.encode('utf-8'), 1)
                if result:
                    data.append(result)
                pagecount = int(self.processor.getPageCount(content))
                if pagecount > 5:
                    pagecount = 5  # cap at five result pages
                if pagecount > 1:
                    for i in range(2, pagecount + 1):
                        pageurl = ("https://www.amazon." + self.region +
                                   "/gp/search?keywords=" + keywords +
                                   "&page=" + str(i))
                        print(pageurl)
                        pagecontent = self.process.processkeywords(pageurl)
                        if pagecontent:
                            pageresult = self.processor.process(pagecontent.encode('utf-8'), i)
                            data.append(pageresult)
                return data
            elif content is None:
                return None
            else:
                return False
        except Exception:
            return False

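# Example usage (sketch): how a caller is assumed to drive the keyword
# scrapers above. The region and keyword strings are illustrative, and
# requeue()/mark_failed()/store() are placeholder handlers, not project APIs.
#
#   scraper = Model_Scraper_Keywords('com')
#   pages = scraper.scraper('bluetooth speaker')
#   if pages is None:       # download failed or raised
#       requeue()
#   elif pages is False:    # fetched, but parsing failed
#       mark_failed()
#   else:                   # list of per-page parse results (up to 5 pages)
#       store(pages)
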
class Model_Scraper_Sina_Auto:

    def __init__(self):
        self.processor = Service_Functions().getSinaProcessor('Sina_Base')

    def scrape(self, url):
        self.process = Model_Scraper_Sina_Standard()
        html = self.process.process_auto(url)
        if html:
            content = self.processor.process(html)
            return content

class Model_Scraper_Seller_Product(Model_Scraper_Standard):

    def __init__(self, region):
        self.region = region
        self.processor = Service_Functions().getProcessor('Seller_Product', region)

    def scrape(self, merchantId):
        if not merchantId:
            return False
        url = "https://www.amazon." + self.region + "/s?merchant=" + merchantId
        print(url)
        content = Model_Scraper_Standard(self.region).processSellerProduct(url)
        if content:
            result = self.processor.process(content)
            if result:
                data = [result]
                pagecount = int(self.processor.getPageCount(content))
                pagecount = 1  # test override: only scrape the first page
                if pagecount > 1:
                    if pagecount > 50:
                        pagecount = 50  # cap at 50 result pages
                    for i in range(2, pagecount + 1):
                        pageurl = ("https://www.amazon." + self.region +
                                   "/s?merchant=" + merchantId + "&page=" + str(i))
                        print(pageurl)
                        pageContent = Model_Scraper_Standard(self.region).processSellerProduct(pageurl)
                        if not pageContent:
                            continue
                        pageResult = self.processor.process(pageContent)
                        if pageResult:
                            data.append(pageResult)
                return data
            return Model_Static_DownloadQueue_Status().SCRAPED_NO_DATA
        return Model_Static_DownloadQueue_Status().FAILED

class Model_Scraper_Sina_Base:

    def __init__(self):
        self.processor = Service_Functions().getSinaProcessor('Sina_Base')

    def scrape(self, url):
        self.process = Model_Scraper_Sina_Standard()
        html = self.process.process(url)
        if html:
            content = self.processor.process(html)
            return content

    def process_content(self, html, url_id, url):
        scrape = Model_Processor_Sina_Content()
        content = scrape.process(html)
        content['url_id'] = url_id
        content['url'] = url
        return content

class Model_Scraper_Seller_Base(Model_Scraper_Standard):

    def __init__(self, region):
        self.region = region
        self.processor = Service_Functions().getProcessor('Seller_Base', region)

    def scrape(self, merchantId):
        if not merchantId:
            return False
        url = "http://www.amazon." + self.region + "/gp/aag/main?seller=" + merchantId
        content = Model_Scraper_Standard(self.region).processSeller(url)
        if content:
            data = self.processor.process(content)
            if data:
                return data
            return Model_Static_DownloadQueue_Status().SCRAPED_NO_DATA
        return Model_Static_DownloadQueue_Status().FAILED

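# Return-value convention (sketch): the seller scrapers signal outcomes via
# Model_Static_DownloadQueue_Status values rather than exceptions. A caller
# is assumed to branch roughly like this (the status class comes from the
# surrounding project; merchant_id is illustrative):
#
#   status = Model_Static_DownloadQueue_Status()
#   result = Model_Scraper_Seller_Base('com').scrape(merchant_id)
#   if result == status.FAILED:
#       pass  # page download failed; retry later
#   elif result == status.SCRAPED_NO_DATA:
#       pass  # page fetched but the processor found nothing
#   elif result:
#       pass  # parsed seller data
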
class Model_Scraper_Tencent_Sports:

    def __init__(self):
        self.processor = Service_Functions().getTencentProcessor('Tencent_Sports')

    def scrape(self, url):
        self.process = Model_Scraper_Tencent_Standard()
        html = self.process.process_gb2312(url)
        if html:
            content = self.processor.process(html)
            return content

    def process_content(self, html, url_id, url):
        scrape = Model_Processor_Tencent_Content()
        content = scrape.process(html)
        content['url_id'] = url_id
        content['url'] = url
        return content

class Model_Scraper_Product_Review(Model_Scraper_Standard):

    def __init__(self, region):
        self.region = region
        self.processor = Service_Functions().getProcessor('Product_Review', region)

    def scrape(self, asin, scrapedCount):
        self.process = Model_Scraper_Standard(self.region)
        url = ("https://www.amazon." + self.region + "/gp/product-reviews/" +
               asin + "?sortBy=recent&pageNumber=1")
        print(url)
        content = self.process.processReview(url)
        if content:
            data = {}
            items = []
            newScrapedCount = 10
            summary = self.processor.getSummary(content.encode('utf-8'))
            if summary:
                data['summary'] = summary
                # Parse the first page of reviews.
                result = self.processor.process(content.encode('utf-8'))
                if result:
                    items.append(result)
                if data['summary']['page_count'] and data['summary']['page_count'] > 0:
                    pageCount = data['summary']['page_count']
                    # Pages already covered by previously scraped reviews
                    # (10 reviews per page).
                    scrapedPageCount = int(floor(int(scrapedCount) / 10))
                    # Pages still left to fetch, capped at 20 per run.
                    pageCount = pageCount - scrapedPageCount
                    if pageCount > 20:
                        pageCount = 20
                    if pageCount >= 2:
                        newScrapedCount = pageCount * 10
                        for i in range(2, pageCount + 1):
                            pageUrl = ("https://www.amazon." + self.region +
                                       "/gp/product-reviews/" + asin +
                                       "?sortBy=recent&pageNumber=" + str(i))
                            print(pageUrl)
                            pageContent = self.process.processReview(pageUrl)
                            if not pageContent:
                                continue
                            # Parse this page of reviews.
                            pageResult = self.processor.process(pageContent.encode("utf-8"))
                            if pageResult:
                                items.append(pageResult)
                data['list'] = items
            if len(data) > 0:
                data['new_scraped_count'] = newScrapedCount
                return data
            else:
                return Model_Static_Scrape_Status.SUCCESS_NO_DATA
        elif content is None:
            return None
        else:
            return False

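# The review-paging arithmetic above, restated as a standalone sketch: with
# 10 reviews per page, `scrapedCount` previously scraped reviews cover
# scraped_count // 10 full pages, so only the remainder needs fetching,
# capped per run. `_remaining_review_pages` is a hypothetical helper added
# for illustration; Model_Scraper_Product_Review inlines this logic.
def _remaining_review_pages(total_pages, scraped_count, cap=20):
    """Pages still to fetch after skipping ones already covered."""
    scraped_pages = int(scraped_count) // 10
    return min(total_pages - scraped_pages, cap)

# _remaining_review_pages(37, 95) == 20 (capped)
# _remaining_review_pages(5, 30) == 2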