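# NOTE: the base classes AGraber and AEbayGraber are defined elsewhere in this
# repo. Judging only from how the subclasses below call them, they appear to
# provide (assumed, not verified here):
#   get_driver(name)                 - a Selenium WebDriver ("chrome" is requested below)
#   get_htmlcontent(url) /
#   get_request_responser(url)       - a requests-style response for the given url
#   submit_initial_url(driver, input_xpath, submit_id, keyword)
#                                    - types the keyword into the search box, submits, returns the driver
#   handle_one_page(driver)          - overridden by each site-specific subclass with the crawl logic
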
class GrabEbuyerPhone(AGraber):
    def __init__(self, entrance_url, product_type, storage_table):
        AGraber.__init__(self)
        self.db = DBMysql()
        self.entrance_url = entrance_url
        self.product_type = product_type
        # self.searcher_xpath_value = searcher_xpath_value
        # self.searcher_submit_button = searcher_submit_button
        self.storage_table = storage_table

    def handle_one_page(self, driver):
        """Override the parent-class method with the site-specific crawling logic."""
        url = self.entrance_url
        keyword = self.product_type
        driver.get(url)
        time.sleep(10)
        print "Initial Page:", url

        # driver = self.submit_initial_url(driver, "//input[@type='text']", "gh-btn", keyword)  # ebay
        driver = self.submit_initial_url(driver, "//*[@id='search-box']",
                                         "search-button", keyword)

        i = 0
        while i < 500:
            # Fetch the HTML document of the current page
            response_html = self.get_htmlcontent(driver.current_url)
            try:
                if response_html.status_code != 200:
                    print "Got a response, but status_code is not 200:", response_html.status_code
                    break
            except:
                print "Exception while reading response_html.status_code"
                break

            # Parse only the content section of the document to speed up parsing
            html_part_id_value = "main-content"
            only_content_tags = SoupStrainer("section", id=html_part_id_value)
            html_part_content = BeautifulSoup(
                response_html.text, "html.parser",
                parse_only=only_content_tags).prettify()

            # Extract all the links we need
            soup = BeautifulSoup(html_part_content, "html.parser",
                                 from_encoding="utf-8")
            links = soup.find_all('a', class_="view-product",
                                  href=re.compile(self.product_type, re.I))
            for link in links:
                # This site uses relative hrefs, so rebuild the absolute URL
                url_parse = urlparse(url)
                domain = url_parse.scheme + "://" + url_parse.netloc
                new_url = domain + link['href']
                self.handle_result_url(new_url, keyword, i)
            time.sleep(10)
            i = i + 1

            # current_page = "a.pg curr"
            # print "The ", driver.find_element_by_css_selector(current_page).text, " Has Finished"
            try:
                # driver.find_element_by_css_selector("a.gspr.next").click()  # ebay
                driver.find_element_by_xpath(
                    "//*[@id='main-content']/div/div[1]/div[2]/div[1]/ul/li[6]/a"
                ).click()
            except:
                break
            print driver.current_url
            time.sleep(20)
        driver.quit()
        self.db.close()

    def handle_result_url(self, item_url, keyword, i):
        """Handle one result URL; keyword is the product type passed in."""
        print "Handle", i, "th URL:", item_url
        response_html = self.get_htmlcontent(item_url)
        if response_html.status_code == 200:
            try:
                item_domain = urlparse(item_url).scheme + "://www." + urlparse(item_url).netloc
                item_content = response_html.text.replace('\"', ' ')
                html_content = BeautifulSoup(response_html.text, "html.parser")
                item_title = html_content.title.string.replace('\"', ' ')
                new_record = {
                    "id": str(uuid.uuid4()),
                    "domain_name": item_domain,
                    "keyword": keyword,
                    "url": item_url,
                    "title": item_title,
                    "doc": self.db.escape_string(unicode(item_content))
                }
                if "| eBay" not in item_title:
                    self.db.insertOneData(self.storage_table, new_record)
            except:
                pass
        else:
            print "Handle", i, "th URL:", item_url, "Failed!"

class GrabSearsPhone(AGraber):
    def __init__(self, entrance_url, product_type, storage_table):
        AGraber.__init__(self)
        self.db = DBMysql()
        self.entrance_url = entrance_url
        self.product_type = product_type
        # self.searcher_xpath_value = searcher_xpath_value
        # self.searcher_submit_button = searcher_submit_button
        self.storage_table = storage_table

    def run(self):
        # Get a Selenium browser driver (Chrome here; a PhantomJS headless
        # browser was used originally).
        driver = self.get_driver("chrome")
        self.handle_one_page(driver)

    def handle_one_page(self, driver):
        """Override the parent-class method with the site-specific crawling logic."""
        url = self.entrance_url
        keyword = self.product_type
        driver.get(url)
        time.sleep(10)
        print "Initial Page:", url

        # driver = self.submit_initial_url(driver, "//input[@type='text']", "gh-btn", keyword)  # ebay
        # driver = self.submit_initial_url(driver, "//input[@type='search']", "search-button", keyword)

        # Type the keyword into the search box
        inputBoxXpath = "//*[@id='keyword']"
        driver.find_element_by_xpath(inputBoxXpath).clear()
        driver.find_element_by_xpath(inputBoxXpath).send_keys(keyword)

        # Locate the submit button and click it
        # elem = driver.find_element_by_id("search-button")  # this style is used for ebay / ebuyer
        elem = driver.find_element_by_id("goBtn")
        elem.click()
        time.sleep(10)  # Pause briefly so the page has time to navigate
        print "Crawler home page:", driver.current_url

        i = 0
        while i < 21:
            # Fetch the HTML document of the current page
            response_html = self.get_htmlcontent(driver.current_url)
            try:
                if response_html.status_code != 200:
                    print "Got a response, but status_code is not 200:", response_html.status_code
                    break
            except:
                print "Exception while reading response_html.status_code"
                break

            # Collect candidate links from the whole page
            soup = BeautifulSoup(response_html.text, "html.parser",
                                 from_encoding="utf-8")
            # links = soup.find_all('a', href=re.compile(r"phone", re.I))
            links = soup.find_all('a')
            for link in links:
                url_parse = urlparse(url)
                domain = url_parse.scheme + "://" + url_parse.netloc
                new_url = domain + link['href']
                self.handle_result_url(new_url, keyword, i)
                print "Fetched a product url:", new_url
            time.sleep(10)
            i = i + 1

            try:
                # Follow the next-page link
                driver.find_element_by_xpath(
                    "//*[@id='pagination']/div[1]/div[2]/a/span").click()
            except:
                break
            print driver.current_url
            time.sleep(20)
        driver.quit()
        self.db.close()

    def handle_result_url(self, item_url, keyword, i):
        """Handle one result URL; keyword is the product type passed in."""
        print "Handle page", i, "URL:", item_url
        response_html = self.get_htmlcontent(item_url)
        if response_html.status_code == 200:
            try:
                item_domain = urlparse(item_url).scheme + "://www." + urlparse(item_url).netloc
                item_content = response_html.text.replace('\"', ' ')
                html_content = BeautifulSoup(response_html.text, "html.parser")
                item_title = html_content.title.string.replace('\"', ' ')
                new_record = {
                    "id": str(uuid.uuid4()),
                    "domain_name": item_domain,
                    "keyword": keyword,
                    "url": item_url,
                    "title": item_title,
                    "doc": self.db.escape_string(unicode(item_content))
                }
                if "| eBay" not in item_title:
                    self.db.insertOneData(self.storage_table, new_record)
            except:
                pass
        else:
            print "Handle page", i, "URL:", item_url, "Failed!"

class EbayGraber(AEbayGraber):
    def __init__(self, entrance_url, product_type, storage_table):
        AEbayGraber.__init__(self)
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s %(module)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            datefmt='%Y %b %d %H:%M:%S',
            filename='./app.Log',
            filemode='w')
        self.db = DBMysql()
        self.entrance_url = entrance_url
        self.product_type = product_type
        self.storage_table = storage_table

    def get_soup_responser(self, html_text_content, html_parser="html.parser",
                           from_encoding="utf-8"):
        """Build and return a soup object from the page content."""
        return BeautifulSoup(html_text_content, html_parser,
                             from_encoding=from_encoding)

    def get_strainer_soup_responser(self, html_text_content, strainer_symbol,
                                    html_parser="html.parser",
                                    from_encoding="utf-8"):
        """Build and return a soup object from the part of the page selected by a SoupStrainer."""
        # TODO: let the caller choose how the SoupStrainer is constructed
        # (enum / parameter / switch) instead of hard-coding an id filter.
        only_content_tags = SoupStrainer(id=strainer_symbol)
        only_content = BeautifulSoup(html_text_content, html_parser,
                                     parse_only=only_content_tags).prettify()
        return BeautifulSoup(only_content, html_parser,
                             from_encoding=from_encoding)

    def web_page_paser(self, driver):
        """Parse every valid result page."""
        keyword = self.product_type
        driver.get(self.entrance_url)
        time.sleep(10)
        print "Initial Page:", self.entrance_url
        driver = self.submit_initial_url(driver, "//input[@type='text']",
                                         "gh-btn", keyword)
        i = 0
        while i < 500:
            request_responser = self.get_request_responser(driver.current_url)
            if request_responser is None:
                break
            soup = self.get_strainer_soup_responser(request_responser.text,
                                                    "CenterPanel")
            links = soup.find_all('a', href=re.compile(keyword))
            print links
            for link in links:
                new_url = link['href']
                self.handle_one_url(new_url, keyword, i)
            time.sleep(10)
            i = i + 1
            # current_page = "a.pg curr"
            # print "The ", driver.find_element_by_css_selector(current_page).text, " Has Finished"
            try:
                driver.find_element_by_css_selector("a.gspr.next").click()
                print "Getting a new page, url=%s" % driver.current_url
                time.sleep(20)
            except:
                print "Error: getting the next page failed", request_responser.status_code
                break
        driver.quit()
        self.db.close()

    def handle_one_url(self, url, keyword, i):
        """Handle one result URL; keyword is the product type passed in."""
        logging.info("Handle the " + str(i) + "th URL " + url)
        response_html = self.get_request_responser(url)
        try:
            if response_html.status_code != 200:
                pass
        except:
            print "LOG:Exception:EbayGraber %s handle_one_url could not read response_html.status_code! url=%s" % (self.product_type, url)
        if response_html.text is not None:
            new_record = self.get_db_product_db_object(url, response_html.text)
            if new_record is not None:
                self.db.insertOneData(self.storage_table, new_record)
        else:
            print "LOG:Warning:EbayGraber %s handle_one_url response_html.text is None! url=%s" % (self.product_type, url)

    def get_db_product_db_object(self, url, response_html_text):
        """Parse the extracted product page, check whether it is a page we need,
        and return a dict that can be inserted into the database."""
        soup = BeautifulSoup(response_html_text, "html.parser")
        try:
            item_title = ""
            if soup.title is not None:
                if self.product_type not in soup.title.string or "camera | eBay" in soup.title.string:
                    logging.warn("This is not a " + self.product_type +
                                 " product url! url=" + url)
                    return None
                item_title = soup.title.string.replace('\"', ' ')
            item_content = ""
            if response_html_text is not None:
                item_content = response_html_text.replace('\"', ' ')
            new_record = {
                "id": str(uuid.uuid4()),
                "domain_name": urlparse(url).scheme + "://www." + urlparse(url).netloc,
                "keyword": self.product_type,
                "url": url,
                "title": item_title,
                "doc": self.db.escape_string(unicode(item_content))
            }
            return new_record
        except:
            pass

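
# A small standalone sketch (not part of the original module) illustrating what
# get_strainer_soup_responser above does: a SoupStrainer keyed on an element id
# makes BeautifulSoup parse only that element, which keeps the soup small and the
# search fast. The HTML snippet and the "CenterPanel" id are made-up examples.
def _strainer_demo():
    from bs4 import BeautifulSoup, SoupStrainer  # same imports the module already uses

    sample_html = (
        '<html><body>'
        '<div id="Header"><a href="/ignore">navigation link</a></div>'
        '<div id="CenterPanel"><a href="/item/1">phone item</a></div>'
        '</body></html>')
    only_center = SoupStrainer(id="CenterPanel")
    center_soup = BeautifulSoup(sample_html, "html.parser",
                                parse_only=only_center)
    # Only the link inside CenterPanel is found; the header link is skipped.
    print center_soup.find_all('a')
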
class GrabOverStockPhone(AGraber):
    def __init__(self, entrance_url, product_type, storage_table):
        AGraber.__init__(self)
        self.db = DBMysql()
        self.entrance_url = entrance_url
        self.product_type = product_type
        # self.searcher_xpath_value = searcher_xpath_value
        # self.searcher_submit_button = searcher_submit_button
        self.storage_table = storage_table

    def handle_one_page(self, driver):
        """Override the parent-class method with the site-specific crawling logic."""
        url = self.entrance_url
        keyword = self.product_type
        driver.get(url)
        time.sleep(120)
        try:
            # Close the pop-up window if one appears
            driver.find_element_by_id("cboxClose").click()
            print "Closed the pop-up window"
        except:
            pass
        time.sleep(60)
        print "Initial Page:", url

        # driver = self.submit_initial_url(driver, "//input[@type='text']", "gh-btn", keyword)  # ebay
        # driver = self.submit_initial_url(driver, "//input[@type='search']", "search-button", keyword)
        driver.find_element_by_xpath("//*[@id='search-input']").clear()
        driver.find_element_by_xpath("//*[@id='search-input']").send_keys(keyword)

        # Locate the submit button and click it
        # elem = driver.find_element_by_id(submit_key)  # this style is used for ebay / ebuyer
        elem = driver.find_element_by_xpath(
            '//*[@id="search-form"]/fieldset[2]/label/i')
        elem.click()
        time.sleep(60)

        # A variant of paging: this site loads results on scroll, so repeatedly
        # drag the scrollbar toward the bottom of the page.
        i = 0
        while i < 200:
            js = "var q=document.body.scrollTop=" + str(10000 * i)
            print i, js
            driver.execute_script(js)
            time.sleep(3)
            i = i + 1

        # Fetch the HTML document of the fully scrolled page
        print "Crawler home page:", driver.current_url
        response_html = self.get_htmlcontent(driver.current_url)
        try:
            if response_html.status_code != 200:
                print "Got a response, but status_code is not 200:", response_html.status_code
        except:
            print "Exception while reading response_html.status_code"

        # Parse only the content section of the document to speed up parsing
        html_part_id_value = "result-products"
        only_content_tags = SoupStrainer(id=html_part_id_value)
        html_part_content = BeautifulSoup(
            response_html.text, "html.parser",
            parse_only=only_content_tags).prettify()

        # Extract all the links we need
        soup = BeautifulSoup(html_part_content, "html.parser",
                             from_encoding="utf-8")
        # links = soup.find_all('a', class_='jsQs', href=re.compile(self.product_type, re.I))

        # Map the keyword to the token that actually appears in this site's product
        # URLs; note that the token is not always equal to the keyword itself.
        # (A dict-based version of this mapping is sketched after this class.)
        if keyword == "cellphone":
            url_judger = "phone"
        elif keyword == "tv":
            url_judger = "tv"
        elif keyword == "digit camera":
            url_judger = "camera"
        elif keyword == "bike":
            url_judger = "bike"
        elif keyword == "labtop":
            url_judger = "labtop"
        elif keyword == "mice":
            url_judger = "mice"
        elif keyword == "Webcams":
            url_judger = "Webcams"
        elif keyword == "shaver":
            url_judger = "shaver"
        elif keyword == "flashlight":
            url_judger = "flashlight"
        elif keyword == "watch":
            url_judger = "watch"
        links = soup.find_all('a', href=re.compile(url_judger, re.I))

        print len(links)
        for link in links:
            new_url = link['href']
            self.handle_result_url(new_url, keyword, i)
        time.sleep(10)

        # Paging via a "next page" link is not used for this site (results load
        # on scroll), so the next-page click used by the other grabers is omitted.
        time.sleep(20)
        driver.quit()
        self.db.close()

    def handle_result_url(self, item_url, keyword, i):
        """Handle one result URL; keyword is the product type passed in."""
        print "Handle page", i, "URL:", item_url
        response_html = self.get_htmlcontent(item_url)
        if response_html.status_code == 200:
            try:
                item_domain = urlparse(item_url).scheme + "://www." + urlparse(item_url).netloc
                item_content = response_html.text.replace('\"', ' ')
                html_content = BeautifulSoup(response_html.text, "html.parser")
                item_title = html_content.title.string.replace('\"', ' ')
                new_record = {
                    "id": str(uuid.uuid4()),
                    "domain_name": item_domain,
                    "keyword": keyword,
                    "url": item_url,
                    "title": item_title,
                    "doc": self.db.escape_string(unicode(item_content))
                }
                if "| eBay" not in item_title:
                    self.db.insertOneData(self.storage_table, new_record)
            except:
                pass
        else:
            print "Handle page", i, "URL:", item_url, "Failed!"

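
# The keyword -> url_judger branching in GrabOverStockPhone.handle_one_page could
# also be written as a dict lookup. This helper is a sketch only, not part of the
# original code, and is not wired into the class; the mapping itself is copied
# verbatim from the if/elif chain above.
KEYWORD_URL_JUDGER = {
    "cellphone": "phone",
    "tv": "tv",
    "digit camera": "camera",
    "bike": "bike",
    "labtop": "labtop",
    "mice": "mice",
    "Webcams": "Webcams",
    "shaver": "shaver",
    "flashlight": "flashlight",
    "watch": "watch",
}


def get_url_judger(keyword):
    """Return the token expected in the site's product URLs for the given keyword."""
    # Fall back to the keyword itself when it has no special mapping.
    return KEYWORD_URL_JUDGER.get(keyword, keyword)
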
class GrabCdiscountPhone(AGraber):
    def __init__(self, entrance_url, product_type, storage_table):
        AGraber.__init__(self)
        self.db = DBMysql()
        self.entrance_url = entrance_url
        self.product_type = product_type
        # self.searcher_xpath_value = searcher_xpath_value
        # self.searcher_submit_button = searcher_submit_button
        self.storage_table = storage_table

    def handle_one_page(self, driver):
        """Override the parent-class method with the site-specific crawling logic."""
        url = self.entrance_url
        keyword = self.product_type
        driver.get(url)
        time.sleep(10)
        print "Initial Page:", url

        # driver = self.submit_initial_url(driver, "//input[@type='text']", "gh-btn", keyword)  # ebay
        # driver = self.submit_initial_url(driver, "//input[@type='search']", "search-button", keyword)
        driver.find_element_by_xpath("//input[@type='search']").clear()
        driver.find_element_by_xpath("//input[@type='search']").send_keys(keyword)

        # Locate the submit button and click it
        elem = driver.find_element_by_xpath(
            '//*[@id="hFull"]/div[2]/div[1]/button')
        elem.click()
        time.sleep(20)  # Pause briefly so the page has time to navigate
        print "Crawler home page:", driver.current_url

        i = 0
        while i < 500:
            # Fetch the HTML document of the current page
            response_html = self.get_htmlcontent(driver.current_url)
            try:
                if response_html.status_code != 200:
                    print "Got a response, but status_code is not 200:", response_html.status_code
                    break
            except:
                print "Exception while reading response_html.status_code"
                break

            # Parse only the content section of the document to speed up parsing
            html_part_id_value = "lpBloc"
            only_content_tags = SoupStrainer(id=html_part_id_value)
            html_part_content = BeautifulSoup(
                response_html.text, "html.parser",
                parse_only=only_content_tags).prettify()

            # Extract all the links we need
            soup = BeautifulSoup(html_part_content, "html.parser",
                                 from_encoding="utf-8")
            # links = soup.find_all('a', class_='jsQs', href=re.compile(self.product_type, re.I))
            links = soup.find_all('a', class_='jsQs')
            for link in links:
                new_url = link['href']
                self.handle_result_url(new_url, keyword, i)
            time.sleep(10)
            i = i + 1

            try:
                # Follow the next-page link
                nextPage = "a.jsNxtPage.pgNext"
                driver.find_element_by_css_selector(nextPage).click()
                print driver.current_url
                time.sleep(20)
            except:
                print "Exception: getting the next page failed"
                break
        driver.quit()
        self.db.close()

    def handle_result_url(self, item_url, keyword, i):
        """Handle one result URL; keyword is the product type passed in."""
        print "Handle page", i, "URL:", item_url
        response_html = self.get_htmlcontent(item_url)
        try:
            if response_html.status_code != 200:
                print "Handle page", i, "URL:", item_url, "Failed! status_code =", response_html.status_code
                return
        except:
            print "Handle page", i, "URL:", item_url, "Failed! Could not read response_html.status_code"
            return
        try:
            item_domain = urlparse(item_url).scheme + "://www." + urlparse(item_url).netloc
            item_content = response_html.text.replace('\"', ' ')
            html_content = BeautifulSoup(response_html.text, "html.parser")
            item_title = html_content.title.string.replace('\"', ' ')
            new_record = {
                "id": str(uuid.uuid4()),
                "domain_name": item_domain,
                "keyword": keyword,
                "url": item_url,
                "title": item_title,
                "doc": self.db.escape_string(unicode(item_content))
            }
            if "| eBay" not in item_title:
                self.db.insertOneData(self.storage_table, new_record)
        except:
            pass

def GetNewKeyWord():
    """Read product keywords, one per line, from Bikes.txt."""
    productArray = []
    with open("Bikes.txt") as keyword_file:
        for line in keyword_file:
            line = line.strip()
            if not line:
                continue
            productArray.append(line)
    return productArray


if __name__ == '__main__':
    db = DBMysql()
    # sql = 'select * from keyword_info'
    # results = db.query(sql, "all")
    # for r in results:
    #     item = keyword_Info_item()
    #     item.keyword = r[0]
    #     item.name = r[1]
    #     keyword_info_lists.append(item)
    # for i in keyword_info_lists:
    #     print i.keyword, i.name

    # productArray = GetNewKeyWord()
    # productArrayLen = len(productArray)
    # for i in range(productArrayLen):
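
    # A usage sketch, kept commented out like the rest of this block (it is an
    # assumption, not original code): each graber is constructed with
    # (entrance_url, product_type, storage_table), and GrabSearsPhone.run()
    # suggests run() is the intended entry point inherited from AGraber.
    # The URL and table name below are placeholders only.
    # graber = GrabSearsPhone("http://www.sears.com", "cellphone", "product_info")
    # graber.run()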