def run(self):
    """Run every configured extractor once.

    Iterates over ``extractor_source_url_config``, instantiates each
    entry's extractor class with its config dict, and extracts links.
    Any failure is logged and swallowed so one bad source does not
    abort the whole run.
    """
    try:
        LOGGER.debug("start the extractor")
        for elem in extractor_source_url_config:
            extractor = elem["extractor"](elem)
            extractor.extract_links()
    except Exception:
        # Deliberately best-effort: log the full traceback and return.
        LOGGER.error(traceback.format_exc())
class FailedExtractor(Daemon):
    """Daemon that re-publishes failed URLs.

    Pages through rows of the ``failed_url`` table whose retry ``count``
    is below a threshold and pushes each one back onto the news URL
    queue, re-attaching title/abstract from ``published_url``.
    """

    def __init__(self, pidfile, stdin=os.devnull, stdout=os.devnull,
                 stderr=os.devnull):
        """Forward daemon plumbing (pidfile and std streams) to the base."""
        super(FailedExtractor, self).__init__(pidfile, stdin, stdout, stderr)

    def run(self):
        """Scan failed URLs in pages and re-publish each one.

        Errors on a page are logged and the scan continues with the
        next iteration; a failed count query results in zero work.
        """
        self.mysql_client = MysqlClient()
        self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)
        self.threhold = 5   # max retry count before a URL is abandoned
        self.page = 14      # page size for the SELECT ... LIMIT scan
        LOGGER.info("start re extractor the failed url if count() < %s"
                    % (self.threhold, ))
        failed_count = 0
        try:
            row = self.mysql_client.getOne(
                "select count(*) as c from failed_url where count < %s",
                (self.threhold, ))
            # BUG FIX: the int conversion used to happen outside this
            # try block, so a failed query left an int 0 and crashed
            # on the subsequent ["c"] subscript.
            failed_count = int(row["c"])
        except Exception:
            LOGGER.error("failed to load the failed url count")
            LOGGER.error(traceback.format_exc())
        count = 0
        while count < failed_count:
            try:
                urls = self.mysql_client.getAll(
                    "select * from failed_url where count < %s limit %s, %s",
                    (self.threhold, count, self.page))
                # BUG FIX: also stop on an empty page ("urls == False"
                # missed []), which previously looped forever.
                if not urls:
                    break
                count += len(urls)
                for url in urls:
                    LOGGER.info("re extractor url: %s" % (url["url"], ))
                    msg = self.mysql_client.getOne(
                        "select abstract, title from published_url"
                        " where url = %s", (url["url"], ))
                    url["title"] = msg["title"]
                    url["abstract"] = msg["abstract"]
                    self.news_publisher.process(url)
            except Exception:
                LOGGER.error("re extractor urls error")
                LOGGER.error(traceback.format_exc())
def publishMsg(self, msg):
    """Publish *msg* to the queue and record it in ``published_url``.

    Commits on success; rolls back and logs on any failure so a bad
    message does not poison the database connection.
    """
    try:
        self.news_publisher.process(msg)
        self.mysql_client.insertOne(
            "insert into published_url(url, tag, sub_tag, version,"
            " create_time, title, abstract)"
            " values(%s, %s, %s, %s, %s, %s, %s)",
            (msg["url"], msg["tag"], msg["sub_tag"], msg["version"],
             msg["create_time"], msg["title"], msg.get("abstract", "")))
        self.mysql_client.end("commit")
    except Exception:
        self.mysql_client.end("rollback")
        LOGGER.error("published msg error: %s" % (msg["url"], ))
        LOGGER.error(traceback.format_exc())
def extract_links(self):
    """Scrape the fallsFlow article list and publish unseen links.

    Stops early once ``republishdThre`` already-published URLs are seen
    in a row (the remainder of the list is assumed stale).
    """
    driver = None
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        LOGGER.debug("start extractor from %s" % (self.url, ))
        driver.get(self.url)
        # Scroll to the bottom then back to the top so lazily loaded
        # items are rendered before we query the DOM.
        driver.execute_script("var q=document.documentElement.scrollTop=10000")
        driver.execute_script("var q=document.documentElement.scrollTop=0")
        seen_titles = []      # article titles handled in this run
        # BUG FIX: this assignment was commented out but the name was
        # still referenced below, raising NameError on the first
        # already-published URL.
        republishdThre = 5    # stop after this many duplicate hits
        republishedCount = 0
        link_list = driver.find_element_by_class_name(
            "fallsFlow").find_elements_by_css_selector(
                "li[class=\"item masonry-brick\"]")
        for elem in link_list:
            article = elem.find_element_by_tag_name("h3")
            title = article.text  # article title
            if title in seen_titles:
                continue
            LOGGER.debug("article title %s" % (title))
            url = article.find_element_by_tag_name("a").get_attribute("href")
            LOGGER.info("url:%s" % (url))
            if self.isPublished(url) is False:
                abstract = elem.find_element_by_tag_name("h5").text
                # publish the url msg to mq
                msg = self.formatMsg(url, self.tag, self.sub_tag, title,
                                     abstract)
                self.publishMsg(msg)
            else:
                republishedCount += 1
                if republishedCount >= republishdThre:
                    break
            seen_titles.append(title)
    except Exception:
        LOGGER.error(traceback.format_exc())
    finally:
        # BUG FIX: quit the driver so the PhantomJS process does not leak.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
def extract_links(self):
    """Scrape the ``page1`` rows and publish article links not yet seen.

    Stops early once ``republishdThre`` already-published URLs are hit.
    """
    driver = None
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        LOGGER.debug("start extractor from %s" % (self.url, ))
        driver.get(self.url)
        seen_titles = []      # article titles handled in this run
        # BUG FIX: this assignment was commented out but the name was
        # still referenced below, raising NameError on the first
        # already-published URL.
        republishdThre = 5    # stop after this many duplicate hits
        republishedCount = 0
        # Scroll down then back up so lazily loaded items render.
        driver.execute_script("var q=document.documentElement.scrollTop=8000")
        driver.implicitly_wait(0)
        driver.execute_script("var q=document.documentElement.scrollTop=0")
        driver.implicitly_wait(0)
        link_list = driver.find_element_by_class_name(
            "page1").find_elements_by_class_name("row")
        print(len(link_list))
        for elem in link_list:
            anchor = elem.find_element_by_class_name(
                "list-tt").find_element_by_tag_name("a")
            title = anchor.text  # article title
            if title in seen_titles:
                continue
            LOGGER.debug("article title %s" % (title))
            url = anchor.get_attribute("href")
            LOGGER.info("url:%s" % (url))
            if self.isPublished(url) is False:
                abstract = elem.find_element_by_class_name("f4").text
                # publish the url msg to mq
                msg = self.formatMsg(url, self.tag, self.sub_tag, title,
                                     abstract)
                self.publishMsg(msg)
            else:
                republishedCount += 1
                if republishedCount >= republishdThre:
                    break
            seen_titles.append(title)
    except Exception:
        LOGGER.error(traceback.format_exc())
    finally:
        # BUG FIX: quit the driver so the PhantomJS process does not leak.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
def extract_links(self):
    """Scrape the ``list``/``item`` entries and publish unseen links.

    Stops early once ``republishdThre`` already-published URLs are hit.
    """
    driver = None
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        LOGGER.debug("start extractor from %s" % (self.url, ))
        driver.get(self.url)
        seen_titles = []      # article titles handled in this run
        # BUG FIX: this assignment was commented out but the name was
        # still referenced below, raising NameError on the first
        # already-published URL.
        republishdThre = 5    # stop after this many duplicate hits
        republishedCount = 0
        # Scroll down then back up so lazily loaded items render.
        driver.execute_script("var q=document.documentElement.scrollTop=8000")
        driver.implicitly_wait(0)
        driver.execute_script("var q=document.documentElement.scrollTop=0")
        driver.implicitly_wait(0)
        link_list = driver.find_element_by_class_name(
            "list").find_elements_by_class_name("item")
        print(len(link_list))
        for elem in link_list:
            anchor = elem.find_element_by_tag_name(
                "h1").find_element_by_tag_name("a")
            title = anchor.text  # article title
            if title in seen_titles:
                continue
            LOGGER.debug("article title %s" % (title))
            url = anchor.get_attribute("href")
            LOGGER.info("url:%s" % (url))
            if self.isPublished(url) is False:
                abstract = elem.find_element_by_tag_name("p").text
                # publish the url msg to mq
                msg = self.formatMsg(url, self.tag, self.sub_tag, title,
                                     abstract)
                self.publishMsg(msg)
            else:
                republishedCount += 1
                if republishedCount >= republishdThre:
                    break
            seen_titles.append(title)
    except Exception:
        LOGGER.error(traceback.format_exc())
    finally:
        # BUG FIX: quit the driver so the PhantomJS process does not leak.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
def extract_links(self):
    """Scrape the fallsFlow article list and publish unseen links.

    Stops early once ``republishdThre`` already-published URLs are hit.
    """
    driver = None
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        LOGGER.debug("start extractor from %s" % (self.url, ))
        driver.get(self.url)
        # Scroll to the bottom then back to the top so lazily loaded
        # items are rendered before we query the DOM.
        driver.execute_script("var q=document.documentElement.scrollTop=10000")
        driver.execute_script("var q=document.documentElement.scrollTop=0")
        seen_titles = []      # article titles handled in this run
        # BUG FIX: this assignment was commented out but the name was
        # still referenced below, raising NameError on the first
        # already-published URL.
        republishdThre = 5    # stop after this many duplicate hits
        republishedCount = 0
        link_list = driver.find_element_by_class_name(
            "fallsFlow").find_elements_by_css_selector(
                "li[class=\"item masonry-brick\"]")
        for elem in link_list:
            article = elem.find_element_by_tag_name("h3")
            title = article.text  # article title
            if title in seen_titles:
                continue
            LOGGER.debug("article title %s" % (title))
            url = article.find_element_by_tag_name("a").get_attribute("href")
            LOGGER.info("url:%s" % (url))
            if self.isPublished(url) is False:
                abstract = elem.find_element_by_tag_name("h5").text
                # publish the url msg to mq
                msg = self.formatMsg(url, self.tag, self.sub_tag, title,
                                     abstract)
                self.publishMsg(msg)
            else:
                republishedCount += 1
                if republishedCount >= republishdThre:
                    break
            seen_titles.append(title)
    except Exception:
        LOGGER.error(traceback.format_exc())
    finally:
        # BUG FIX: quit the driver so the PhantomJS process does not leak.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
def run(self):
    """Count published URLs eligible for re-extraction.

    NOTE(review): the fetched count is not consumed within this block --
    presumably paging logic follows elsewhere in the file; confirm.
    """
    self.mysql_client = MysqlClient()
    self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)
    self.threhold = 5   # max retry count considered for re-extraction
    self.page = 14      # page size, presumably for a later LIMIT scan
    LOGGER.info("start re extractor the published url if count() < %s"
                % (self.threhold, ))
    failed_count = 0
    try:
        failed_count = self.mysql_client.getOne(
            "select count(*) as c from published_url where count < %s",
            (self.threhold, ))
    except Exception:
        LOGGER.error("failed to load the published url count")
        LOGGER.error(traceback.format_exc())
def run(self):
    """Count failed URLs eligible for re-extraction.

    NOTE(review): the fetched count is not consumed within this block --
    presumably paging logic follows elsewhere in the file; confirm.
    """
    self.mysql_client = MysqlClient()
    self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)
    self.threhold = 5   # max retry count considered for re-extraction
    self.page = 14      # page size, presumably for a later LIMIT scan
    LOGGER.info("start re extractor the failed url if count() < %s"
                % (self.threhold, ))
    failed_count = 0
    try:
        failed_count = self.mysql_client.getOne(
            "select count(*) as c from failed_url where count < %s",
            (self.threhold, ))
    except Exception:
        LOGGER.error("failed to load the failed url count")
        LOGGER.error(traceback.format_exc())
def extract_links(self):
    """Walk up to 10 pages of ``list_f14d`` lists, publishing unseen links.

    Stops early once ``republishdThre`` already-published URLs are hit.
    """
    driver = None
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        LOGGER.debug("start extractor from %s" % (self.url,))
        driver.get(self.url)
        # Scroll to the bottom then back to the top so lazily loaded
        # items are rendered before we query the DOM.
        driver.execute_script("var q=document.documentElement.scrollTop=10000")
        driver.execute_script("var q=document.documentElement.scrollTop=0")
        seen_titles = []      # article titles handled in this run
        page_no = 0           # page counter
        keep_going = True
        # BUG FIX: this assignment was commented out but the name was
        # still referenced below, raising NameError on the first
        # already-published URL.
        republishdThre = 5    # stop after this many duplicate hits
        republishedCount = 0
        while page_no < 10 and keep_going:
            contents = driver.find_elements_by_class_name("list_f14d")
            for content in contents:
                for elem in content.find_elements_by_tag_name("li"):
                    hrefs = elem.find_elements_by_tag_name("a")
                    title = hrefs[0].text  # article title
                    if title in seen_titles:
                        continue
                    LOGGER.debug("article title %s" % (title))
                    url = hrefs[0].get_attribute("href")
                    LOGGER.info("url:%s" % (url))
                    if self.isPublished(url) is False:
                        abstract = ""  # this site exposes no summary element
                        # publish the url msg to mq
                        msg = self.formatMsg(url, self.tag, self.sub_tag,
                                             title, abstract)
                        self.publishMsg(msg)
                    else:
                        republishedCount += 1
                        if republishedCount >= republishdThre:
                            keep_going = False
                            break
                    seen_titles.append(title)
            # load the next page (last pager link)
            next_page = driver.find_element_by_class_name(
                "pages").find_elements_by_tag_name("a")[-1]
            next_page.click()
            driver.implicitly_wait(5)
            page_no += 1
    except Exception:
        LOGGER.error(traceback.format_exc())
    finally:
        # BUG FIX: quit the driver so the PhantomJS process does not leak.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
def extract_links(self):
    """Scrape up to three ``pictxt block`` sections, publishing unseen links.

    Stops early once ``republishdThre`` already-published URLs are hit.
    """
    driver = None
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        LOGGER.debug("start extractor from %s" % (self.url, ))
        driver.get(self.url)
        # Scroll to the bottom then back to the top so lazily loaded
        # items are rendered before we query the DOM.
        driver.execute_script("var q=document.documentElement.scrollTop=10000")
        driver.execute_script("var q=document.documentElement.scrollTop=0")
        seen_titles = []      # article titles handled in this run
        section_no = 0        # index of the "pictxt block" ul to scan
        keep_going = True
        # BUG FIX: this assignment was commented out but the name was
        # still referenced below, raising NameError on the first
        # already-published URL.
        republishdThre = 5    # stop after this many duplicate hits
        republishedCount = 0
        while section_no < 3 and keep_going:
            link_list = driver.find_elements_by_css_selector(
                "ul[class=\"pictxt block\"]")[section_no].find_elements_by_tag_name("li")
            for elem in link_list:
                article = elem.find_element_by_class_name("tit")
                title = article.text  # article title
                if title in seen_titles:
                    continue
                LOGGER.debug("article title %s" % (title))
                url = article.find_element_by_tag_name("a").get_attribute("href")
                LOGGER.info("url:%s" % (url))
                if self.isPublished(url) is False:
                    abstract = elem.find_element_by_class_name("txt").text
                    # publish the url msg to mq
                    msg = self.formatMsg(url, self.tag, self.sub_tag,
                                         title, abstract)
                    self.publishMsg(msg)
                else:
                    republishedCount += 1
                    if republishedCount >= republishdThre:
                        keep_going = False
                        break
                seen_titles.append(title)
            # load the next batch via the "HomeMore" link
            next_page = driver.find_element_by_class_name(
                "HomeMore").find_element_by_tag_name("a")
            next_page.click()
            driver.implicitly_wait(5)
            section_no += 1
    except Exception:
        LOGGER.error(traceback.format_exc())
    finally:
        # BUG FIX: quit the driver so the PhantomJS process does not leak.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
class NetEaseExtractorPlay(BaseExtractor):
    """Extractor for the NetEase 'play' channel article lists."""

    def __init__(self, config):
        """Forward the per-source config dict to the base extractor."""
        super(NetEaseExtractorPlay, self).__init__(config)

    def extract_links(self):
        """Walk up to 10 'more' pages, publishing unseen article links.

        Stops early once ``republishdThre`` already-published URLs
        are hit.
        """
        driver = None
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extractor from %s" % (self.url, ))
            try:
                driver.get(self.url)
            except Exception:
                # Deliberate best-effort: the page load may time out but
                # the rendered DOM can still be usable.
                pass
            # Scroll to the bottom then back to the top so lazily loaded
            # items are rendered before we query the DOM.
            driver.execute_script(
                "var q=document.documentElement.scrollTop=10000")
            driver.execute_script(
                "var q=document.documentElement.scrollTop=0")
            seen_titles = []      # article titles handled in this run
            page_no = 0           # "more" click counter
            keep_going = True
            # BUG FIX: this assignment was commented out but the name was
            # still referenced below, raising NameError on the first
            # already-published URL.
            republishdThre = 5    # stop after this many duplicate hits
            republishedCount = 0
            while page_no < 10 and keep_going:
                link_list = driver.find_elements_by_css_selector(
                    "div[class=\"m-collist clearfix\"]")
                for elem in link_list:
                    title = elem.find_element_by_tag_name("dt").text
                    if title in seen_titles:
                        continue
                    LOGGER.debug("article title %s" % (title))
                    url = elem.find_element_by_tag_name(
                        "a").get_attribute("href")
                    LOGGER.info("url:%s" % (url))
                    if self.isPublished(url) is False:
                        abstract = elem.find_element_by_tag_name("dd").text
                        # publish the url msg to mq
                        msg = self.formatMsg(url, self.tag, self.sub_tag,
                                             title, abstract)
                        self.publishMsg(msg)
                    else:
                        republishedCount += 1
                        if republishedCount >= republishdThre:
                            keep_going = False
                            break
                    seen_titles.append(title)
                # load the next page via the "more" button
                next_page = driver.find_element_by_class_name(
                    "m-collist-more")
                next_page.click()
                driver.implicitly_wait(5)
                page_no += 1
        except Exception:
            LOGGER.error(traceback.format_exc())
        finally:
            # BUG FIX: quit the driver so the PhantomJS process does
            # not leak.
            if driver is not None:
                try:
                    driver.quit()
                except Exception:
                    pass
def extract_links(self):
    """Walk up to 10 pages of ``list_txt`` lists, publishing unseen links.

    Stops early once ``republishdThre`` already-published URLs are hit.
    """
    driver = None
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        LOGGER.debug("start extractor from %s" % (self.url, ))
        driver.get(self.url)
        # Scroll to the bottom then back to the top so lazily loaded
        # items are rendered before we query the DOM.
        driver.execute_script("var q=document.documentElement.scrollTop=10000")
        driver.execute_script("var q=document.documentElement.scrollTop=0")
        seen_titles = []      # article titles handled in this run
        page_no = 0           # page counter
        keep_going = True
        # BUG FIX: this assignment was commented out but the name was
        # still referenced below, raising NameError on the first
        # already-published URL.
        republishdThre = 5    # stop after this many duplicate hits
        republishedCount = 0
        while page_no < 10 and keep_going:
            contents = driver.find_elements_by_class_name("list_txt")
            for content in contents:
                for elem in content.find_elements_by_tag_name("li"):
                    hrefs = elem.find_elements_by_tag_name("a")
                    title = hrefs[1].text  # article title (second anchor)
                    if title in seen_titles:
                        continue
                    LOGGER.debug("article title %s" % (title))
                    url = hrefs[1].get_attribute("href")
                    LOGGER.info("url:%s" % (url))
                    if self.isPublished(url) is False:
                        abstract = ""  # this site exposes no summary element
                        # publish the url msg to mq
                        msg = self.formatMsg(url, self.tag, self.sub_tag,
                                             title, abstract)
                        self.publishMsg(msg)
                    else:
                        republishedCount += 1
                        if republishedCount >= republishdThre:
                            keep_going = False
                            break
                    seen_titles.append(title)
            # load the next page (second pager-flip link)
            next_page = driver.find_elements_by_class_name(
                "bar_pages_flip")[1]
            next_page.click()
            driver.implicitly_wait(5)
            page_no += 1
    except Exception:
        LOGGER.error(traceback.format_exc())
    finally:
        # BUG FIX: quit the driver so the PhantomJS process does not leak.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
"extractor": HuanQiuExtractor, "tag":"huanqiu", "sub_tag":"world", "period": 10 }, ] class Extractor(Daemon): def __init__(self, pidfile, stdin=os.devnull, stdout=os.devnull, stderr=os.devnull ): super(Extractor, self).__init__(pidfile , stdin, stdout, stderr) def run(self): """ run the extractor use dict """ try: LOGGER.debug("start the extractor") for elem in extractor_source_url_config: extractor = elem["extractor"](elem) extractor.extract_links() except Exception, e: LOGGER.error(traceback.format_exc()) finally: LOGGER.info("finished extractor") if __name__ == '__main__': # daemon_main(Extractor, './ ', sys.argv) extractor = Extractor("./") extractor.run()
def process(self, msg):
    """Tag *msg* with a timestamp id and hand it to the handler repository.

    Logs the message when the handler reports success.

    NOTE(review): the id is the raw float timestamp -- messages created
    within the clock's resolution may collide; confirm uniqueness needs.
    """
    # BUG FIX: removed `now = int(time.time())`, which was computed but
    # never used.
    msg["id"] = time.time()
    if self.handlerRepository.process(self.message_key, msg):
        LOGGER.info('[Publish %d] received message: %s %s'
                    % (os.getpid(), self.message_key, msg))
def extract_links(self):
    """Scrape the ``d_list_txt`` list and publish unseen article links.

    Also dumps the rendered page source to a local file named ``html``
    for offline debugging.
    """
    driver = None
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        LOGGER.debug("start extractor from %s" % (self.url, ))
        driver.get(self.url)
        # BUG FIX: use open() in a context manager instead of the
        # Python-2-only file() builtin, which leaked the handle if a
        # write raised.
        with open("html", "w") as dump:
            dump.write(driver.page_source.encode("utf-8"))
        seen_titles = []      # article titles handled in this run
        # BUG FIX: this assignment was commented out but the name was
        # still referenced below, raising NameError on the first
        # already-published URL.
        republishdThre = 5    # stop after this many duplicate hits
        republishedCount = 0
        # Scroll down then back up so lazily loaded items render.
        driver.execute_script("var q=document.documentElement.scrollTop=8000")
        driver.implicitly_wait(0)
        driver.execute_script("var q=document.documentElement.scrollTop=0")
        driver.implicitly_wait(0)
        link_list = driver.find_element_by_class_name(
            "d_list_txt").find_elements_by_tag_name("li")
        print(len(link_list))
        for elem in link_list:
            title = elem.text  # whole <li> text serves as the title key
            if title in seen_titles:
                continue
            LOGGER.debug("article title %s" % (title))
            url = elem.find_element_by_class_name(
                "c_tit").find_element_by_tag_name("a").get_attribute("href")
            LOGGER.info("url:%s" % (url))
            if self.isPublished(url) is False:
                abstract = ""  # this site exposes no summary element
                # publish the url msg to mq
                msg = self.formatMsg(url, self.tag, self.sub_tag, title,
                                     abstract)
                self.publishMsg(msg)
            else:
                republishedCount += 1
                if republishedCount >= republishdThre:
                    break
            seen_titles.append(title)
    except Exception:
        LOGGER.error(traceback.format_exc())
    finally:
        # BUG FIX: quit the driver so the PhantomJS process does not leak.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
}, ] class Extractor(Daemon): def __init__(self, pidfile, stdin=os.devnull, stdout=os.devnull, stderr=os.devnull): super(Extractor, self).__init__(pidfile, stdin, stdout, stderr) def run(self): """ run the extractor use dict """ try: LOGGER.debug("start the extractor") for elem in extractor_source_url_config: extractor = elem["extractor"](elem) extractor.extract_links() except Exception, e: LOGGER.error(traceback.format_exc()) finally: LOGGER.info("finished extractor") if __name__ == '__main__': # daemon_main(Extractor, './ ', sys.argv) extractor = Extractor("./") extractor.run()