def extract_links(self):
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
#             driver = webdriver.Firefox()
            LOGGER.debug("start extractor from %s" %(self.url, ))
            driver.get(self.url)
            
            seen_titles = []  # titles already extracted in this run

            stop_flag = True  # set to False once enough duplicates are seen
            republish_threshold = 5  # stop after 5 already-published articles
            republished_count = 0
         
            js = "var q=document.documentElement.scrollTop=8000"
            driver.execute_script(js)
            driver.implicitly_wait(0)
  
            js = "var q=document.documentElement.scrollTop=0"
            driver.execute_script(js)
            driver.implicitly_wait(0)
            
            # collect the article rows
            link_list = driver.find_element_by_class_name("page1").find_elements_by_class_name("row")
            
            LOGGER.debug("found %d article rows" % len(link_list))
    
            for elem in link_list:
                anchor = elem.find_element_by_class_name("list-tt").find_element_by_tag_name("a")
                title = anchor.text  # article title
                if title not in seen_titles:
                    LOGGER.debug("article title %s" % (title, ))
                    url = anchor.get_attribute("href")
                    LOGGER.info("url:%s" % (url, ))
    
                    url_is_exists = self.isPublished(url)
                    if not url_is_exists:
                        abstract = elem.find_element_by_class_name("f4").text
                        # publish the URL message to the message queue
                        msg = self.formatMsg(url, self.tag, self.sub_tag, title, abstract)
                        self.publishMsg(msg)
                    else:
                        # the URL was already published earlier
                        republished_count += 1
                        if republished_count >= republish_threshold:
                            stop_flag = False
                            break
                    seen_titles.append(title)

        except Exception, e:
            LOGGER.error(traceback.format_exc())
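
# None of the extract_links() variants here ever call driver.quit(), so a
# crashed run can leave a PhantomJS process behind. A minimal sketch of a
# session helper, assuming the same selenium webdriver import used above;
# the name phantomjs_session is ours, not part of the project:
from contextlib import contextmanager

@contextmanager
def phantomjs_session(phantomjs_path):
    driver = webdriver.PhantomJS(phantomjs_path)
    try:
        yield driver
    finally:
        driver.quit()  # always terminate the PhantomJS child process

# hypothetical usage inside extract_links():
#     with phantomjs_session(PHANTOMJS_PATH) as driver:
#         driver.get(self.url)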
    def extract_links(self):
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extractor from %s" %(self.url, ))
            driver.get(self.url)
            
            #scroll bar set from bottom to top, make the page load all
            js = "var q=document.documentElement.scrollTop=10000"
            driver.execute_script(js)
            js = "var q=document.documentElement.scrollTop=0"
            driver.execute_script(js)

            seen_titles = []  # titles already extracted in this run

            stop_flag = True  # set to False once enough duplicates are seen
            republish_threshold = 5  # stop after 5 already-published articles
            republished_count = 0


            # collect the article entries
            link_list = driver.find_element_by_class_name("fallsFlow").find_elements_by_css_selector("li[class=\"item masonry-brick\"]")

            for elem in link_list:
                article = elem.find_element_by_tag_name("h3")
                title = article.text # article title
                if title not in seen_titles:
                    LOGGER.debug("article title %s" % (title, ))

                    url = article.find_element_by_tag_name("a").get_attribute("href")
                    LOGGER.info("url:%s" % (url, ))
    
                    url_is_exists = self.isPublished(url)
                    if not url_is_exists:
                        abstract = elem.find_element_by_tag_name("h5").text
                        # publish the URL message to the message queue
                        msg = self.formatMsg(url, self.tag, self.sub_tag, title, abstract)
                        self.publishMsg(msg)
                    else:
                        # the URL was already published earlier
                        republished_count += 1
                        if republished_count >= republish_threshold:
                            stop_flag = False
                            break
                    seen_titles.append(title)

        except Exception, e:
            LOGGER.error(traceback.format_exc())
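
# Note on the selector above: li[class="item masonry-brick"] only matches
# when the class attribute is exactly that string, in that order. A
# token-based CSS selector is more robust if the site adds extra classes;
# a sketch using the same selenium API as above:
#
#     link_list = driver.find_element_by_class_name("fallsFlow") \
#         .find_elements_by_css_selector("li.item.masonry-brick")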
    def run(self):
        """
        re-extract published URLs whose publish count is below the threshold
        """
        self.mysql_client = MysqlClient()
        self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)
        self.threshold = 5
        self.page = 14
        LOGGER.info("start re-extracting published urls where count < %s" % (self.threshold, ))
        failed_count = 0
        try:
            failed_count = self.mysql_client.getOne("select count(*) as c from published_url where count < %s", (self.threshold, ))
        except Exception, e:
            LOGGER.error("failed to load the published url count")
            LOGGER.error(traceback.format_exc())
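
    # isPublished() is used by every extractor above but its body is not
    # shown. A minimal sketch under the assumption that
    # MysqlClient.getOne(sql, params) returns a row dict, as the query in
    # run() above suggests; the column names are taken from that query and
    # are otherwise unverified:
    def isPublished(self, url):
        row = self.mysql_client.getOne(
            "select count(*) as c from published_url where url = %s",
            (url, ))
        return row["c"] > 0  # True when the URL was already published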
    def extract_links(self):
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extractor from %s" % (self.url,))
            driver.get(self.url)

            # scroll to the bottom and back to the top so lazily loaded
            # items render before the DOM is queried
            driver.execute_script("var q=document.documentElement.scrollTop=10000")
            driver.execute_script("var q=document.documentElement.scrollTop=0")

            seen_titles = []  # titles already extracted in this run

            i = 0  # page counter
            stop_flag = True  # set to False once enough duplicates are seen
            republish_threshold = 5  # stop after 5 already-published articles
            republished_count = 0

            while i < 10 and stop_flag:

                # collect the article list containers on the current page
                contents = driver.find_elements_by_class_name("list_f14d")
                for content in contents:
                    link_list = content.find_elements_by_tag_name("li")

                    for elem in link_list:
                        hrefs = elem.find_elements_by_tag_name("a")
                        title = hrefs[0].text  # article title
                        if title not in seen_titles:
                            LOGGER.debug("article title %s" % (title, ))

                            url = hrefs[0].get_attribute("href")
                            LOGGER.info("url:%s" % (url, ))

                            url_is_exists = self.isPublished(url)
                            if not url_is_exists:
                                abstract = ""  # this source exposes no abstract in the list view
                                # publish the URL message to the message queue
                                msg = self.formatMsg(url, self.tag, self.sub_tag, title, abstract)
                                self.publishMsg(msg)
                            else:
                                # the URL was already published earlier
                                republished_count += 1
                                if republished_count >= republish_threshold:
                                    stop_flag = False
                                    break
                            seen_titles.append(title)

                # load the next page; note implicitly_wait only sets the
                # element-lookup timeout, it does not pause after the click
                next_page = driver.find_element_by_class_name("pages").find_elements_by_tag_name("a")[-1]
                next_page.click()
                driver.implicitly_wait(5)
                i += 1

        except Exception, e:
            LOGGER.error(traceback.format_exc())
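
# driver.implicitly_wait(5) after the click above only configures the
# element-lookup timeout; it does not pause for the next page to render.
# A sketch of an explicit wait using selenium's standard support module
# (the class name "list_f14d" is taken from the snippet above):
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_article_list(driver, timeout=5):
    # block until at least one article list container is present
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, "list_f14d")))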
    def extract_links(self):
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extractor from %s" %(self.url, ))
            driver.get(self.url)
            
            # scroll to the bottom and back to the top so lazily loaded
            # items render before the DOM is queried
            driver.execute_script("var q=document.documentElement.scrollTop=10000")
            driver.execute_script("var q=document.documentElement.scrollTop=0")

            seen_titles = []  # titles already extracted in this run

            i = 0  # page counter
            stop_flag = True  # set to False once enough duplicates are seen
            republish_threshold = 5  # stop after 5 already-published articles
            republished_count = 0
            
                
            while i < 3 and stop_flag:

                # collect the article entries for page i
                link_list = driver.find_elements_by_css_selector("ul[class=\"pictxt block\"]")[i].find_elements_by_tag_name("li")
    
                for elem in link_list:
                    article = elem.find_element_by_class_name("tit")
                    title = article.text # article title
                    if title not in seen_titles:
                        LOGGER.debug("article title %s" % (title, ))

                        url = article.find_element_by_tag_name("a").get_attribute("href")
                        LOGGER.info("url:%s" % (url, ))

                        url_is_exists = self.isPublished(url)
                        if not url_is_exists:
                            abstract = elem.find_element_by_class_name("txt").text
                            # publish the URL message to the message queue
                            msg = self.formatMsg(url, self.tag, self.sub_tag, title, abstract)
                            self.publishMsg(msg)
                        else:
                            # the URL was already published earlier
                            republished_count += 1
                            if republished_count >= republish_threshold:
                                stop_flag = False
                                break
                        seen_titles.append(title)

                # load the next page; note implicitly_wait only sets the
                # element-lookup timeout, it does not pause after the click
                next_page = driver.find_element_by_class_name("HomeMore").find_element_by_tag_name("a")
                next_page.click()
                driver.implicitly_wait(5)
                i += 1

        except Exception, e:
            LOGGER.error(traceback.format_exc())
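
# The duplicate-counting early stop is copied into every snippet above. A
# compact sketch of the same logic as a helper class; the name
# DuplicateStop is ours, not part of the project:
class DuplicateStop(object):
    def __init__(self, threshold=5):
        self.threshold = threshold
        self.count = 0

    def hit(self):
        # record one already-published article; returns True once the
        # threshold is reached and extraction should stop
        self.count += 1
        return self.count >= self.threshold

# hypothetical usage inside the extraction loop:
#     stopper = DuplicateStop()
#     if self.isPublished(url) and stopper.hit():
#         break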
    def process(self, msg):
        msg["id"] = time.time()  # timestamp used as the message id

        if self.handlerRepository.process(self.message_key, msg):
            LOGGER.info('[Publish %d] received message: %s %s' % (os.getpid(), self.message_key, msg))
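
    # formatMsg() is called with the same five arguments in every snippet
    # but its body is not shown. A minimal sketch; the dict keys are
    # assumptions based on the call sites, alongside the "id" field added
    # by process() above:
    def formatMsg(self, url, tag, sub_tag, title, abstract):
        return {
            "url": url,            # article link
            "tag": tag,            # source tag, e.g. "huanqiu"
            "sub_tag": sub_tag,    # section tag, e.g. "world"
            "title": title,        # article title
            "abstract": abstract,  # short summary, may be empty
        }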
Example #7
    def extract_links(self):
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            #             driver = webdriver.Firefox()
            LOGGER.debug("starting extraction from %s" % (self.url, ))
            driver.get(self.url)
            # dump the rendered page source for offline debugging
            with open("html", "w") as f:
                f.write(driver.page_source.encode("utf-8"))

            seen_titles = []  # titles already extracted in this run

            stop_flag = True  # set to False once enough duplicates are seen
            republish_threshold = 5  # stop after 5 already-published articles
            republished_count = 0

            js = "var q=document.documentElement.scrollTop=8000"
            driver.execute_script(js)
            driver.implicitly_wait(0)

            js = "var q=document.documentElement.scrollTop=0"
            driver.execute_script(js)
            driver.implicitly_wait(0)

            # collect the article entries
            link_list = driver.find_element_by_class_name(
                "d_list_txt").find_elements_by_tag_name("li")

            LOGGER.debug("found %d article entries" % len(link_list))

            for elem in link_list:
                title = elem.text  # the whole list item text serves as the title
                if title not in seen_titles:
                    LOGGER.debug("article title %s" % (title, ))
                    url = elem.find_element_by_class_name(
                        "c_tit").find_element_by_tag_name("a").get_attribute(
                            "href")
                    LOGGER.info("url:%s" % (url, ))

                    url_is_exists = self.isPublished(url)
                    if not url_is_exists:
                        abstract = ""  # this source exposes no abstract in the list view
                        # publish the URL message to the message queue
                        msg = self.formatMsg(url, self.tag, self.sub_tag,
                                             title, abstract)
                        self.publishMsg(msg)
                    else:
                        # the URL was already published earlier
                        republished_count += 1
                        if republished_count >= republish_threshold:
                            stop_flag = False
                            break
                    seen_titles.append(title)

        except Exception, e:
            LOGGER.error(traceback.format_exc())
          "extractor": HuanQiuExtractor,
          "tag":"huanqiu",
          "sub_tag":"world",
          "period": 10
          },   
     ]


class Extractor(Daemon):
    
    def __init__(self, pidfile, stdin=os.devnull, stdout=os.devnull, stderr=os.devnull):
        super(Extractor, self).__init__(pidfile, stdin, stdout, stderr)
    
    def run(self):
        """
        run the extractor use dict
        """
        try:
            LOGGER.debug("start the extractor")
            for elem in extractor_source_url_config:
                extractor = elem["extractor"](elem)
                extractor.extract_links()
        except Exception, e:
            LOGGER.error(traceback.format_exc())
        finally:
            LOGGER.info("finished extractor")

if __name__ == '__main__':
#     daemon_main(Extractor, './ ', sys.argv)
    extractor = Extractor("./")
    extractor.run()
    def extract_links(self):
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            #             driver.set_page_load_timeout(10)
            LOGGER.debug("starting extraction from %s" % (self.url, ))
            try:
                driver.get(self.url)
            except Exception:
                # some pages never finish loading under PhantomJS; ignore
                # load timeouts and work with whatever was rendered
                pass

            # scroll to the bottom and back to the top so lazily loaded
            # items render before the DOM is queried
            driver.execute_script("var q=document.documentElement.scrollTop=10000")
            driver.execute_script("var q=document.documentElement.scrollTop=0")

            seen_titles = []  # titles already extracted in this run

            i = 0  # page counter
            stop_flag = True  # set to False once enough duplicates are seen
            republish_threshold = 5  # stop after 5 already-published articles
            republished_count = 0

            while i < 10 and stop_flag:

                # collect the article list containers
                link_list = driver.find_elements_by_css_selector(
                    "div[class=\"m-collist clearfix\"]")

                for elem in link_list:
                    article = elem.find_element_by_tag_name("dt")
                    title = article.text  # article title
                    if title not in seen_titles:
                        LOGGER.debug("article title %s" % (title, ))

                        url = elem.find_element_by_tag_name("a").get_attribute(
                            "href")
                        LOGGER.info("url:%s" % (url, ))

                        url_is_exists = self.isPublished(url)
                        if not url_is_exists:
                            abstract = elem.find_element_by_tag_name("dd").text
                            # publish the URL message to the message queue
                            msg = self.formatMsg(url, self.tag, self.sub_tag,
                                                 title, abstract)
                            self.publishMsg(msg)
                        else:
                            # the URL was already published earlier
                            republished_count += 1
                            if republished_count >= republish_threshold:
                                stop_flag = False
                                break
                        seen_titles.append(title)

                # load the next page; note implicitly_wait only sets the
                # element-lookup timeout, it does not pause after the click
                next_page = driver.find_element_by_class_name("m-collist-more")
                next_page.click()
                driver.implicitly_wait(5)
                i += 1
        except Exception, e:
            LOGGER.error(traceback.format_exc())

Example #10
    },
]


class Extractor(Daemon):
    def __init__(self,
                 pidfile,
                 stdin=os.devnull,
                 stdout=os.devnull,
                 stderr=os.devnull):
        super(Extractor, self).__init__(pidfile, stdin, stdout, stderr)

    def run(self):
        """
        run the extractor use dict
        """
        try:
            LOGGER.debug("start the extractor")
            for elem in extractor_source_url_config:
                extractor = elem["extractor"](elem)
                extractor.extract_links()
        except Exception, e:
            LOGGER.error(traceback.format_exc())
        finally:
            LOGGER.info("finished extractor")


if __name__ == '__main__':
    #     daemon_main(Extractor, './ ', sys.argv)
    extractor = Extractor("./")
    extractor.run()