Code example #1
    def run(self):
        """
        Run every extractor listed in the config dicts.
        """
        try:
            LOGGER.debug("start the extractor")
            for elem in extractor_source_url_config:
                extractor = elem["extractor"](elem)
                extractor.extract_links()
        except Exception:
            LOGGER.error(traceback.format_exc())
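
The loop assumes extractor_source_url_config is a list of dicts, one per source, whose "extractor" value is the extractor class itself; code example #12 below shows the tail of such a list. A minimal sketch of one entry, with hypothetical values:

# Hypothetical config entry; the keys mirror what run() and the
# extractors read. The url value is made up.
extractor_source_url_config = [
    {
        "extractor": HuanQiuExtractor,  # class object, called as elem["extractor"](elem)
        "url": "http://world.huanqiu.com/",
        "tag": "huanqiu",
        "sub_tag": "world",
        "period": 10,
    },
]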
Code example #2
class FailedExtractor(Daemon):
    '''
    Daemon that re-publishes failed URLs which have been retried fewer
    than `threshold` times.
    '''
    def __init__(self,
                 pidfile,
                 stdin=os.devnull,
                 stdout=os.devnull,
                 stderr=os.devnull):
        super(FailedExtractor, self).__init__(pidfile, stdin, stdout, stderr)

    def run(self):
        """
        Page through failed_url and re-publish each entry to the news queue.
        """
        self.mysql_client = MysqlClient()
        self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)
        self.threshold = 5
        self.page = 14  # rows fetched per query
        LOGGER.info("start re-extracting failed urls with count < %s" %
                    (self.threshold, ))
        failed_count = 0
        try:
            row = self.mysql_client.getOne(
                "select count(*) as c from failed_url where count < %s",
                (self.threshold, ))
            failed_count = int(row["c"])
        except Exception:
            LOGGER.error("failed to load the failed url count")
            LOGGER.error(traceback.format_exc())
        count = 0
        while count < failed_count:
            try:
                LOGGER.debug("processed %s of %s failed urls" %
                             (count, failed_count))
                urls = self.mysql_client.getAll(
                    "select * from failed_url where count < %s limit %s, %s",
                    (self.threshold, count, self.page))
                if not urls:
                    break
                count += len(urls)
                for url in urls:
                    LOGGER.info("re-extracting url: %s" % (url["url"], ))
                    msg = self.mysql_client.getOne(
                        "select abstract, title from published_url where url = %s",
                        (url["url"], ))
                    if not msg:
                        # no published record to copy title/abstract from
                        continue
                    url["title"] = msg["title"]
                    url["abstract"] = msg["abstract"]
                    self.news_publisher.process(url)
            except Exception:
                LOGGER.error("re-extracting urls error")
                LOGGER.error(traceback.format_exc())
                break  # avoid spinning forever on a persistent error
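
The snippets lean on a thin MysqlClient wrapper that is never shown. A hedged sketch of the interface the call sites imply (getOne, getAll, insertOne, end), assuming a MySQLdb DictCursor underneath; the connection parameters and the False-on-empty convention are guesses:

import MySQLdb
import MySQLdb.cursors

class MysqlClient(object):
    """Sketch of the wrapper assumed by the examples; not the real implementation."""

    def __init__(self):
        # hypothetical connection parameters
        self.conn = MySQLdb.connect(
            host="localhost", user="news", passwd="secret", db="news",
            cursorclass=MySQLdb.cursors.DictCursor)

    def getOne(self, sql, params=()):
        cur = self.conn.cursor()
        cur.execute(sql, params)
        return cur.fetchone() or False  # call sites test `if not urls`

    def getAll(self, sql, params=()):
        cur = self.conn.cursor()
        cur.execute(sql, params)
        return cur.fetchall() or False

    def insertOne(self, sql, params=()):
        cur = self.conn.cursor()
        return cur.execute(sql, params)

    def end(self, action="commit"):
        # publishMsg() calls end("commit") on success, end("rollback") on error
        if action == "commit":
            self.conn.commit()
        else:
            self.conn.rollback()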
Code example #3
    def publishMsg(self, msg):
        try:
            self.news_publisher.process(msg)
            self.mysql_client.insertOne(
                "insert into published_url(url, tag, sub_tag, version, create_time, title, abstract) "
                "values(%s, %s, %s, %s, %s, %s, %s)",
                (msg["url"], msg["tag"], msg["sub_tag"], msg["version"],
                 msg["create_time"], msg["title"], msg.get("abstract", "")))
            self.mysql_client.end("commit")
        except Exception:
            self.mysql_client.end("rollback")
            LOGGER.error("failed to publish msg: %s" % (msg["url"], ))
            LOGGER.error(traceback.format_exc())


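publishMsg targets a published_url table whose schema never appears in the snippets. A hypothetical DDL consistent with the insert above and with the `where count < %s` retry queries; all column names come from the code, all types are guesses:

# Hypothetical schema, inferred from the call sites.
PUBLISHED_URL_DDL = """
create table if not exists published_url (
    url         varchar(512) not null primary key,
    tag         varchar(64),
    sub_tag     varchar(64),
    version     varchar(32),
    create_time int,
    title       varchar(512),
    abstract    text,
    count       int not null default 0
)
"""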
Code example #4
    def extract_links(self):
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extracting from %s" % (self.url, ))
            driver.get(self.url)

            # scroll to the bottom and back to the top so lazy content loads
            driver.execute_script("document.documentElement.scrollTop=10000")
            driver.execute_script("document.documentElement.scrollTop=0")

            titles = []  # titles already handled in this run
            republishThreshold = 5  # stop after this many already-published articles
            republishedCount = 0

            # find the article title entries
            link_list = driver.find_element_by_class_name(
                "fallsFlow").find_elements_by_css_selector(
                    "li[class=\"item masonry-brick\"]")

            for elem in link_list:
                article = elem.find_element_by_tag_name("h3")
                title = article.text  # article title
                if title in titles:
                    continue
                LOGGER.debug("article title %s" % (title, ))
                url = article.find_element_by_tag_name("a").get_attribute("href")
                LOGGER.info("url:%s" % (url, ))

                if not self.isPublished(url):
                    abstract = elem.find_element_by_tag_name("h5").text
                    # publish the url msg to the queue
                    msg = self.formatMsg(url, self.tag, self.sub_tag,
                                         title, abstract)
                    self.publishMsg(msg)
                else:
                    # remaining urls were probably already published
                    republishedCount += 1
                    if republishedCount >= republishThreshold:
                        break
                titles.append(title)

        except Exception:
            LOGGER.error(traceback.format_exc())
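
PhantomJS support has been removed from current Selenium releases, so these snippets only run on the old Selenium/PhantomJS stack. A hedged sketch of the same fetch-scroll-collect step on Selenium 4 with headless Chrome, reusing the selectors from the example above:

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")  # requires a recent Chrome
driver = webdriver.Chrome(options=options)
try:
    driver.get(url)  # the page self.url points at above
    # bottom-then-top scroll to trigger lazy loading, as in the original
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    driver.execute_script("window.scrollTo(0, 0);")
    items = driver.find_element(By.CLASS_NAME, "fallsFlow").find_elements(
        By.CSS_SELECTOR, "li.item.masonry-brick")
    for item in items:
        print(item.find_element(By.TAG_NAME, "h3").text)
finally:
    driver.quit()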
Code example #5
    def extract_links(self):
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extracting from %s" % (self.url, ))
            driver.get(self.url)

            titles = []  # titles already handled in this run
            republishThreshold = 5  # stop after this many already-published articles
            republishedCount = 0

            # scroll to the bottom and back to the top so lazy content loads
            driver.execute_script("document.documentElement.scrollTop=8000")
            driver.implicitly_wait(0)
            driver.execute_script("document.documentElement.scrollTop=0")
            driver.implicitly_wait(0)

            link_list = driver.find_element_by_class_name(
                "page1").find_elements_by_class_name("row")
            LOGGER.debug("found %d items" % (len(link_list), ))

            for elem in link_list:
                anchor = elem.find_element_by_class_name(
                    "list-tt").find_element_by_tag_name("a")
                title = anchor.text  # article title
                if title in titles:
                    continue
                LOGGER.debug("article title %s" % (title, ))
                url = anchor.get_attribute("href")
                LOGGER.info("url:%s" % (url, ))

                if not self.isPublished(url):
                    abstract = elem.find_element_by_class_name("f4").text
                    # publish the url msg to the queue
                    msg = self.formatMsg(url, self.tag, self.sub_tag,
                                         title, abstract)
                    self.publishMsg(msg)
                else:
                    # remaining urls were probably already published
                    republishedCount += 1
                    if republishedCount >= republishThreshold:
                        break
                titles.append(title)

        except Exception:
            LOGGER.error(traceback.format_exc())
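
Note that implicitly_wait() only sets Selenium's element-lookup timeout; it does not pause the script, so the scrolls above may not have finished rendering before the lookups run. A hedged sketch of an explicit wait on the same container (driver as above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block for up to 10 seconds until the list container is in the DOM.
container = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "page1")))
rows = container.find_elements(By.CLASS_NAME, "row")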
Code example #6
    def extract_links(self):
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extracting from %s" % (self.url, ))
            driver.get(self.url)

            titles = []  # titles already handled in this run
            republishThreshold = 5  # stop after this many already-published articles
            republishedCount = 0

            # scroll to the bottom and back to the top so lazy content loads
            driver.execute_script("document.documentElement.scrollTop=8000")
            driver.implicitly_wait(0)
            driver.execute_script("document.documentElement.scrollTop=0")
            driver.implicitly_wait(0)

            link_list = driver.find_element_by_class_name(
                "list").find_elements_by_class_name("item")
            LOGGER.debug("found %d items" % (len(link_list), ))

            for elem in link_list:
                anchor = elem.find_element_by_tag_name(
                    "h1").find_element_by_tag_name("a")
                title = anchor.text  # article title
                if title in titles:
                    continue
                LOGGER.debug("article title %s" % (title, ))
                url = anchor.get_attribute("href")
                LOGGER.info("url:%s" % (url, ))

                if not self.isPublished(url):
                    abstract = elem.find_element_by_tag_name("p").text
                    # publish the url msg to the queue
                    msg = self.formatMsg(url, self.tag, self.sub_tag,
                                         title, abstract)
                    self.publishMsg(msg)
                else:
                    # remaining urls were probably already published
                    republishedCount += 1
                    if republishedCount >= republishThreshold:
                        break
                titles.append(title)

        except Exception:
            LOGGER.error(traceback.format_exc())
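
Every variant calls isPublished(), formatMsg() and publishMsg() from a BaseExtractor that never appears in the snippets. A hedged sketch of what the first two plausibly look like given the call sites; the duplicate check and the version literal are guesses:

import time

class BaseExtractor(object):
    """Sketch of the shared base assumed by the extract_links() methods."""

    def __init__(self, config):
        self.url = config.get("url")
        self.tag = config["tag"]
        self.sub_tag = config["sub_tag"]
        self.mysql_client = MysqlClient()
        self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)

    def isPublished(self, url):
        # True if the url already exists in published_url
        return bool(self.mysql_client.getOne(
            "select 1 from published_url where url = %s", (url, )))

    def formatMsg(self, url, tag, sub_tag, title, abstract):
        # field names mirror the insert in publishMsg()
        return {
            "url": url,
            "tag": tag,
            "sub_tag": sub_tag,
            "version": "1",  # hypothetical
            "create_time": int(time.time()),
            "title": title,
            "abstract": abstract,
        }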
Code example #7
    def run(self):
        """
        Re-publish entries from published_url with count below `threshold`.
        """
        self.mysql_client = MysqlClient()
        self.news_publisher = NewsPublisher(NEWS_URL_QUEUE)
        self.threshold = 5
        self.page = 14  # rows fetched per query
        LOGGER.info("start re-extracting published urls with count < %s" %
                    (self.threshold, ))
        failed_count = 0
        try:
            failed_count = self.mysql_client.getOne(
                "select count(*) as c from published_url where count < %s",
                (self.threshold, ))
        except Exception:
            LOGGER.error("failed to load the published url count")
            LOGGER.error(traceback.format_exc())
Code example #8
    def extract_links(self):
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extracting from %s" % (self.url, ))
            driver.get(self.url)

            # scroll to the bottom and back to the top so lazy content loads
            driver.execute_script("document.documentElement.scrollTop=10000")
            driver.execute_script("document.documentElement.scrollTop=0")

            titles = []  # titles already handled in this run
            i = 0  # page count
            stop_flag = True  # cleared once enough duplicates are seen
            republishThreshold = 5  # stop after this many already-published articles
            republishedCount = 0

            while i < 10 and stop_flag:
                # find the article title entries
                contents = driver.find_elements_by_class_name("list_f14d")
                for content in contents:
                    link_list = content.find_elements_by_tag_name("li")

                    for elem in link_list:
                        hrefs = elem.find_elements_by_tag_name("a")
                        title = hrefs[0].text  # article title
                        if title in titles:
                            continue
                        LOGGER.debug("article title %s" % (title, ))
                        url = hrefs[0].get_attribute("href")
                        LOGGER.info("url:%s" % (url, ))

                        if not self.isPublished(url):
                            abstract = ""  # this layout exposes no abstract
                            # publish the url msg to the queue
                            msg = self.formatMsg(url, self.tag, self.sub_tag,
                                                 title, abstract)
                            self.publishMsg(msg)
                        else:
                            # remaining urls were probably already published
                            republishedCount += 1
                            if republishedCount >= republishThreshold:
                                stop_flag = False
                                break
                        titles.append(title)

                # load the next page
                next_page = driver.find_element_by_class_name(
                    "pages").find_elements_by_tag_name("a")[-1]
                next_page.click()
                driver.implicitly_wait(5)  # lookup timeout, not a pause
                i += 1

        except Exception:
            LOGGER.error(traceback.format_exc())
Code example #9
    def extract_links(self):
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extracting from %s" % (self.url, ))
            driver.get(self.url)

            # scroll to the bottom and back to the top so lazy content loads
            driver.execute_script("document.documentElement.scrollTop=10000")
            driver.execute_script("document.documentElement.scrollTop=0")

            titles = []  # titles already handled in this run
            i = 0  # page count
            stop_flag = True  # cleared once enough duplicates are seen
            republishThreshold = 5  # stop after this many already-published articles
            republishedCount = 0

            while i < 3 and stop_flag:
                # find the article title entries on the i-th list block
                link_list = driver.find_elements_by_css_selector(
                    "ul[class=\"pictxt block\"]")[i].find_elements_by_tag_name("li")

                for elem in link_list:
                    article = elem.find_element_by_class_name("tit")
                    title = article.text  # article title
                    if title in titles:
                        continue
                    LOGGER.debug("article title %s" % (title, ))
                    url = article.find_element_by_tag_name("a").get_attribute("href")
                    LOGGER.info("url:%s" % (url, ))

                    if not self.isPublished(url):
                        abstract = elem.find_element_by_class_name("txt").text
                        # publish the url msg to the queue
                        msg = self.formatMsg(url, self.tag, self.sub_tag,
                                             title, abstract)
                        self.publishMsg(msg)
                    else:
                        # remaining urls were probably already published
                        republishedCount += 1
                        if republishedCount >= republishThreshold:
                            stop_flag = False
                            break
                    titles.append(title)

                # load the next page
                next_page = driver.find_element_by_class_name(
                    "HomeMore").find_element_by_tag_name("a")
                next_page.click()
                driver.implicitly_wait(5)  # lookup timeout, not a pause
                i += 1

        except Exception:
            LOGGER.error(traceback.format_exc())
Code example #10
class NetEaseExtractorPlay(BaseExtractor):
    def __init__(self, config):
        super(NetEaseExtractorPlay, self).__init__(config)

    def extract_links(self):
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extracting from %s" % (self.url, ))
            try:
                driver.get(self.url)
            except Exception:
                pass  # tolerate a page-load timeout; the DOM may still be usable

            # scroll to the bottom and back to the top so lazy content loads
            driver.execute_script("document.documentElement.scrollTop=10000")
            driver.execute_script("document.documentElement.scrollTop=0")

            titles = []  # titles already handled in this run
            i = 0  # page count
            stop_flag = True  # cleared once enough duplicates are seen
            republishThreshold = 5  # stop after this many already-published articles
            republishedCount = 0

            while i < 10 and stop_flag:
                # find the article title sections
                link_list = driver.find_elements_by_css_selector(
                    "div[class=\"m-collist clearfix\"]")

                for elem in link_list:
                    article = elem.find_element_by_tag_name("dt")
                    title = article.text  # article title
                    if title in titles:
                        continue
                    LOGGER.debug("article title %s" % (title, ))
                    url = elem.find_element_by_tag_name("a").get_attribute("href")
                    LOGGER.info("url:%s" % (url, ))

                    if not self.isPublished(url):
                        abstract = elem.find_element_by_tag_name("dd").text
                        # publish the url msg to the queue
                        msg = self.formatMsg(url, self.tag, self.sub_tag,
                                             title, abstract)
                        self.publishMsg(msg)
                    else:
                        # remaining urls were probably already published
                        republishedCount += 1
                        if republishedCount >= republishThreshold:
                            stop_flag = False
                            break
                    titles.append(title)

                # load more results
                next_page = driver.find_element_by_class_name("m-collist-more")
                next_page.click()
                driver.implicitly_wait(5)  # lookup timeout, not a pause
                i += 1

        except Exception:
            LOGGER.error(traceback.format_exc())
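
None of the extract_links() variants ever quits the driver, so every run leaves a PhantomJS process behind. A sketch of the try/finally shape that would plug the leak in all of them:

def extract_links(self):
    driver = None
    try:
        driver = webdriver.PhantomJS(PHANTOMJS_PATH)
        driver.get(self.url)
        # ... extraction as in the examples above ...
    except Exception:
        LOGGER.error(traceback.format_exc())
    finally:
        if driver is not None:
            driver.quit()  # terminate the browser process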
Code example #11
    def extract_links(self):
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extracting from %s" % (self.url, ))
            driver.get(self.url)

            # scroll to the bottom and back to the top so lazy content loads
            driver.execute_script("document.documentElement.scrollTop=10000")
            driver.execute_script("document.documentElement.scrollTop=0")

            titles = []  # titles already handled in this run
            i = 0  # page count
            stop_flag = True  # cleared once enough duplicates are seen
            republishThreshold = 5  # stop after this many already-published articles
            republishedCount = 0

            while i < 10 and stop_flag:
                # find the article title entries
                contents = driver.find_elements_by_class_name("list_txt")
                for content in contents:
                    link_list = content.find_elements_by_tag_name("li")

                    for elem in link_list:
                        hrefs = elem.find_elements_by_tag_name("a")
                        title = hrefs[1].text  # article title
                        if title in titles:
                            continue
                        LOGGER.debug("article title %s" % (title, ))
                        url = hrefs[1].get_attribute("href")
                        LOGGER.info("url:%s" % (url, ))

                        if not self.isPublished(url):
                            abstract = ""  # this layout exposes no abstract
                            # publish the url msg to the queue
                            msg = self.formatMsg(url, self.tag, self.sub_tag,
                                                 title, abstract)
                            self.publishMsg(msg)
                        else:
                            # remaining urls were probably already published
                            republishedCount += 1
                            if republishedCount >= republishThreshold:
                                stop_flag = False
                                break
                        titles.append(title)

                # load the next page
                next_page = driver.find_elements_by_class_name("bar_pages_flip")[1]
                next_page.click()
                driver.implicitly_wait(5)  # lookup timeout, not a pause
                i += 1

        except Exception:
            LOGGER.error(traceback.format_exc())
Code example #12
          "extractor": HuanQiuExtractor,
          "tag":"huanqiu",
          "sub_tag":"world",
          "period": 10
          },   
     ]


class Extractor(Daemon):

    def __init__(self, pidfile, stdin=os.devnull, stdout=os.devnull, stderr=os.devnull):
        super(Extractor, self).__init__(pidfile, stdin, stdout, stderr)

    def run(self):
        """
        Run every extractor listed in the config dicts.
        """
        try:
            LOGGER.debug("start the extractor")
            for elem in extractor_source_url_config:
                extractor = elem["extractor"](elem)
                extractor.extract_links()
        except Exception:
            LOGGER.error(traceback.format_exc())
        finally:
            LOGGER.info("finished extractor")


if __name__ == '__main__':
#     daemon_main(Extractor, './ ', sys.argv)
    extractor = Extractor("./")
    extractor.run()
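
Extractor and FailedExtractor both subclass a Daemon base that is not included. Its (pidfile, stdin, stdout, stderr) signature matches the classic double-fork Unix daemon recipe; a heavily trimmed, hypothetical sketch of that interface:

import atexit
import os
import sys

class Daemon(object):
    """Hypothetical double-fork daemon base; the real one is not shown."""

    def __init__(self, pidfile, stdin=os.devnull, stdout=os.devnull, stderr=os.devnull):
        self.pidfile = pidfile
        self.stdin, self.stdout, self.stderr = stdin, stdout, stderr

    def daemonize(self):
        if os.fork() > 0:
            sys.exit(0)  # first parent exits
        os.setsid()
        if os.fork() > 0:
            sys.exit(0)  # second parent exits; the grandchild is the daemon
        with open(self.pidfile, "w") as f:
            f.write(str(os.getpid()))
        atexit.register(os.remove, self.pidfile)

    def start(self):
        self.daemonize()
        self.run()

    def run(self):
        raise NotImplementedError  # subclasses such as Extractor override this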
Code example #13
    def process(self, msg):
        now = int(time.time())
        msg["id"] = now  # timestamp-based message id

        if self.handlerRepository.process(self.message_key, msg):
            LOGGER.info('[Publish %d] received message: %s %s' %
                        (os.getpid(), self.message_key, msg))
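
A timestamp makes a poor message id: two messages published in the same tick collide. If the ids must be unique, a uuid is the usual replacement; a minimal sketch:

import uuid

# Hypothetical replacement for the timestamp id assigned above.
msg["id"] = uuid.uuid4().hex  # 32 random hex chars, collision-safe in practice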
Code example #14
    def extract_links(self):
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extracting from %s" % (self.url, ))
            driver.get(self.url)

            # debug dump of the rendered page source
            with open("html", "wb") as f:
                f.write(driver.page_source.encode("utf-8"))

            titles = []  # titles already handled in this run
            republishThreshold = 5  # stop after this many already-published articles
            republishedCount = 0

            # scroll to the bottom and back to the top so lazy content loads
            driver.execute_script("document.documentElement.scrollTop=8000")
            driver.implicitly_wait(0)
            driver.execute_script("document.documentElement.scrollTop=0")
            driver.implicitly_wait(0)

            link_list = driver.find_element_by_class_name(
                "d_list_txt").find_elements_by_tag_name("li")
            LOGGER.debug("found %d items" % (len(link_list), ))

            for elem in link_list:
                title = elem.text  # article title
                if title in titles:
                    continue
                LOGGER.debug("article title %s" % (title, ))
                url = elem.find_element_by_class_name(
                    "c_tit").find_element_by_tag_name("a").get_attribute("href")
                LOGGER.info("url:%s" % (url, ))

                if not self.isPublished(url):
                    abstract = ""  # this layout exposes no abstract
                    # publish the url msg to the queue
                    msg = self.formatMsg(url, self.tag, self.sub_tag,
                                         title, abstract)
                    self.publishMsg(msg)
                else:
                    # remaining urls were probably already published
                    republishedCount += 1
                    if republishedCount >= republishThreshold:
                        break
                titles.append(title)

        except Exception:
            LOGGER.error(traceback.format_exc())