    def extract_links(self):
        driver = None
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extractor from %s" % (self.url, ))
            driver.get(self.url)

            #scroll bar set from bottom to top, make the page load all
            js = "var q=document.documentElement.scrollTop=10000"
            driver.execute_script(js)
            js = "var q=document.documentElement.scrollTop=0"
            driver.execute_script(js)

            seen_titles = []  # titles already processed

            stop_flag = True  # set False once the duplicate threshold is hit
            republishedThreshold = 5  # stop after 5 already-published articles
            republishedCount = 0

            # collect the article entries from the waterfall list
            link_list = driver.find_element_by_class_name(
                "fallsFlow").find_elements_by_css_selector(
                    "li[class=\"item masonry-brick\"]")

            for elem in link_list:
                article = elem.find_element_by_tag_name("h3")
                title = article.text  # article title
                if title not in seen_titles:
                    LOGGER.debug("article title %s" % (title, ))

                    url = article.find_element_by_tag_name("a").get_attribute(
                        "href")
                    LOGGER.info("url:%s" % (url))

                    url_is_exists = self.isPublished(url)
                    if url_is_exists is False:

                        abstract = elem.find_element_by_tag_name("h5").text
                        # publish the url message to the message queue
                        msg = self.formatMsg(url, self.tag, self.sub_tag,
                                             title, abstract)

                        self.publishMsg(msg)

                    else:  # the url was already published before
                        republishedCount += 1
                        if republishedCount >= republishedThreshold:
                            stop_flag = False
                            break
                    seen_titles.append(title)
                else:
                    continue

        except Exception:
            LOGGER.error(traceback.format_exc())
        finally:
            if driver is not None:
                driver.quit()  # always release the PhantomJS process
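# --- Hedged sketch (not part of the original code): the two fixed scrollTop
# jumps above trigger at most one round of lazy loading. A more robust
# variant keeps scrolling until the document height stops growing. The
# helper name, pause length, and round cap are assumptions.
import time

def scroll_until_loaded(driver, pause=1.0, max_rounds=10):
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # give the lazy loader time to append new items
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # height stable: nothing more to load
        last_height = new_height
    driver.execute_script("window.scrollTo(0, 0);")  # return to the top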
    def extract_links(self):
        driver = None
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extractor from %s" %(self.url, ))
            driver.get(self.url)
            
            seen_titles = []  # titles already processed

            stop_flag = True  # set False once the duplicate threshold is hit
            republishedThreshold = 5  # stop after 5 already-published articles
            republishedCount = 0
         
            js = "var q=document.documentElement.scrollTop=8000"
            driver.execute_script(js)
            driver.implicitly_wait(0)
  
            js = "var q=document.documentElement.scrollTop=0"
            driver.execute_script(js)
            driver.implicitly_wait(0)
            
            # collect the article rows on the first page
            link_list = driver.find_element_by_class_name("page1").find_elements_by_class_name("row")
            LOGGER.debug("found %d candidate articles" % (len(link_list), ))
    
            for elem in link_list:
                anchor = elem.find_element_by_class_name("list-tt").find_element_by_tag_name("a")
                title = anchor.text  # article title
                if title not in seen_titles:
                    LOGGER.debug("article title %s" % (title, ))
                    url = anchor.get_attribute("href")
                    LOGGER.info("url:%s" % (url, ))
    
                    url_is_exists = self.isPublished(url)
                    if url_is_exists is False:
                        
                        abstract = elem.find_element_by_class_name("f4").text
                        # publish the url message to the message queue
                        msg = self.formatMsg(url, self.tag, self.sub_tag, title, abstract)
                        self.publishMsg(msg)
    
                    else:  # the url was already published before
                        republishedCount += 1
                        if republishedCount >= republishedThreshold:
                            stop_flag = False
                            break
                    seen_titles.append(title)
                        
                else:
                    continue

        except Exception:
            LOGGER.error(traceback.format_exc())
        finally:
            if driver is not None:
                driver.quit()  # always release the PhantomJS process
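# --- Hedged sketch (not part of the original code): implicitly_wait only
# configures how long find_* calls poll for missing elements; it is not a
# pause. An explicit wait is the reliable way to block until the lazily
# loaded container exists. "page1" is the container class used above.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)  # poll up to 10 seconds
container = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "page1")))
rows = container.find_elements_by_class_name("row")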
    def extract_links(self):
        driver = None
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extractor from %s" % (self.url, ))
            driver.get(self.url)

            seen_titles = []  # titles already processed

            stop_flag = True  # set False once the duplicate threshold is hit
            republishedThreshold = 5  # stop after 5 already-published articles
            republishedCount = 0

            js = "var q=document.documentElement.scrollTop=8000"
            driver.execute_script(js)
            driver.implicitly_wait(0)

            js = "var q=document.documentElement.scrollTop=0"
            driver.execute_script(js)
            driver.implicitly_wait(0)

            link_list = driver.find_element_by_class_name(
                "list").find_elements_by_class_name("item")
            LOGGER.debug("found %d candidate articles" % (len(link_list), ))

            for elem in link_list:
                anchor = elem.find_element_by_tag_name(
                    "h1").find_element_by_tag_name("a")
                title = anchor.text  # article title
                if title not in seen_titles:
                    LOGGER.debug("article title %s" % (title, ))
                    url = anchor.get_attribute("href")
                    LOGGER.info("url:%s" % (url, ))

                    url_is_exists = self.isPublished(url)
                    if url_is_exists is False:

                        abstract = elem.find_element_by_tag_name("p").text
                        # publish the url message to the message queue
                        msg = self.formatMsg(url, self.tag, self.sub_tag,
                                             title, abstract)
                        self.publishMsg(msg)

                    else:  # the url was already published before
                        republishedCount += 1
                        if republishedCount >= republishedThreshold:
                            stop_flag = False
                            break
                    seen_titles.append(title)

                else:
                    continue

        except Exception:
            LOGGER.error(traceback.format_exc())
        finally:
            if driver is not None:
                driver.quit()  # always release the PhantomJS process
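# --- Hedged sketch (not part of the original code): the nested
# find_element_by_* chains above can usually be collapsed into one CSS
# selector per item, which avoids repeated round-trips to the driver.
# Assumes the same "list"/"item"/h1 structure as the snippet above.
anchors = driver.find_elements_by_css_selector(".list .item h1 a")
for a in anchors:
    title = a.text                 # article title
    url = a.get_attribute("href")  # article url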
    def run(self):
        """
        Run every extractor listed in extractor_source_url_config.
        """
        try:
            LOGGER.debug("start the extractor")
            for elem in extractor_source_url_config:
                extractor = elem["extractor"](elem)
                extractor.extract_links()
        except Exception:
            LOGGER.error(traceback.format_exc())
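# --- Hedged sketch (not part of the original code): run() builds each
# extractor from its own config entry via elem["extractor"](elem), so every
# entry must carry the class plus whatever fields the extractors read.
# Only the "extractor" key is confirmed above; "url", "tag" and "sub_tag"
# mirror the self.url / self.tag / self.sub_tag attributes used in
# extract_links, and NewsExtractor is a hypothetical class name.
extractor_source_url_config = [
    {
        "extractor": NewsExtractor,       # class implementing extract_links()
        "url": "http://example.com/news",
        "tag": "news",
        "sub_tag": "tech",
    },
]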
    def extract_links(self):
        driver = None
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extractor from %s" % (self.url,))
            driver.get(self.url)

            # scroll to the bottom and back to the top so the page lazy-loads everything
            js = "var q=document.documentElement.scrollTop=10000"
            driver.execute_script(js)
            js = "var q=document.documentElement.scrollTop=0"
            driver.execute_script(js)

            seen_titles = []  # titles already processed

            i = 0  # page count
            stop_flag = True  # set False once the duplicate threshold is hit
            republishedThreshold = 5  # stop after 5 already-published articles
            republishedCount = 0

            while i < 10 and stop_flag:

                # find the article title sections
                contents = driver.find_elements_by_class_name("list_f14d")
                for content in contents:
                    link_list = content.find_elements_by_tag_name("li")

                    for elem in link_list:
                        hrefs = elem.find_elements_by_tag_name("a")
                        title = hrefs[0].text  # article title
                        if title not in seen_titles:
                            LOGGER.debug("article title %s" % (title, ))

                            url = hrefs[0].get_attribute("href")
                            LOGGER.info("url:%s" % (url, ))

                            url_is_exists = self.isPublished(url)
                            if url_is_exists is False:

                                abstract = ""  # no abstract available in this list layout
                                # publish the url message to the message queue
                                msg = self.formatMsg(url, self.tag, self.sub_tag, title, abstract)
                                self.publishMsg(msg)

                            else:  # the url was already published before
                                republishedCount += 1
                                if republishedCount >= republishedThreshold:
                                    stop_flag = False
                                    break
                            seen_titles.append(title)
                        else:
                            continue

                # load the next page (the last anchor in the pager)
                next_page = driver.find_element_by_class_name("pages").find_elements_by_tag_name("a")[-1]
                next_page.click()
                # implicitly_wait only sets the element-lookup timeout; the
                # find_* calls in the next iteration will poll up to 5s
                driver.implicitly_wait(5)
                i += 1

        except Exception:
            LOGGER.error(traceback.format_exc())
        finally:
            if driver is not None:
                driver.quit()  # always release the PhantomJS process
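# --- Hedged sketch (not part of the original code): clicking the pager and
# then relying on implicitly_wait races against the page reload. An explicit
# wait on staleness of the old list blocks until the new page replaces it.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

old_list = driver.find_element_by_class_name("list_f14d")
next_page.click()
WebDriverWait(driver, 10).until(EC.staleness_of(old_list))   # old node detached
contents = driver.find_elements_by_class_name("list_f14d")   # re-query fresh nodes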
    def extract_links(self):
        driver = None
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extractor from %s" %(self.url, ))
            driver.get(self.url)
            
            # scroll to the bottom and back to the top so the page lazy-loads everything
            js = "var q=document.documentElement.scrollTop=10000"
            driver.execute_script(js)
            js = "var q=document.documentElement.scrollTop=0"
            driver.execute_script(js)

            seen_titles = []  # titles already processed

            i = 0  # page count
            stop_flag = True  # set False once the duplicate threshold is hit
            republishedThreshold = 5  # stop after 5 already-published articles
            republishedCount = 0
            
            while i < 3 and stop_flag:

                # find the article titles in the i-th list block
                link_list = driver.find_elements_by_css_selector("ul[class=\"pictxt block\"]")[i].find_elements_by_tag_name("li")
    
                for elem in link_list:
                    article = elem.find_element_by_class_name("tit")
                    title = article.text  # article title
                    if title not in seen_titles:
                        LOGGER.debug("article title %s" % (title, ))
                        url = article.find_element_by_tag_name("a").get_attribute("href")
                        LOGGER.info("url:%s" % (url, ))

                        url_is_exists = self.isPublished(url)
                        if url_is_exists is False:
                            
                            abstract = elem.find_element_by_class_name("txt").text
                            # publish the url message to the message queue
                            msg = self.formatMsg(url, self.tag, self.sub_tag, title, abstract)

                            self.publishMsg(msg)

                        else:  # the url was already published before
                            republishedCount += 1
                            if republishedCount >= republishedThreshold:
                                stop_flag = False
                                break
                        seen_titles.append(title)
                    else:
                        continue

                # click "load more" to append the next block of articles
                next_page = driver.find_element_by_class_name("HomeMore").find_element_by_tag_name("a")
                next_page.click()
                # implicitly_wait only sets the element-lookup timeout; the
                # find_* calls in the next iteration will poll up to 5s
                driver.implicitly_wait(5)
                i += 1

        except Exception:
            LOGGER.error(traceback.format_exc())
        finally:
            if driver is not None:
                driver.quit()  # always release the PhantomJS process
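# --- Hedged sketch (not part of the original code): this variant pages by
# clicking a "HomeMore" load-more anchor, which appends another
# ul.pictxt.block each round. A defensive version that stops cleanly when
# the button disappears on the last page; the helper name is hypothetical.
import time
from selenium.common.exceptions import NoSuchElementException, WebDriverException

def click_load_more(driver, rounds=3, pause=2.0):
    for _ in range(rounds):
        try:
            more = driver.find_element_by_class_name("HomeMore").find_element_by_tag_name("a")
            more.click()
        except (NoSuchElementException, WebDriverException):
            break  # button gone or not clickable: no more pages
        time.sleep(pause)  # let the appended block render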
    def extract_links(self):
        driver = None
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extractor from %s" %(self.url, ))
            driver.get(self.url)
            
            # scroll to the bottom and back to the top so the page lazy-loads everything
            js = "var q=document.documentElement.scrollTop=10000"
            driver.execute_script(js)
            js = "var q=document.documentElement.scrollTop=0"
            driver.execute_script(js)

            seen_titles = []  # titles already processed

            i = 0  # page count
            stop_flag = True  # set False once the duplicate threshold is hit
            republishedThreshold = 5  # stop after 5 already-published articles
            republishedCount = 0

            while i < 10 and stop_flag:

                # find the article title sections
                contents = driver.find_elements_by_class_name("list_txt")
                for content in contents:
                    link_list = content.find_elements_by_tag_name("li")
 
                    for elem in link_list:
                        hrefs = elem.find_elements_by_tag_name("a")
                        title = hrefs[1].text  # the second anchor holds the article title
                        if title not in seen_titles:
                            LOGGER.debug("article title %s" % (title, ))
                            url = hrefs[1].get_attribute("href")
                            LOGGER.info("url:%s" % (url, ))
    
                            url_is_exists = self.isPublished(url)
                            if url_is_exists is False:
                                
                                abstract = ""  # no abstract available in this list layout
                                # publish the url message to the message queue
                                msg = self.formatMsg(url, self.tag, self.sub_tag, title, abstract)
                                self.publishMsg(msg)
    
                            else:  # the url was already published before
                                republishedCount += 1
                                if republishedCount >= republishedThreshold:
                                    stop_flag = False
                                    break
                            seen_titles.append(title)
                        else:
                            continue

                # load the next page (the second pager control is "next")
                next_page = driver.find_elements_by_class_name("bar_pages_flip")[1]
                next_page.click()
                # implicitly_wait only sets the element-lookup timeout; the
                # find_* calls in the next iteration will poll up to 5s
                driver.implicitly_wait(5)
                i += 1

        except Exception:
            LOGGER.error(traceback.format_exc())
        finally:
            if driver is not None:
                driver.quit()  # always release the PhantomJS process
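# --- Hedged sketch (not part of the original code): isPublished is called
# in every snippet but never shown, so its real implementation is unknown.
# Purely to illustrate the contract, a hypothetical Redis-set version with
# O(1) membership checks that survive across extractor runs.
import redis

_redis = redis.StrictRedis(host="localhost", port=6379, db=0)

def is_published(url):
    return bool(_redis.sismember("published_urls", url))

def mark_published(url):
    _redis.sadd("published_urls", url)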
    def extract_links(self):
        driver = None
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            LOGGER.debug("start extractor from %s" % (self.url, ))
            driver.get(self.url)
            # dump the rendered page source for offline debugging
            with open("html", "w") as f:
                f.write(driver.page_source.encode("utf-8"))

            seen_titles = []  # titles already processed

            stop_flag = True  # set False once the duplicate threshold is hit
            republishedThreshold = 5  # stop after 5 already-published articles
            republishedCount = 0

            js = "var q=document.documentElement.scrollTop=8000"
            driver.execute_script(js)
            driver.implicitly_wait(0)

            js = "var q=document.documentElement.scrollTop=0"
            driver.execute_script(js)
            driver.implicitly_wait(0)

            # collect the article entries
            link_list = driver.find_element_by_class_name(
                "d_list_txt").find_elements_by_tag_name("li")
            LOGGER.debug("found %d candidate articles" % (len(link_list), ))

            for elem in link_list:
                title = elem.text  # the full item text serves as the title here
                if title not in seen_titles:
                    LOGGER.debug("article title %s" % (title, ))
                    url = elem.find_element_by_class_name(
                        "c_tit").find_element_by_tag_name("a").get_attribute(
                            "href")
                    LOGGER.info("url:%s" % (url, ))

                    url_is_exists = self.isPublished(url)
                    if url_is_exists is False:

                        abstract = ""  # no abstract available in this list layout
                        # publish the url message to the message queue
                        msg = self.formatMsg(url, self.tag, self.sub_tag,
                                             title, abstract)
                        self.publishMsg(msg)

                    else:  # the url was already published before
                        republishedCount += 1
                        if republishedCount >= republishedThreshold:
                            stop_flag = False
                            break
                    seen_titles.append(title)

                else:
                    continue

        except Exception:
            LOGGER.error(traceback.format_exc())
        finally:
            if driver is not None:
                driver.quit()  # always release the PhantomJS process
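# --- Hedged sketch (not part of the original code): every variant above
# repeats the same skeleton (start PhantomJS, scroll, select items, dedup,
# publish, quit) and differs only in its selectors. One way to factor that
# out is a base class with a single overridable hook; the class and method
# names here are assumptions, not part of the original code.
class BaseExtractor(object):

    REPUBLISHED_THRESHOLD = 5  # stop after this many already-published hits

    def parse_items(self, driver):
        """Override per site: yield (title, url, abstract) tuples."""
        raise NotImplementedError

    def extract_links(self):
        driver = None
        try:
            driver = webdriver.PhantomJS(PHANTOMJS_PATH)
            driver.get(self.url)
            seen, republished = set(), 0
            for title, url, abstract in self.parse_items(driver):
                if title in seen:
                    continue
                seen.add(title)
                if self.isPublished(url):
                    republished += 1
                    if republished >= self.REPUBLISHED_THRESHOLD:
                        break
                else:
                    msg = self.formatMsg(url, self.tag, self.sub_tag, title, abstract)
                    self.publishMsg(msg)
        except Exception:
            LOGGER.error(traceback.format_exc())
        finally:
            if driver is not None:
                driver.quit()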