def download_article_file(articleURL, articleFileDirectory, code):
	articleFilePath = articleFileDirectory + code
				
	# Download the article and save as file
	if (articleURL == ""):
		print "ERROR: Empty URL detected! File not created"
		return None
	else:
		# If a directory for files doesn't exist, create it
		dir = os.path.dirname(articleFileDirectory)

		if not os.path.isdir(dir):
			#print "Created directory: " + dir
			os.makedirs(dir)
		
		try:
			#fullArticle = urllib2.urlopen(articleURL)
			#fullArticleText = fullArticle.read()

			# Use boilerpipe to remove boilerplate and formatting
			extractor = Extractor(extractor='ArticleExtractor', url=articleURL)
			fullArticleText = extractor.getText()

			# Test to see if article is in English. If not, then return None
			top_language = cld.detect(fullArticleText.encode('utf-8'))[0]
			if (top_language != 'ENGLISH'):
				print "SKIPPED: Article is in " + top_language
				return None

			outfile = open(articleFilePath, 'w+')
			outfile.write(fullArticleText.encode('ascii', 'ignore'))
			outfile.close()

			# Use lxml's HTML cleaner to remove markup
			#htmltree = lxml.html.fromstring(fullArticleText)		
			#cleaner = lxml.html.clean.Cleaner(remove_unknown_tags=True)
			#cleaned_tree = cleaner.clean_html(htmltree)
			#return cleaned_tree.text_content()
			return fullArticleText
	

		except urllib2.HTTPError:
			print "ERROR: HTTPError. Article file download skipped: " + articleURL	
			return None

		except urllib2.URLError:
			print "ERROR: URLError. Article file download skipped: " + articleURL	
			return None

		except LookupError:
			print "ERROR: LookupError. Article file download skipped: " + articleURL	
			return None
		
		except UnicodeDecodeError:
			print "ERROR: UnicodeDecodeError. Article file download skipped: " + articleURL
			return None

		except:
			print "ERROR: ", sys.exc_info()[0]
			return None
Example #2
 def process_item(self, html_page):
     try:
         publish_date = examine(html_page['html'])
         from boilerpipe.extract import Extractor
         extractor = Extractor(extractor='ArticleExtractor',
                               html=html_page['html'])
         body = str(extractor.getText())
         title = str(extractor.source.getTitle())
         art = {
             'title': title,
             'body': body,
             'lang': self.lang,
             'source': html_page['source'],
             'url': html_page['url'],
             'crawl_date': html_page['timestamp'],
             'publish_date': publish_date,
             'article_id': sha1(html_page['url'].encode('utf-8')).hexdigest(),
             'sentences': []
         }
         if self.art_ok(art['body']):
             content = art['body']
             content = content.replace(u'\xa0', u' ')
             content = content.replace('\\n', '\n')
             sents = []
             if self.lang == 'en':
                 sents = sent_tokenize(content)
             else:
                 for para in content.split('\n'):
                     sents += sentence_split(para, self.lang)
                 sents = [sent for sent in sents if self.check_sent(sent)]
             art['sentences'] = sents
             if len(sents) >= 3:
                 self.output_corpus.add_instance(art)
     except Exception as e:
         pass
def get_text(url):
    from boilerpipe.extract import Extractor
    try :
        extractor = Extractor(extractor='DefaultExtractor', url=url)
        return extractor.getText(), extractor.getHTML()
    except:
        return "",""
Example #4
def scrape(file, split1, split2, urlName):
    links_from_RSS_feed = []
    Requests_from_RSS = requests.get(
        'http://feeds.reuters.com/reuters/businessNews')
    Rss_soup = BeautifulSoup(Requests_from_RSS.text, "html5lib")

    lFile = open(file, "r")
    usedLinks = [line.strip() for line in lFile]
    lFile.close()

    for link in Rss_soup.find_all('guid'):
        links_from_RSS_feed.append(
            str(link.getText().replace('?feedType=RSS&feedName=businessNews',
                                       '')))

    l_file = open(file, "w")
    for item in links_from_RSS_feed:
        l_file.write(str(item) + "\n")
    l_file.close()

    no_of_links = len(links_from_RSS_feed)

    for i in range(0, no_of_links):
        fileName = links_from_RSS_feed[i].rsplit('/', split1)[split2]
        extractedText = Extractor(extractor='ArticleExtractor',
                                  url=urlName + fileName)
        print(fileName)
        write_file = open("Data/" + str(i) + ".txt", "w")
        write_file.write(str(datetime.date.today()) + "\n")
        write_file.write(str(extractedText.getText().encode("utf-8")))
        write_file.close()
    return no_of_links
def extract_blog_posts(url_string, PAGES = 48):
    blog_posts = []
    page_count = 0
    
    while(page_count<=PAGES):
        page_count+=1
        url = url_string.format(page_count) # create url
        driver.get(url)
        
        try:        
            article = driver.find_elements_by_tag_name('article')        
            articles_size = len(article)
            print 'processing ', url
        except SocketError as e:
            if e.errno != errno.ECONNRESET:
                raise # Not error we are looking for
            continue
            
        print 'extracting ...'
        for i in xrange(articles_size):
            headers = article[i].find_elements_by_tag_name("header")
            for header in headers:
                article_a = header.find_elements_by_xpath("//h1/a[@title]")
                for e in article_a:
                    extractor = Extractor(extractor = 'ArticleExtractor', url = e.get_attribute('href'))
                    texts = extractor.getText()

                    blog_posts.append({'title': e.text, 'content': clean_html(texts), 'link': e.get_attribute('href')})

    return blog_posts
Example #6
 def scrap_link_boilerpipe(url):
     try:
         extractor = Extractor(extractor='ArticleSentencesExtractor',
                               url=url)
         return extractor.getText()
     except:
         return False
Example #7
def articles_from_feed():
    articles = []

    feed = feedparser.parse(rss_fakt)
    for item in feed["items"]:
        url = convert_url(item["link"])
        print item["published"]
        print url
        try:
            extractor = Extractor(extractor="ArticleExtractor", url=url)

            date = email.utils.parsedate_tz(item["published"])
            timestamp = email.utils.mktime_tz(date)
            iso = datetime.datetime.utcfromtimestamp(timestamp).isoformat()
            filename = url.split(",")[-1].split(".")[0]

            data = {
                "text": extractor.getText(),
                "date": iso,
                "url": url,
                "filename": filename
            }
        except Exception as e:
            print "Error downloading article from " + url
            continue
        articles.append(data)
    return articles
Example #8
def scrape(feed, used, excep, split1, split2, urlName, nameF):
    arrLinks = []
    req = requests.get('http://feeds.reuters.com/reuters/businessNews')
    soupRss = BeautifulSoup(req.text, "html5lib")
    # Checks list of already queried links
    logrFile = open(used, "r")
    usedLinks = [line.strip() for line in logrFile]
    logrFile.close()
    # Extracts links from inital feed, excluding non-news
    for link in soupRss.find_all('guid'):
        arrLinks.append(
            str(link.getText().replace('?feedType=RSS&feedName=businessNews',
                                       '')))
    # Store currently extracted links as not to repeat
    log_file = open(used, "w")
    for item in arrLinks:
        log_file.write(str(item) + "\n")
    log_file.close()
    # Extracts stripped news content with timestamp, omitting used links
    for item in arrLinks:
        fileName = str(item.rsplit('/', split1)[split2])
        if any(fileName in s for s in usedLinks):
            print fileName + " has been extracted."
        else:
            extractedText = Extractor(extractor='ArticleExtractor',
                                      url=urlName + fileName)
            print fileName + ": New"
            write_file = open("extractedFiles/" + nameF + fileName + ".txt",
                              "w")
            write_file.write(str(datetime.date.today()) + "\n")
            write_file.write(str(extractedText.getText().encode("utf-8")))
            write_file.close()
Example #9
def extract(args):
    if not os.path.isfile("articles.json"):
        print "File articles.json does not exist"
        print "Have you already crawled?"
        exit()

    with open("articles.json") as article_list:
        articles = [
            json.loads(line) for line in article_list.read().splitlines()
        ]

    for article in articles:
        if args.html:
            with open(article['path'], "rb") as html:
                extractor = Extractor(extractor='ArticleExtractor',
                                      html=html.read())
        else:
            extractor = Extractor(extractor='ArticleExtractor',
                                  url=article['url'])

        dirname = os.path.join("articles", article['domain']) + "/text"
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        filename = sha1(article['url']).hexdigest() + '.txt'
        path = os.path.join(dirname, filename)

        with open(path, "wb+") as extracted_text:
            extracted_text.write(extractor.getText().encode("utf-8"))
def extract_article(url):
  r = requests.get(url)
  
  # if the url exists, continue
  if r.status_code == 200:
    
    # extract and parse response url
    url = parse_url(r.url)

    # extract html
    html = r.content.decode('utf-8', errors='ignore')

    # run boilerpipe
    BP = Extractor(html=html)

    # run readability
    Rdb = Document(html)

    html = Rdb.summary()
    # return article data
    return {
      'extracted_title': Rdb.short_title().strip(),
      'extracted_content': strip_tags(BP.getText()),
    }

  # otherwise return an empty dict
  else:
    return {}
    def extract_body_with_boilerpipe(html):
        """
        Extractor types:
                DefaultExtractor
                ArticleExtractor
                ArticleSentencesExtractor
                KeepEverythingExtractor
                KeepEverythingWithMinKWordsExtractor
                LargestContentExtractor
                NumWordsRulesExtractor
                CanolaExtractor
        Reference: https://github.com/misja/python-boilerpipe
        Note: set JAVA_HOME if import fails

        Returns
        --------
        str: extracted body text. Return empty string if extraction fails
        """
        try:
            extractor = Extractor(extractor='KeepEverythingExtractor',
                                  html=html)
            extracted_text = extractor.getText()
        except:
            print "Failed to extract text with boilerpipe"
            extracted_text = ""

        return extracted_text
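A minimal sketch of how the extractor choice documented above could be compared on a single page; it assumes python-boilerpipe is installed and that 'page.html' (an illustrative path) holds a fetched document.

from boilerpipe.extract import Extractor

html = open('page.html').read()  # any fetched HTML document; path is a placeholder

# Compare how much text each strategy keeps; ArticleExtractor is the usual
# choice for news pages, KeepEverythingExtractor maximizes recall.
for name in ('DefaultExtractor', 'ArticleExtractor', 'KeepEverythingExtractor'):
    text = Extractor(extractor=name, html=html).getText()
    print("%s -> %d characters" % (name, len(text)))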
Example #12
def extract_and_save(url, path):
	try:
		handle = urllib2.urlopen(url)
		html_content = handle.read()
		extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
		text = extractor.getText()
		if text:
			if detect_english(text):
				links = get_all_urls(html_content, url)
				for link in links:
					try:
						handle = urllib2.urlopen(link)
						html_content = handle.read()
						#extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)		
						#text_content = extractor.getText()
						#if text_content:
						#	if detect_english(text_content):
						encoded_url = encode(link)
						f = open(path + "/" + encoded_url, "w")
						f.write(html_content)
						f.close()
					except:
						print url
						traceback.print_exc()
						return None
	except:
		print url
		traceback.print_exc()
		return None
	def run(self):
		count = 0
		docCount = self.doc_cursor.count()
		for doc in self.doc_cursor:
			url = doc['url']
			if (self.keepText(url)):
				try:
					extractor = Extractor(extractor='ArticleExtractor', url=url)
					extracted_text = extractor.getText()
				
					if (len(extracted_text) > 0):
						title = extractor.getTitle()
						
						if title != None:
							doc['title'] = title
							doc['extracted_text'] = title + " " + extracted_text
						else:
							doc['extracted_text'] = extracted_text
						self.db_collection.save(doc)
						print 'OK -' + url
				except IOError, err:
					print "IOError with url " + url
					print str(err)
				except (LookupError):
					print "LookupError - Maybe not text or weird encoding " + url
				except (UnicodeDecodeError, UnicodeEncodeError):
					print "UnicodeDecodeError or UnicodeEncodeError- " + url
Example #14
def extract_article_content(html, url):
    """
    Disclaimer
    ----------
    Copied from
    https://github.com/turi-code/how-to/blob/master/
            extract_article_content_from_HTML.py
    Description
    ----------
    Extract the primary textual content from an HTML news article.
    In many cases, the HTML source of news articles is littered with
    boilerplate text that you would not want to include when doing text
    analysis on the content the page. Even if you could write some rules to
    extract the content from one page, it's unlikely that those rules would
    apply to an article from another site. The boilerpipe module allows us to
    solve this problem more generally.
    Parameters
    ----------
    html : str
        The source HTML from which to extract the content.
    url : str
        The url, needed for logging purposes only
    Returns
    -------
    out : str
        The primary content of the page with all HTML and boilerplate text
        removed.
    Examples
    --------
    >>> extract_article_content(
            "<html><body><p>Turi is in the business of building the best " \
            "machine learning platform on the planet. Our goal is to make " \
            "it easy for data scientists to build intelligent, predictive " \
            "applications quickly and at scale. Given the perplexing array " \
            "of tools in this space, we often get asked "Why Turi? What " \
            "differentiates it from tools X, Y, and Z?" This blog post aims " \
            "to provide some answers. I’ll go into some technical details " \
            "about the challenges of building a predictive application, and " \
            "how Turi’s ML platform can help.</p></body></html>")
    >>> Turi is in the business of building the best " \
            "machine learning platform on the planet. Our goal is to make " \
            "it easy for data scientists to build intelligent, predictive " \
            "applications quickly and at scale. Given the perplexing array " \
            "of tools in this space, we often get asked "Why Turi? What " \
            "differentiates it from tools X, Y, and Z?" This blog post aims " \
            "to provide some answers. I’ll go into some technical details " \
            "about the challenges of building a predictive application, and " \
            "how Turi’s ML platform can help.
    See Also
    --------
    - `Boilerpipe project <https://code.google.com/p/boilerpipe/>`_
    - `Boilerpipe Python module <https://pypi.python.org/pypi/boilerpipe>`_
    """
    from boilerpipe.extract import Extractor
    if html and html.strip():
        try:
            extractor = Extractor(extractor='ArticleExtractor', html=html)
            return extractor.getText()
        except Exception as e:
            error = "Function extract_article_content: " + url + " - " + str(e)
 def process_text(self, text):
     if text == "":
         return text
     extractor = Extractor(extractor='ArticleExtractor',
                           html=text)
     new_val = extractor.getText()
     return new_val
Example #16
def scrape(feed, used, excep, split1, split2, urlName, nameF):
	arrLinks = []
	req = requests.get('http://feeds.reuters.com/reuters/businessNews')
	soupRss = BeautifulSoup(req.text, "html5lib")

	logrFile = open(used,"r")
	usedLinks = [line.strip() for line in logrFile]
	logrFile.close()

	for link in soupRss.find_all('guid'):
		arrLinks.append(str(link.getText().replace('?feedType=RSS&feedName=businessNews', '')))

	log_file = open(used,"w")
	for item in arrLinks:
		log_file.write(str(item)+"\n")
	log_file.close()

	for i in range(0, 8):
		fileName = arrLinks[i].rsplit('/', split1)[split2]
		#if any(fileName in s for s in usedLinks):
		#	print fileName +" has been extracted."
		#else:
		extractedText = Extractor(extractor='ArticleExtractor', url=urlName+fileName)
		print fileName
		write_file = open("Data/"+str(i)+".txt","w")
		write_file.write(str(datetime.date.today()) + "\n")
		write_file.write(str(extractedText.getText().encode("utf-8")))
		write_file.close()
Example #17
def get_text_boilerpipe(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        return extractor.getText()
    except:
        print "Exception"
        return None
	def parse_page(self, response):
		if response.meta.has_key('crawldepth'):
			depth = response.meta['crawldepth']
		else:
		#       Set search depth here
			depth = 1
		log.msg('Depth = %s' % str(depth), level=log.INFO)
		if not isinstance(response, HtmlResponse):
			log.msg('Not an HTML file: %s' % response.url, level=log.WARNING)
			return

		log.msg('Response from: %s' % response.url, level=log.INFO)
		url_bf.add(response.url)
	
		# TODO: Extract page title
	
		extractor = Extractor(extractor='ArticleExtractor', html=response.body_as_unicode())
		cleaned_text = extractor.getText()

		# Eliminate duplicates
		keywordset = set(keywordlist)

		found_list = []
		for keyword in keywordset: # TODO: Is there a more efficient way to do this?
			# Look at word boundaries to match entire words only
			if (re.search(r'\b' + re.escape(keyword) + r'\b', cleaned_text)):
				found_list.append(keyword)

		# Parse this page		
		item = BiffleItem()
		if (len(found_list) > 0):
			item['url'] = response.url
			item['body'] = cleaned_text
			item['keywords'] = ', '.join(found_list)
			item['process_date'] = datetime.today()
			log.msg("Keyword(s) found: %s" % ', '.join(found_list), level=log.INFO)
			self.map_keyword_count(found_list)
			yield item

		if (depth > 0):	
			# Find the next requests and yield those
			hxs = HtmlXPathSelector(response)
			links = hxs.select('//a/@href').extract()
			log.msg('Links on page: %s' % len(links), level=log.INFO)
			depth -= 1
			log.msg('Depth has been decremented, new value = %s' % str(depth), level=log.INFO)
			for l in links:
				l = urlparse.urljoin(response.url, l)
				if (l in url_bf):
					pass
					#log.msg('Duplicate URL found: %s' % l, level=log.INFO)
				else:
					url_bf.add(l)
					#log.msg('Found link: %s | From URL: %s' % (l, response.url), level=log.INFO)
					# Decrement depth for next layer of links
					#callback = lambda response, depth = depth: self.parse_page(response, depth)			
					callback = lambda response: self.parse_page(response)
					request = Request(l, callback=callback)
					request.meta['crawldepth'] = depth
					yield request
def extract_article_text(url):
    if url in utils.BROKEN_URLS or any([True for sd in BAD_SUBDOMAINS if sd in url]):
        return ""

    while True:
        try:
            extractor = Extractor(extractor='ArticleExtractor', url=url)
            break
        except socket.timeout:
            print("got socket.timeout on url: {}. retrying...".format(url), file=utils.stddbg)
        except URLError as e:
            if e.reason == "timed out":
                print("got urllib 'timed out' on url {}. retrying...".format(url), file=utils.stddbg)
            elif hasattr(e.reason, "strerror") and e.reason.strerror == 'getaddrinfo failed':
                print("got urllib 'getaddrinfo failed' on url {}. retrying...".format(url), file=utils.stddbg)
            elif e.code == 503:
                print("got urllib 503 error on url {}. retrying...".format(url), file=utils.stddbg)
            else:
                if not hasattr(e, "url"):
                    e.url = url
                raise
        except Exception as e:
            e.url = url
            raise e

    text = str(unicodedata.normalize('NFKD', (str(extractor.getText()))).encode('ascii', 'ignore'))
    return filter_junk(text)
Example #20
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("raw_dir_path")
    parser.add_argument("out_file_path")
    args = parser.parse_args()

    f_names = [(int(f), f) for f in listdir(args.raw_dir_path)]
    f_names = sorted(f_names)
    fout = open(args.out_file_path, 'w')

    for int_f_name, f_name in f_names:
        trec_reader = TrecReader(join(args.raw_dir_path, f_name))
        empty_cnt = 0
        err_cnt = 0

        for docno, html_text in trec_reader:
            if not html_text:
                empty_cnt += 1
            try:
                extractor = Extractor(extractor='ArticleExtractor', html=html_text)
                text = extractor.getText()
                text = text.replace('\n', ' ').replace('\t', ' ')
                text = text.encode('ascii', 'ignore')
                text = text_clean(text)
                if text:
                    fout.write(docno + '\t' + text + '\n')
                else:
                    empty_cnt += 1
            except Exception as e:
                err_cnt += 1

    fout.close()
    print empty_cnt, err_cnt
Example #21
def extractor(URL):

    extractor = Extractor(extractor='ArticleExtractor', url=URL)

    data = extractor.getText()

    file = open("data.txt", "w")
    file.write(data.encode('UTF-8'))
    file.close()

    # Split the content into sentences
    with open('data.txt', 'r') as f:
        s = f.read()
        sentences = s.split('.')

    # Empty list of words
    w=[]

    # Split the sentences into words
    for sentence in sentences :
        w.extend(sentence.split(' '))

    print w

    # Return the list of words
    return w
Example #22
def post_index(post):
    extractor = Extractor(extractor='ArticleExtractor', url=post['href'])
    post_text = extractor.getText().replace('\n', ' ')
    url = 'http://localhost:9200/bookmarks/bookmark/%s/_create' % post['hash']
    data = '{"title":"%s", "url":"%s", "text":"%s"}' % (post['description'], post['href'], post_text.replace('"', '\\"'))
    r = requests.put(url, data=data)
    print r.status_code
Example #23
def Text_extractor(y, page, team, team_i, counter=0):
    """Extract the text of team pages using BoilerPipe."""
    try:
        upage = urllib.parse.quote_plus(page)
        url = "http://" + y + ".igem.org/wiki/index.php?title=" + upage
        extractor = Extractor(extractor='ArticleExtractor', url=url)
    except:
        counter += 1
        if counter > 10:
            print("Failed to get the text for page {}".format(page))
            return None
        return Text_extractor(y, page, team, team_i, counter=counter)
    f = open(
        'results/%s/%s/%s_-_-_CONTENT.html' %
        (y, team, page.replace('/', '#')), 'w')
    f.write(extractor.getHTML())
    f.close()
    f = open(
        'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#')),
        'w')
    f.write(extractor.getText())
    f.close()
    path = 'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#'))
    # text = text.replace('\\n', '\\\\n')
    output = '%s\t%s\t%s\t%s\n' % (y, str(teams_id[team_i]), page, path)
    teams_pages_text_db.write(output)
Example #24
def extract_metadata(url):
    extractor = Extractor(extractor='KeepEverythingExtractor', url=url)
    text = extractor.getText().split("\n")
    author = None
    date = None
    keywords = []
    find_keywords = False
    for line in text:
        #author
        match = re.match("Von\s(\w+\s\w+)(,\s[\s\w]*$|$)", line)
        if match:
            author = match.group(1)
            continue

        #date
        match = re.match("([0-9]{2}\.[0-9]{2}\.[0-9]{4})$", line)
        if match:
            date = match.group(1)
            continue

        #keywords
        if find_keywords:
            match = re.match("Hat\sIhnen\sdieser\sArtikel\sgefallen.*", line)
            if match:
                find_keywords = False
                continue
            else:
                keywords.append(line) 
            
        match = re.match("Schlagwörter zu diesem Artikel:", line)
        if match:
            find_keywords = True
        
    return author, date, keywords
Example #25
def get_articles(url):
    doc = urllib.request.urlopen(url)
    docContent = BeautifulSoup(doc, 'html.parser')
    articles = []
    for element in docContent.find_all('div'):
        try:
            if element.attrs['style'] == 'width:550px':
                article = defaultdict(str)
                article_link = 'http://www.moneycontrol.com' + element.a['href']
                for p in element.find_all('p'):
                    if 'a_10dgry' in p.attrs['class']:
                        article_time = p.contents[0].split('|')[0]
                        article_date = p.contents[0].split('|')[1][:-1]
                        article['link'] = article_link
                        article['time'] = article_time
                        article['date'] = article_date
                        extractor = Extractor(extractor='ArticleExtractor',
                                              url=article_link)
                        article['content'] = extractor.getText()
                        article['title'] = BeautifulSoup(extractor.getHTML(),
                                                         'html.parser').find_all('h1')[0].contents[0]
                        articles.append(article)
                        break
        except:
            logging.debug('div has no width attribute')
    return articles
def detag_html_file(infile, outfile, id):
    from boilerpipe.extract import Extractor

    if not USE_BOILERPLATE:
        return detag_html_file_bs(infile, outfile, id)

    tempfile = "%s.tmp.html" % (infile,) # boilerplate seems to need an html extension
    try:
        copyfile(infile, tempfile)
        extractor = Extractor(extractor='ArticleExtractor', url="file://"+tempfile)
        os.unlink(tempfile)

        extracted_text = extractor.getText()
        extracted_html = extractor.getHTML()

        soup = BeautifulSoup(extracted_html)
        output = codecs.open(outfile, encoding='utf-8', mode='w')
        output.write(u"<DOC>\n<DOCNO>" + unicode(id) + u"</DOCNO>\n<DOCHDR>\n</DOCHDR>\n");
        head = soup.find('head')
        if head:
            title_tag = head.find('title')
            if title_tag and title_tag.string:
                output.write(u"<TITLE>" + title_tag.string.replace('\n', ' ') + u"</TITLE>\n")

        extract_para(soup, output)
        output.write(u"</DOC>\n")
        output.close()
    except Exception, exc:
        try:
            os.unlink(tempfile)
        except:
            pass

        return detag_html_file_bs(infile, outfile, id)
Example #27
 def ParserBoilerEverything(html_object):
     extractor = Extractor(extractor='DefaultExtractor', html=html_object)
     sents = extractor.getText()
     try:
         return sents
     except Exception as e:
         return
Example #28
 def parse_item(self, response):
     response_news = NewsItem()
     response_news['url'] = response.url
     response_news['html'] = Binary(zlib.compress(response.body, 9))
     extractor = Extractor(extractor='ArticleExtractor', html=response.body)
     response_news['content'] = extractor.getText()
     return response_news
def Process(DocIn,OutName):
    out = open(OutName,'w')
    
    logging.info('reading [%s]', DocIn)
    ErrCnt = 0
    EmptyCnt = 0
    for cnt,line in enumerate(open(DocIn)):
        vCol = line.strip().split('\t')
        DocNo = vCol[0]
        RawHtml = ' '.join(vCol[1:])
        RawHtml = DiscardHTMLHeader(RawHtml)
        if "" == RawHtml:
            EmptyCnt += 1
            continue
        try:
            extractor = Extractor(extractor='ArticleExtractor',html=RawHtml)
            text = extractor.getText()
            text = text.replace('\n',' ').replace('\t',' ')
            text = text.encode('ascii','ignore')
            text = TextClean(text)
            if "" != text:
                print >>out, DocNo + '\t' + text
            else:
                EmptyCnt += 1
#             print DocNo + '\t' + text.encode('ascii','ignore')
        
        except Exception as e:
            ErrCnt += 1
            
        if 0 == (cnt % 100):
            logging.info('parsed [%d] doc [%d] Err [%d] Empty', cnt,ErrCnt,EmptyCnt)

    out.close()
    logging.info('finished [%d] doc [%d] Err', cnt,ErrCnt)
Example #31
    def extract_main_text(self):
        if self.res is None:
            return None

        extractor = Extractor(  # extractor='ArticleExtractor',
            url=self.url)
        return [extractor.getText()]
    def parse(self, response):
        hxs = Selector(response)
        
        item = ArticleItem()
        item["title"] = hxs.xpath('//title/text()').extract()
        item["link"] = response.url
        item["source"] = hxs.xpath('//p').extract()
        
        extractor = Extractor(extractor='ArticleExtractor', url=item["link"])
        
        source = extractor.getHTML()
        item["text"] = extractor.getText()
        item["html"] = source
        
        page = html.fromstring(source)
        links = page.xpath("//p//a/@href")

        linkPattern = re.compile("^(?:ftp|http|https):\/\/(?:[\w\.\-\+]+:{0,1}[\w\.\-\+]*@)?(?:[a-z0-9\-\.]+)(?::[0-9]+)?(?:\/|\/(?:[\w#!:\.\?\+=&amp;%@!\-\/\(\)]+)|\?(?:[\w#!:\.\?\+=&amp;%@!\-\/\(\)]+))?$")
        
        for link in links:
            if linkPattern.match(link) and not link in self.crawled_links:
                self.crawled_links.append(link)
                yield Request(link, self.parse)
        

        yield item
Example #33
def GOOGLE_get_data(company):

    google_news_rss_url = "https://news.google.com/news/?q=%s&output=rss" % company
    rss_feed = feedparser.parse(google_news_rss_url)

    content_list = list()

    for entry in rss_feed['entries']:
        title = entry['title']
        link = entry['link']
        try:
            news_page = urllib2.urlopen(link).read()
            extractor = Extractor(extractor='ArticleExtractor', html=news_page)
        except:
            continue
        content = extractor.getText()
        now = datetime.datetime.now()
        content_list.append({"title": title,
                            "article": content,
                            "link": link,
                            "source": "GOOGLE",
                            "target": company,
                            "date": "%04d%02d%02d" % (now.year, now.month, now.day),
                            "hash": hashlib.sha224(title.encode("UTF-8")).hexdigest()})
                            

    DBOperation.save_db(content_list)
Example #34
    def dehydrate(self, bundle):
        """GET Method"""
        
        #print bundle.data['content']
        if bundle.data['content']:
            extractor = Extractor(extractor='ArticleExtractor', html=bundle.data['content'])
            bundle.data['content'] = extractor.getText()

        try:
            article_stats = ArticleStat.objects.filter(article_id=bundle.obj.id)
            bundle.data['stat'] = {
                'reads': sum(map(lambda x: x.reads, article_stats)),
                'likes': sum(map(lambda x: x.likes, article_stats)),
                'dislikes': sum(map(lambda x: x.dislikes, article_stats)),
                'shares': sum(map(lambda x: x.shares, article_stats)),
            }
        except ObjectDoesNotExist:
            bundle.data['stat'] = {
                'reads': 0, 
                'likes': 0, 
                'dislikes': 0,
                'shares': 0,
            }

        # no cookies or no sessionid field in cookies, then just send normal
        # newsfeed to anonymous user
        #if not bundle.request.COOKIES or not bundle.request.COOKIES['sessionid']:
        if not bundle.request.COOKIES or not 'sessionid' in bundle.request.COOKIES:
            return bundle

        try:
            # even if there is a cookie, sessionid field might be not exist,
            # then it is also anonymous user
            s = get_current_session(bundle.request.COOKIES['sessionid'])
            if s is None or 'user_id' not in s:
                return bundle

            # get activity information whether user has already
            # read/liked/shared
            activity = Activities.objects.get(user_id=s['user_id'], \
                article_id=bundle.obj.id)

            # assign information 
            bundle.data['activity'] = {
                'read': activity.like or activity.share,
                'like': activity.like,
                'dislike': activity.dislike,
                'share': activity.share
            }
        except ObjectDoesNotExist:
            # assign False if the news has never been opened
            bundle.data['activity'] = {
                'read': False, 
                'like': False, 
                'dislike': False, 
                'share': False
            }

        return bundle
def get_text_boilerpipe(url):
    try:
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        extracted_text = extractor.getText()
    except BaseException as error:
        extracted_text = 'error: {}'.format(error)
        print('error: {}'.format(error))
    return extracted_text
Example #36
def sentences_from_urls(url: str, extractor_name=EXTRACTORS[0], model=MODELS[0],
                        min_words=0, with_proba=False, return_raw=False):
    extractor = Extractor(extractor=extractor_name)
    model = models[model]
    extracted_text = extractor.getTextBlocks(url=url)
    if len(extracted_text) > 0:
        func = model.predict_proba if with_proba else model.predict
        return func(extracted_text, min_words=min_words, return_raw=return_raw)
def html2text_bp(html):
    text = None
    try:
        extractor = Extractor(extractor=extractor_type, html=html)
        text = extractor.getText()
    except:
        traceback.print_exc()
    return text
Example #38
def extract_text(html_content):
  try:
    extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
    #print extractor.getText()
    return extractor.getText()
  except:
    print "Exception in html extraction"
    return None
Example #39
def extract_html(html_text, parser):
    try:
        extractor = Extractor(extractor=parser, html=html_text)
    except Exception as e:
        return None, None
    title = extractor.source.getTitle()
    body_text = extractor.getText()
    return title, body_text
Example #40
def saveHtml(url, page):
    extractor = Extractor(extractor='ArticleExtractor', html=page)
    processed_plaintext = extractor.getText()
    # print processed_plaintext
    fileName = "./doc/" + (url + ".txt").replace("/", "()")
    f = open(fileName, "w")
    f.write(processed_plaintext)
    f.close()
Example #41
 def ParserBoilerDefault(html_object):
     extractor = Extractor(extractor='DefaultExtractor', html=html_object)
     sents = extractor.getText()
     try:
         sents = list(nlp(sents).sents)
         return sents
     except Exception as e:
         return
Example #42
 def boilerpipe_text(cls,url_in=None,html_in=None,extractor='ArticleExtractor'):
     assert (url_in!=None) != (html_in!=None) # one, not both
     inp=url_in or html_in
     if url_in:
         extractor = Extractor(extractor=extractor, url=inp)
     else:
         extractor = Extractor(extractor=extractor, html=inp)
     return HtmlTextCleaner().spec_text_cleaner(extractor.getText())
def getBoilerPlate(url):
    #url = 'http://cnn.com/2016/07/17/health/south-africa-meerkat-telescope-galaxies/index.html'
    try:
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        extracted_text = extractor.getText().replace('\n', '')
        return '', extracted_text
    except Exception, e:
        return '', ''
Example #44
 def ParserBoilerArticle(html_object):
     extractor = Extractor(extractor='ArticleSentencesExtractor', html=html_object)
     sents = extractor.getText()
     try:
         sents = list(nlp(sents).sents)
         return sents
     except Exception as e:
         return
Example #45
def load_from_html(filename,
                   use_boilerpipe=True,
                   use_nltk=True,
                   use_regex=True,
                   binary=False):
    if binary:
        charset = UnicodeDammit(open(filename, 'rb').read())
        charset = charset.original_encoding
        try:
            content = open(filename, 'r', encoding=charset).read()
        except Exception as e:
            # if has error, return empty results
            logging.warn('encode error: {}, {}'.format(filename, e))
            return {'title': [], 'body': []}
    else:
        content = open(filename, 'r', encoding='utf-8').read()
    start = time.time()
    if not use_regex or not use_boilerpipe:
        bs = BeautifulSoup(content, 'html.parser')
    if use_regex:
        match = re.search(r'<title.*?>(.+?)</title>', content[:5000],
                          re.DOTALL | re.IGNORECASE)
        title = match.group(1) if match else ''
        title = html.unescape(title).strip()
    else:
        if bs.title != None and bs.title.string != None:
            title = bs.title.string.strip()
        else:
            title = ''
    t1 = time.time() - start
    start = time.time()
    if use_boilerpipe:
        extractor = Extractor(extractor='ArticleExtractor',
                              html=content)  # time consuming
        body = extractor.getText()
    else:
        body = bs.select('body')
        if len(body) <= 0:
            body = bs
        else:
            body = body[0]
        # remove all useless label
        [x.extract() for x in body.findAll('script')]
        [x.extract() for x in body.findAll('style')]
        [x.extract() for x in body.findAll('meta')]
        [x.extract() for x in body.findAll('link')]
        body = body.text
    t2 = time.time() - start
    start = time.time()
    result = {
        'title':
        my_word_tokenize(title) if use_nltk else clean_text(title).split(' '),
        'body':
        my_word_tokenize(body) if use_nltk else clean_text(body).split(' '),
    }
    t3 = time.time() - start
    #print('{}\t{}\t{}'.format(t1, t2, t3))
    return result
Example #46
    def parse_item(self, response):
        title = response.css('title::text').extract_first()

        extractor = Extractor(extractor='ArticleExtractor', html=response.body)

        yield Article(title=title,
                      text=extractor.getText(),
                      url=response.url,
                      field=self.name)
Example #47
def extract_article(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        text_string = extractor.getText()
        text_string = htmlParser.unescape(text_string)
    except Exception:
        logger.error('Error extracting article html')
        text_string = ''
    return text_string
Example #48
def get_news_by_url(url):
    print "Come to get_news_by_url"
    article = {}
    try:
        soup = BeautifulSoup(urllib2.urlopen(url))
        "Get the title of News"
        title = ""
        titleElements = soup.findAll(id="disqus_title")
        for ele in titleElements:
            title = ele.getText().encode('utf-8')
        article["title"] = title
        print title

        "Get the posttime of News,Timezone ET"
        postTime = ""
        postTimeElements = soup.findAll(attrs={'class': "datestamp"})
        for ele in postTimeElements:
            timeStamp = float(ele["epoch"])
        postTime = datetime.fromtimestamp(timeStamp / 1000)
        article["post_time"] = postTime

        "Initiate the post date"
        postDay = postTime.date()
        article["post_date"] = postDay

        "Get the author information "
        author = ""
        authorElements = soup.findAll(attrs={'class': "byline"})
        for ele in authorElements:
            author = ele.contents[0].strip().replace("By", "").replace(
                "-", "").replace("and", ",").strip()
        article["author"] = author

        "Get the content of article"
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        content = extractor.getText().encode("utf-8")
        article["content"] = content

        "Initiate the Sources"
        source = "Bloomberg News"
        article["source"] = source

        "Initiate the update_time"
        updateTime = datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S")
        article["update_time"] = updateTime

        "Initiate the embers_id"
        embersId = hashlib.sha1(content).hexdigest()
        article["embers_id"] = embersId

        "settup URL"
        article["url"] = url
    except:
        print "Error: %s" % sys.exc_info()[0]
        article = {}
    finally:
        return article
Example #50
def html_to_text(html):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html)
    except Exception as e:
        logger.exception('\nError extracting text from html. Exception: %s, %s',
                         e.__class__.__name__, e)
        return ''
    text = extractor.getText()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    return text
Example #51
def extract_article(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        text_string = extractor.getText()
        text_string = htmlParser.unescape(text_string)
        text_string = unicodedata.normalize('NFKD', text_string).encode('ascii','ignore')
    except Exception:
        print 'Error extracting article html'
        text_string = ''
    return text_string
def test_boilerpipe():
    your_url = "http://stackoverflow.com/questions/9352259/trouble-importing-boilerpipe-in-python"
    extractor = Extractor(extractor='ArticleExtractor', url=your_url)
    extracted_html = extractor.getHTML()
    extracted_text = extractor.getText()

    print '\nfunction: %s ' % inspect.stack()[0][3]
    print 'extracted  html: %i text: %i' % (len(extracted_html), len(extracted_text))
    print ''
    n.assert_greater(len(extracted_text), min_str_length)
Example #53
	def extract(self, article):
		try:
			extractor = Extractor(extractor='ArticleSentencesExtractor', url=article.url)
		except Exception as e:
			return ''
		article_text = ''
		try:
			article_text = extractor.getText()
		except Exception:
			pass
		return article_text.encode('utf-8')
def main():
  contents = sys.argv[1]
  for url in listdir(contents):
    print url
    with codecs.open(url, "w", encoding="utf-8") as out:
      try:
        html = urlopen(url.replace("{", "/")).read()
        extracted = Extractor(html=html)
        out.write(extracted.getText())
      except HTTPError:
        out.write("")
 def fetch_articles(self):
     greq_gen = (grequests.get(u, headers=self.header,) for u in self.urls)
     responses = grequests.map(greq_gen)
     for i,res in enumerate(responses):
         if res is not None:
             extractor = Extractor(html=res.text)
             self.entries[i]['text'] = extractor.getText()
             if '...' in self.entries[i]['title']:
                 self.entries[i]['title'] = extractor.getTitle()
     
     return True
def boiler():
    from boilerpipe.extract import Extractor
    for i in range(0, 1000):
        input_filename = 'page/' + str(i) + '.txt'
        output_filename = 'boilerpipe/' + str(i) + '.txt'
        input_file = open(input_filename, 'r')
        s = input_file.read()
        input_file.close()
        extractor = Extractor(extractor='ArticleExtractor', html=s.decode('GBK', 'ignore'))
        output_file = open(output_filename, 'wb')
        output_file.write(extractor.getText().encode('utf-8'))
        output_file.close()
Example #57
    def fetch_info(self):
        """
        boilerpipe's main-text extraction rarely picks up extra content, but it quite often returns less than the full body text.
        """
        urls = self.get_urls()
        got_infos = []
        for url in urls:
            extractor = Extractor(extractor='ArticleExtractor', url=url)
            text = extractor.getText()
            content = requests.get(url).content
            got_infos.append([url, text, content])

        return got_infos
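Since the docstring above warns that ArticleExtractor can return less than the actual body, one possible mitigation (an assumption, not part of the original class) is to fall back to the greedier KeepEverythingExtractor when the result looks too short.

from boilerpipe.extract import Extractor

def extract_with_fallback(url, min_chars=500):
    # Try the focused extractor first; fall back to the greedy one when the
    # result looks suspiciously short (the threshold is an assumption).
    text = Extractor(extractor='ArticleExtractor', url=url).getText()
    if len(text) < min_chars:
        text = Extractor(extractor='KeepEverythingExtractor', url=url).getText()
    return text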