def download_article_file(articleURL, articleFileDirectory, code):
    articleFilePath = articleFileDirectory + code

    # Download the article and save as file
    if (articleURL == ""):
        print "ERROR: Empty URL detected! File not created"
        return None
    else:
        # If a directory for files doesn't exist, create it
        dir = os.path.dirname(articleFileDirectory)
        if not os.path.isdir(dir):
            #print "Created directory: " + dir
            os.makedirs(dir)

        try:
            #fullArticle = urllib2.urlopen(articleURL)
            #fullArticleText = fullArticle.read()

            # Use boilerpipe to remove boilerplate and formatting
            extractor = Extractor(extractor='ArticleExtractor', url=articleURL)
            fullArticleText = extractor.getText()

            # Test to see if article is in English. If not, then return None
            top_language = cld.detect(fullArticleText.encode('utf-8'))[0]
            if (top_language != 'ENGLISH'):
                print "SKIPPED: Article is in " + top_language
                return None

            outfile = open(articleFilePath, 'w+')
            outfile.write(fullArticleText.encode('ascii', 'ignore'))
            outfile.close()

            # Use lxml's HTML cleaner to remove markup
            #htmltree = lxml.html.fromstring(fullArticleText)
            #cleaner = lxml.html.clean.Cleaner(remove_unknown_tags=True)
            #cleaned_tree = cleaner.clean_html(htmltree)
            #return cleaned_tree.text_content()
            return fullArticleText

        except urllib2.HTTPError:
            print "ERROR: HTTPError. Article file download skipped: " + articleURL
            return None
        except urllib2.URLError:
            print "ERROR: URLError. Article file download skipped: " + articleURL
            return None
        except LookupError:
            print "ERROR: LookupError. Article file download skipped: " + articleURL
            return None
        except UnicodeDecodeError:
            print "ERROR: UnicodeDecodeError. Article file download skipped: " + articleURL
            return None
        except:
            print "ERROR: ", sys.exc_info()[0]
            return None
def process_item(self, html_page):
    try:
        publish_date = examine(html_page['html'])

        from boilerpipe.extract import Extractor
        extractor = Extractor(extractor='ArticleExtractor', html=html_page['html'])
        body = str(extractor.getText())
        title = str(extractor.source.getTitle())

        art = {
            'title': title,
            'body': body,
            'lang': self.lang,
            'source': html_page['source'],
            'url': html_page['url'],
            'crawl_date': html_page['timestamp'],
            'publish_date': publish_date,
            'article_id': sha1(html_page['url'].encode('utf-8')).hexdigest(),
            'sentences': []
        }

        if self.art_ok(art['body']):
            content = art['body']
            content = content.replace(u'\xa0', u' ')
            content = content.replace('\\n', '\n')

            sents = []
            if self.lang == 'en':
                sents = sent_tokenize(content)
            else:
                for para in content.split('\n'):
                    sents += sentence_split(para, self.lang)

            sents = [sent for sent in sents if self.check_sent(sent)]
            art['sentences'] = sents

            if len(sents) >= 3:
                self.output_corpus.add_instance(art)
    except Exception as e:
        pass
def get_text(url):
    from boilerpipe.extract import Extractor
    try:
        extractor = Extractor(extractor='DefaultExtractor', url=url)
        return extractor.getText(), extractor.getHTML()
    except:
        return "", ""
def scrape(file, split1, split2, urlName):
    links_from_RSS_feed = []
    Requests_from_RSS = requests.get(
        'http://feeds.reuters.com/reuters/businessNews')
    Rss_soup = BeautifulSoup(Requests_from_RSS.text, "html5lib")

    lFile = open(file, "r")
    usedLinks = [line.strip() for line in lFile]
    lFile.close()

    for link in Rss_soup.find_all('guid'):
        links_from_RSS_feed.append(
            str(link.getText().replace('?feedType=RSS&feedName=businessNews', '')))

    l_file = open(file, "w")
    for item in links_from_RSS_feed:
        l_file.write(str(item) + "\n")
    l_file.close()

    no_of_links = len(links_from_RSS_feed)
    for i in range(0, no_of_links):
        fileName = links_from_RSS_feed[i].rsplit('/', split1)[split2]
        extractedText = Extractor(extractor='ArticleExtractor', url=urlName + fileName)
        print(fileName)
        write_file = open("Data/" + str(i) + ".txt", "w")
        write_file.write(str(datetime.date.today()) + "\n")
        write_file.write(str(extractedText.getText().encode("utf-8")))
        write_file.close()
    return no_of_links
def extract_blog_posts(url_string, PAGES=48):
    blog_posts = []
    page_count = 0
    while (page_count <= PAGES):
        page_count += 1
        url = url_string.format(page_count)  # create url
        driver.get(url)

        try:
            article = driver.find_elements_by_tag_name('article')
            articles_size = len(article)
            print 'processing ', url
        except SocketError as e:
            if e.errno != errno.ECONNRESET:
                raise  # Not the error we are looking for
            continue

        for i in xrange(articles_size):
            headers = article[i].find_elements_by_tag_name("header")
            for header in headers:
                article_a = header.find_elements_by_xpath("//h1/a[@title]")
                print 'extracting ...'
                for e in article_a:
                    extractor = Extractor(extractor='ArticleExtractor', url=e.get_attribute('href'))
                    texts = extractor.getText()
                    blog_posts.append({'title': e.text,
                                       'content': clean_html(texts),
                                       'link': e.get_attribute('href')})
    return blog_posts
def scrap_link_boilerpipe(url):
    try:
        extractor = Extractor(extractor='ArticleSentencesExtractor', url=url)
        return extractor.getText()
    except:
        return False
def articles_from_feed():
    articles = []
    feed = feedparser.parse(rss_fakt)
    for item in feed["items"]:
        url = convert_url(item["link"])
        print item["published"]
        print url
        try:
            extractor = Extractor(extractor="ArticleExtractor", url=url)
            date = email.utils.parsedate_tz(item["published"])
            timestamp = email.utils.mktime_tz(date)
            iso = datetime.datetime.utcfromtimestamp(timestamp).isoformat()
            filename = url.split(",")[-1].split(".")[0]
            data = {
                "text": extractor.getText(),
                "date": iso,
                "url": url,
                "filename": filename
            }
        except Exception as e:
            print "Error downloading article from " + url
            # Skip this item so a failed download is not appended
            continue
        articles.append(data)
    return articles
def scrape(feed, used, excep, split1, split2, urlName, nameF):
    arrLinks = []
    req = requests.get('http://feeds.reuters.com/reuters/businessNews')
    soupRss = BeautifulSoup(req.text, "html5lib")

    # Check the list of already queried links
    logrFile = open(used, "r")
    usedLinks = [line.strip() for line in logrFile]
    logrFile.close()

    # Extract links from the initial feed, excluding non-news
    for link in soupRss.find_all('guid'):
        arrLinks.append(
            str(link.getText().replace('?feedType=RSS&feedName=businessNews', '')))

    # Store currently extracted links so as not to repeat them
    log_file = open(used, "w")
    for item in arrLinks:
        log_file.write(str(item) + "\n")
    log_file.close()

    # Extract stripped news content with timestamp, omitting used links
    for item in arrLinks:
        fileName = str(item.rsplit('/', split1)[split2])
        if any(fileName in s for s in usedLinks):
            print fileName + " has been extracted."
        else:
            extractedText = Extractor(extractor='ArticleExtractor', url=urlName + fileName)
            print fileName + ": New"
            write_file = open("extractedFiles/" + nameF + fileName + ".txt", "w")
            write_file.write(str(datetime.date.today()) + "\n")
            write_file.write(str(extractedText.getText().encode("utf-8")))
            write_file.close()
def extract(args):
    if not os.path.isfile("articles.json"):
        print "File articles.json does not exist"
        print "Have you already crawled?"
        exit()

    with open("articles.json") as article_list:
        articles = [
            json.loads(line) for line in article_list.read().splitlines()
        ]

    for article in articles:
        if args.html:
            with open(article['path'], "rb") as html:
                extractor = Extractor(extractor='ArticleExtractor', html=html.read())
        else:
            extractor = Extractor(extractor='ArticleExtractor', url=article['url'])

        dirname = os.path.join("articles", article['domain']) + "/text"
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        filename = sha1(article['url']).hexdigest() + '.txt'
        path = os.path.join(dirname, filename)
        with open(path, "wb+") as extracted_text:
            extracted_text.write(extractor.getText().encode("utf-8"))
def extract_article(url):
    r = requests.get(url)

    # if the url exists, continue
    if r.status_code == 200:

        # extract and parse response url
        url = parse_url(r.url)

        # extract html
        html = r.content.decode('utf-8', errors='ignore')

        # run boilerpipe
        BP = Extractor(html=html)

        # run readability
        Rdb = Document(html)
        html = Rdb.summary()

        # return article data
        return {
            'extracted_title': Rdb.short_title().strip(),
            'extracted_content': strip_tags(BP.getText()),
        }

    # otherwise return an empty dict
    else:
        return {}
def extract_body_with_boilerpipe(html):
    """
    Extractor types:
        DefaultExtractor
        ArticleExtractor
        ArticleSentencesExtractor
        KeepEverythingExtractor
        KeepEverythingWithMinKWordsExtractor
        LargestContentExtractor
        NumWordsRulesExtractor
        CanolaExtractor

    Reference: https://github.com/misja/python-boilerpipe
    Note: set JAVA_HOME if import fails

    Returns
    -------
    str: extracted body text. Returns an empty string if extraction fails.
    """
    try:
        extractor = Extractor(extractor='KeepEverythingExtractor', html=html)
        extracted_text = extractor.getText()
    except:
        print "Failed to extract text with boilerpipe"
        extracted_text = ""

    return extracted_text
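# A minimal, hypothetical usage sketch for the python-boilerpipe Extractor API
# listed in the docstring above (assumes python-boilerpipe is installed and
# JAVA_HOME is set; the sample HTML string is made up purely for illustration).
from boilerpipe.extract import Extractor

sample_html = "<html><body><p>Main article text.</p><div>nav | footer</div></body></html>"

# Any extractor name listed above can be passed to the same constructor.
for name in ('DefaultExtractor', 'ArticleExtractor', 'KeepEverythingExtractor'):
    extractor = Extractor(extractor=name, html=sample_html)
    print('{} -> {}'.format(name, extractor.getText()))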
def extract_and_save(url, path):
    try:
        handle = urllib2.urlopen(url)
        html_content = handle.read()
        extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
        text = extractor.getText()
        if text:
            if detect_english(text):
                links = get_all_urls(html_content, url)
                for link in links:
                    try:
                        # Fetch each discovered link (not the seed url again)
                        handle = urllib2.urlopen(link)
                        html_content = handle.read()
                        #extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
                        #text_content = extractor.getText()
                        #if text_content:
                        #    if detect_english(text_content):
                        encoded_url = encode(link)
                        f = open(path + "/" + encoded_url, "w")
                        f.write(html_content)
                        f.close()
                    except:
                        print link
                        traceback.print_exc()
                        return None
    except:
        print url
        traceback.print_exc()
        return None
def run(self):
    count = 0
    docCount = self.doc_cursor.count()
    for doc in self.doc_cursor:
        url = doc['url']
        if (self.keepText(url)):
            try:
                extractor = Extractor(extractor='ArticleExtractor', url=url)
                extracted_text = extractor.getText()
                if (len(extracted_text) > 0):
                    title = extractor.getTitle()
                    if title != None:
                        doc['title'] = title
                        doc['extracted_text'] = title + " " + extracted_text
                    else:
                        doc['extracted_text'] = extracted_text
                    self.db_collection.save(doc)
                    print 'OK -' + url
            except IOError, err:
                print "IOError with url " + url
                print str(err)
            except (LookupError):
                print "LookupError - Maybe not text or weird encoding " + url
            except (UnicodeDecodeError, UnicodeEncodeError):
                print "UnicodeDecodeError or UnicodeEncodeError- " + url
def extract_article_content(html, url):
    """
    Disclaimer
    ----------
    Copied from https://github.com/turi-code/how-to/blob/master/
    extract_article_content_from_HTML.py

    Description
    -----------
    Extract the primary textual content from an HTML news article.

    In many cases, the HTML source of news articles is littered with
    boilerplate text that you would not want to include when doing text
    analysis on the content of the page. Even if you could write some rules
    to extract the content from one page, it's unlikely that those rules
    would apply to an article from another site. The boilerpipe module
    allows us to solve this problem more generally.

    Parameters
    ----------
    html : str
        The source HTML from which to extract the content.
    url : str
        The url, needed for logging purposes only.

    Returns
    -------
    out : str
        The primary content of the page with all HTML and boilerplate text
        removed.

    Examples
    --------
    >>> extract_article_content(
            "<html><body><p>Turi is in the business of building the best "
            "machine learning platform on the planet. Our goal is to make "
            "it easy for data scientists to build intelligent, predictive "
            "applications quickly and at scale. Given the perplexing array "
            "of tools in this space, we often get asked 'Why Turi? What "
            "differentiates it from tools X, Y, and Z?' This blog post aims "
            "to provide some answers. I'll go into some technical details "
            "about the challenges of building a predictive application, and "
            "how Turi's ML platform can help.</p></body></html>")
    >>> Turi is in the business of building the best machine learning
        platform on the planet. Our goal is to make it easy for data
        scientists to build intelligent, predictive applications quickly and
        at scale. Given the perplexing array of tools in this space, we often
        get asked 'Why Turi? What differentiates it from tools X, Y, and Z?'
        This blog post aims to provide some answers. I'll go into some
        technical details about the challenges of building a predictive
        application, and how Turi's ML platform can help.

    See Also
    --------
    - `Boilerpipe project <https://code.google.com/p/boilerpipe/>`_
    - `Boilerpipe Python module <https://pypi.python.org/pypi/boilerpipe>`_
    """
    from boilerpipe.extract import Extractor
    if html and html.strip():
        try:
            extractor = Extractor(extractor='ArticleExtractor', html=html)
            return extractor.getText()
        except Exception as e:
            error = "Function extract_article_content: " + url + " - " + str(e)
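# A short, hypothetical usage sketch for extract_article_content above
# (assumes the requests library is available; the URL is only a placeholder).
import requests

response = requests.get("https://example.com/some-news-article")
content = extract_article_content(response.text, response.url)
if content:
    print(content[:200])
else:
    print("extraction failed or the page was empty")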
def process_text(self, text):
    if text == "":
        return text
    extractor = Extractor(extractor='ArticleExtractor', html=text)
    new_val = extractor.getText()
    return new_val
def scrape(feed, used, excep, split1, split2, urlName, nameF): arrLinks = [] req = requests.get('http://feeds.reuters.com/reuters/businessNews') soupRss = BeautifulSoup(req.text, "html5lib") logrFile = open(used,"r") usedLinks = [line.strip() for line in logrFile] logrFile.close() for link in soupRss.find_all('guid'): arrLinks.append(str(link.getText().replace('?feedType=RSS&feedName=businessNews', ''))) log_file = open(used,"w") for item in arrLinks: log_file.write(str(item)+"\n") log_file.close() for i in range(0, 8): fileName = arrLinks[i].rsplit('/', split1)[split2] #if any(fileName in s for s in usedLinks): # print fileName +" has been extracted." #else: extractedText = Extractor(extractor='ArticleExtractor', url=urlName+fileName) print fileName write_file = open("Data/"+str(i)+".txt","w") write_file.write(str(datetime.date.today()) + "\n") write_file.write(str(extractedText.getText().encode("utf-8"))) write_file.close()
def get_text_boilerpipe(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        return extractor.getText()
    except:
        print "Exception"
        return None
def parse_page(self, response):
    if response.meta.has_key('crawldepth'):
        depth = response.meta['crawldepth']
    else:
        # Set search depth here
        depth = 1
    log.msg('Depth = %s' % str(depth), level=log.INFO)

    if not isinstance(response, HtmlResponse):
        log.msg('Not an HTML file: %s' % response.url, level=log.WARNING)
        return

    log.msg('Response from: %s' % response.url, level=log.INFO)
    url_bf.add(response.url)

    # TODO: Extract page title
    extractor = Extractor(extractor='ArticleExtractor', html=response.body_as_unicode())
    cleaned_text = extractor.getText()

    # Eliminate duplicates
    keywordset = set(keywordlist)

    found_list = []
    for keyword in keywordset:
        # TODO: Is there a more efficient way to do this?
        # Look at word boundaries to match entire words only
        if (re.search(r'\b' + re.escape(keyword) + r'\b', cleaned_text)):
            found_list.append(keyword)

    # Parse this page
    item = BiffleItem()
    if (len(found_list) > 0):
        item['url'] = response.url
        item['body'] = cleaned_text
        item['keywords'] = ', '.join(found_list)
        item['process_date'] = datetime.today()
        log.msg("Keyword(s) found: %s" % ', '.join(found_list), level=log.INFO)
        self.map_keyword_count(found_list)
        yield item

    if (depth > 0):
        # Find the next requests and yield those
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//a/@href').extract()
        log.msg('Links on page: %s' % len(links), level=log.INFO)
        depth -= 1
        log.msg('Depth has been decremented, new value = %s' % str(depth), level=log.INFO)
        for l in links:
            l = urlparse.urljoin(response.url, l)
            if (l in url_bf):
                pass
                #log.msg('Duplicate URL found: %s' % l, level=log.INFO)
            else:
                url_bf.add(l)
                #log.msg('Found link: %s | From URL: %s' % (l, response.url), level=log.INFO)
                # Decrement depth for next layer of links
                #callback = lambda response, depth = depth: self.parse_page(response, depth)
                callback = lambda response: self.parse_page(response)
                request = Request(l, callback=callback)
                request.meta['crawldepth'] = depth
                yield request
def extract_article_text(url):
    if url in utils.BROKEN_URLS or any([True for sd in BAD_SUBDOMAINS if sd in url]):
        return ""

    while True:
        try:
            extractor = Extractor(extractor='ArticleExtractor', url=url)
            break
        except socket.timeout:
            print("got socket.timeout on url: {}. retrying...".format(url), file=utils.stddbg)
        except URLError as e:
            if e.reason == "timed out":
                print("got urllib 'timed out' on url {}. retrying...".format(url), file=utils.stddbg)
            elif hasattr(e.reason, "strerror") and e.reason.strerror == 'getaddrinfo failed':
                print("got urllib 'getaddrinfo failed' on url {}. retrying...".format(url), file=utils.stddbg)
            elif e.code == 503:
                print("got urllib 503 error on url {}. retrying...".format(url), file=utils.stddbg)
            else:
                if not hasattr(e, "url"):
                    e.url = url
                raise
        except Exception as e:
            e.url = url
            raise e

    text = str(unicodedata.normalize('NFKD', (str(extractor.getText()))).encode('ascii', 'ignore'))
    return filter_junk(text)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("raw_dir_path")
    parser.add_argument("out_file_path")
    args = parser.parse_args()

    f_names = [(int(f), f) for f in listdir(args.raw_dir_path)]
    f_names = sorted(f_names)
    fout = open(args.out_file_path, 'w')

    for int_f_name, f_name in f_names:
        trec_reader = TrecReader(join(args.raw_dir_path, f_name))
        empty_cnt = 0
        err_cnt = 0

        for docno, html_text in trec_reader:
            if not html_text:
                empty_cnt += 1
            try:
                extractor = Extractor(extractor='ArticleExtractor', html=html_text)
                text = extractor.getText()
                text = text.replace('\n', ' ').replace('\t', ' ')
                text = text.encode('ascii', 'ignore')
                text = text_clean(text)
                if text:
                    fout.write(docno + '\t' + text + '\n')
                else:
                    empty_cnt += 1
            except Exception as e:
                err_cnt += 1

    fout.close()
    print empty_cnt, err_cnt
def extractor(URL):
    extractor = Extractor(extractor='ArticleExtractor', url=URL)
    data = extractor.getText()

    file = open("data.txt", "w")
    file.write(data.encode('UTF-8'))
    file.close()

    # Split the content into sentences
    with open('data.txt', 'r') as f:
        s = f.read()
    sentences = s.split('.')

    # Empty word list
    w = []

    # Split the sentences into words
    for sentence in sentences:
        w.extend(sentence.split(' '))
    print w

    # Return the word list
    return w
def post_index(post):
    extractor = Extractor(extractor='ArticleExtractor', url=post['href'])
    post_text = extractor.getText().replace('\n', ' ')

    url = 'http://localhost:9200/bookmarks/bookmark/%s/_create' % post['hash']
    data = '{"title":"%s", "url":"%s", "text":"%s"}' % (
        post['description'], post['href'], post_text.replace('"', '\\"'))
    r = requests.put(url, data=data)
    print r.status_code
def Text_extractor(y, page, team, team_i, counter=0):
    """Extract the text of team pages using BoilerPipe."""
    try:
        upage = urllib.parse.quote_plus(page)
        url = "http://" + y + ".igem.org/wiki/index.php?title=" + upage
        extractor = Extractor(extractor='ArticleExtractor', url=url)
    except:
        counter += 1
        if counter > 10:
            print("Failed to get the text for page {}".format(page))
            return None
        # Retry, and return the retry's result so we never fall through
        # with an undefined extractor.
        return Text_extractor(y, page, team, team_i, counter=counter)

    f = open(
        'results/%s/%s/%s_-_-_CONTENT.html' % (y, team, page.replace('/', '#')), 'w')
    f.write(extractor.getHTML())
    f.close()

    f = open(
        'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#')), 'w')
    f.write(extractor.getText())
    f.close()

    path = 'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#'))
    # text = text.replace('\\n', '\\\\n')
    output = '%s\t%s\t%s\t%s\n' % (y, str(teams_id[team_i]), page, path)
    teams_pages_text_db.write(output)
def extract_metadata(url):
    extractor = Extractor(extractor='KeepEverythingExtractor', url=url)
    text = extractor.getText().split("\n")

    author = None
    date = None
    keywords = []
    find_keywords = False

    for line in text:
        # author
        match = re.match("Von\s(\w+\s\w+)(,\s[\s\w]*$|$)", line)
        if match:
            author = match.group(1)
            continue
        # date
        match = re.match("([0-9]{2}\.[0-9]{2}\.[0-9]{4})$", line)
        if match:
            date = match.group(1)
            continue
        # keywords
        if find_keywords:
            match = re.match("Hat\sIhnen\sdieser\sArtikel\sgefallen.*", line)
            if match:
                find_keywords = False
                continue
            else:
                keywords.append(line)
        match = re.match("Schlagwörter zu diesem Artikel:", line)
        if match:
            find_keywords = True

    return author, date, keywords
def get_articles(url):
    doc = urllib.request.urlopen(url)
    docContent = BeautifulSoup(doc, 'html.parser')
    articles = []
    for element in docContent.find_all('div'):
        try:
            if element.attrs['style'] == 'width:550px':
                article = defaultdict(str)
                article_link = 'http://www.moneycontrol.com' + element.a['href']
                for p in element.find_all('p'):
                    if 'a_10dgry' in p.attrs['class']:
                        article_time = p.contents[0].split('|')[0]
                        article_date = p.contents[0].split('|')[1][:-1]
                        article['link'] = article_link
                        article['time'] = article_time
                        article['date'] = article_date
                        extractor = Extractor(extractor='ArticleExtractor', url=article_link)
                        article['content'] = extractor.getText()
                        article['title'] = BeautifulSoup(extractor.getHTML(), 'html.parser').find_all('h1')[0].contents[0]
                        articles.append(article)
                        break
        except:
            logging.debug('div has no width attribute')
    return articles
def detag_html_file(infile, outfile, id):
    from boilerpipe.extract import Extractor

    if not USE_BOILERPLATE:
        return detag_html_file_bs(infile, outfile, id)

    tempfile = "%s.tmp.html" % (infile,)  # boilerpipe seems to need an html extension
    try:
        copyfile(infile, tempfile)
        extractor = Extractor(extractor='ArticleExtractor', url="file://" + tempfile)
        os.unlink(tempfile)

        extracted_text = extractor.getText()
        extracted_html = extractor.getHTML()

        soup = BeautifulSoup(extracted_html)
        output = codecs.open(outfile, encoding='utf-8', mode='w')
        output.write(u"<DOC>\n<DOCNO>" + unicode(id) + u"</DOCNO>\n<DOCHDR>\n</DOCHDR>\n")
        head = soup.find('head')
        if head:
            title_tag = head.find('title')
            if title_tag and title_tag.string:
                output.write(u"<TITLE>" + title_tag.string.replace('\n', ' ') + u"</TITLE>\n")
        extract_para(soup, output)
        output.write(u"</DOC>\n")
        output.close()
    except Exception, exc:
        try:
            os.unlink(tempfile)
        except:
            pass
        return detag_html_file_bs(infile, outfile, id)
def ParserBoilerEverything(html_object):
    extractor = Extractor(extractor='DefaultExtractor', html=html_object)
    sents = extractor.getText()
    try:
        return sents
    except Exception as e:
        return
def parse_item(self, response):
    response_news = NewsItem()
    response_news['url'] = response.url
    response_news['html'] = Binary(zlib.compress(response.body, 9))
    extractor = Extractor(extractor='ArticleExtractor', html=response.body)
    response_news['content'] = extractor.getText()
    return response_news
def Process(DocIn, OutName):
    out = open(OutName, 'w')
    logging.info('reading [%s]', DocIn)
    ErrCnt = 0
    EmptyCnt = 0
    for cnt, line in enumerate(open(DocIn)):
        vCol = line.strip().split('\t')
        DocNo = vCol[0]
        RawHtml = ' '.join(vCol[1:])
        RawHtml = DiscardHTMLHeader(RawHtml)
        if "" == RawHtml:
            EmptyCnt += 1
            continue
        try:
            extractor = Extractor(extractor='ArticleExtractor', html=RawHtml)
            text = extractor.getText()
            text = text.replace('\n', ' ').replace('\t', ' ')
            text = text.encode('ascii', 'ignore')
            text = TextClean(text)
            if "" != text:
                print >>out, DocNo + '\t' + text
            else:
                EmptyCnt += 1
            # print DocNo + '\t' + text.encode('ascii','ignore')
        except Exception as e:
            ErrCnt += 1
        if 0 == (cnt % 100):
            logging.info('parsed [%d] doc [%d] Err [%d] Empty', cnt, ErrCnt, EmptyCnt)
    out.close()
    logging.info('finished [%d] doc [%d] Err', cnt, ErrCnt)
def extract_main_text(self):
    if self.res is None:
        return None

    extractor = Extractor(
        # extractor='ArticleExtractor',
        url=self.url)
    return [extractor.getText()]
def parse(self, response):
    hxs = Selector(response)
    item = ArticleItem()
    item["title"] = hxs.xpath('//title/text()').extract()
    item["link"] = response.url
    item["source"] = hxs.xpath('//p').extract()

    extractor = Extractor(extractor='ArticleExtractor', url=item["link"])
    source = extractor.getHTML()
    item["text"] = extractor.getText()
    item["html"] = source

    page = html.fromstring(source)
    links = page.xpath("//p//a/@href")
    linkPattern = re.compile("^(?:ftp|http|https):\/\/(?:[\w\.\-\+]+:{0,1}[\w\.\-\+]*@)?(?:[a-z0-9\-\.]+)(?::[0-9]+)?(?:\/|\/(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+)|\?(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+))?$")

    for link in links:
        if linkPattern.match(link) and not link in self.crawled_links:
            self.crawled_links.append(link)
            yield Request(link, self.parse)

    yield item
def GOOGLE_get_data(company):
    google_news_rss_url = "https://news.google.com/news/?q=%s&output=rss" % company
    rss_feed = feedparser.parse(google_news_rss_url)
    content_list = list()

    for entry in rss_feed['entries']:
        title = entry['title']
        link = entry['link']
        try:
            news_page = urllib2.urlopen(link).read()
            extractor = Extractor(extractor='ArticleExtractor', html=news_page)
        except:
            continue
        content = extractor.getText()

        now = datetime.datetime.now()
        content_list.append({"title": title,
                             "article": content,
                             "link": link,
                             "source": "GOOGLE",
                             "target": company,
                             "date": "%04d%02d%02d" % (now.year, now.month, now.day),
                             "hash": hashlib.sha224(title.encode("UTF-8")).hexdigest()})

    DBOperation.save_db(content_list)
def dehydrate(self, bundle):
    """GET Method"""
    #print bundle.data['content']
    if bundle.data['content']:
        extractor = Extractor(extractor='ArticleExtractor', html=bundle.data['content'])
        bundle.data['content'] = extractor.getText()

    try:
        article_stats = ArticleStat.objects.filter(article_id=bundle.obj.id)
        bundle.data['stat'] = {
            'reads': sum(map(lambda x: x.reads, article_stats)),
            'likes': sum(map(lambda x: x.likes, article_stats)),
            'dislikes': sum(map(lambda x: x.dislikes, article_stats)),
            'shares': sum(map(lambda x: x.shares, article_stats)),
        }
    except ObjectDoesNotExist:
        bundle.data['stat'] = {
            'reads': 0,
            'likes': 0,
            'dislikes': 0,
            'shares': 0,
        }

    # If there are no cookies or no sessionid field in the cookies, just send
    # the normal newsfeed to the anonymous user
    #if not bundle.request.COOKIES or not bundle.request.COOKIES['sessionid']:
    if not bundle.request.COOKIES or not 'sessionid' in bundle.request.COOKIES:
        return bundle

    try:
        # Even if there is a cookie, the sessionid field might not exist,
        # in which case this is also an anonymous user
        s = get_current_session(bundle.request.COOKIES['sessionid'])
        if s is None or 'user_id' not in s:
            return bundle

        # Get activity information: whether the user has already
        # read/liked/shared
        activity = Activities.objects.get(user_id=s['user_id'],
                                          article_id=bundle.obj.id)
        # Assign the information
        bundle.data['activity'] = {
            'read': activity.like or activity.share,
            'like': activity.like,
            'dislike': activity.dislike,
            'share': activity.share
        }
    except ObjectDoesNotExist:
        # Assign False if the news has never been opened
        bundle.data['activity'] = {
            'read': False,
            'like': False,
            'dislike': False,
            'share': False
        }

    return bundle
def get_text_boilerpipe(url):
    try:
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        extracted_text = extractor.getText()
    except BaseException as error:
        extracted_text = 'error: {}'.format(error)
        print('error: {}'.format(error))
    return extracted_text
def sentences_from_urls(url: str, extractor_name=EXTRACTORS[0], model=MODELS[0],
                        min_words=0, with_proba=False, return_raw=False):
    extractor = Extractor(extractor=extractor_name)
    model = models[model]
    extracted_text = extractor.getTextBlocks(url=url)
    if len(extracted_text) > 0:
        func = model.predict_proba if with_proba else model.predict
        return func(extracted_text, min_words=min_words, return_raw=return_raw)
def html2text_bp(html):
    text = None
    try:
        extractor = Extractor(extractor=extractor_type, html=html)
        text = extractor.getText()
    except:
        traceback.print_exc()
    return text
def extract_text(html_content):
    try:
        extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
        #print extractor.getText()
        return extractor.getText()
    except:
        print "Exception in html extraction"
        return None
def extract_html(html_text, parser):
    try:
        extractor = Extractor(extractor=parser, html=html_text)
    except Exception as e:
        return None, None
    title = extractor.source.getTitle()
    body_text = extractor.getText()
    return title, body_text
def saveHtml(url, page):
    extractor = Extractor(extractor='ArticleExtractor', html=page)
    processed_plaintext = extractor.getText()
    # print processed_plaintext
    fileName = "./doc/" + (url + ".txt").replace("/", "()")
    f = open(fileName, "w")
    f.write(processed_plaintext)
    f.close()
def ParserBoilerDefault(html_object):
    extractor = Extractor(extractor='DefaultExtractor', html=html_object)
    sents = extractor.getText()
    try:
        sents = list(nlp(sents).sents)
        return sents
    except Exception as e:
        return
def boilerpipe_text(cls, url_in=None, html_in=None, extractor='ArticleExtractor'):
    assert (url_in != None) != (html_in != None)  # one, not both
    inp = url_in or html_in
    if url_in:
        extractor = Extractor(extractor=extractor, url=inp)
    else:
        extractor = Extractor(extractor=extractor, html=inp)
    return HtmlTextCleaner().spec_text_cleaner(extractor.getText())
def getBoilerPlate(url):
    #url = 'http://cnn.com/2016/07/17/health/south-africa-meerkat-telescope-galaxies/index.html'
    try:
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        extracted_text = extractor.getText().replace('\n', '')
        return '', extracted_text
    except Exception, e:
        return '', ''
def ParserBoilerArticle(html_object):
    extractor = Extractor(extractor='ArticleSentencesExtractor', html=html_object)
    sents = extractor.getText()
    try:
        sents = list(nlp(sents).sents)
        return sents
    except Exception as e:
        return
def load_from_html(filename, use_boilerpipe=True, use_nltk=True, use_regex=True, binary=False):
    if binary:
        charset = UnicodeDammit(open(filename, 'rb').read())
        charset = charset.original_encoding
        try:
            content = open(filename, 'r', encoding=charset).read()
        except Exception as e:
            # if there is an error, return empty results
            logging.warn('encode error: {}, {}'.format(filename, e))
            return {'title': [], 'body': []}
    else:
        content = open(filename, 'r', encoding='utf-8').read()

    start = time.time()
    if not use_regex or not use_boilerpipe:
        bs = BeautifulSoup(content, 'html.parser')
    if use_regex:
        match = re.search(r'<title.*?>(.+?)</title>', content[:5000], re.DOTALL | re.IGNORECASE)
        title = match.group(1) if match else ''
        title = html.unescape(title).strip()
    else:
        if bs.title != None and bs.title.string != None:
            title = bs.title.string.strip()
        else:
            title = ''
    t1 = time.time() - start

    start = time.time()
    if use_boilerpipe:
        extractor = Extractor(extractor='ArticleExtractor', html=content)  # time consuming
        body = extractor.getText()
    else:
        body = bs.select('body')
        if len(body) <= 0:
            body = bs
        else:
            body = body[0]
        # remove all useless tags
        [x.extract() for x in body.findAll('script')]
        [x.extract() for x in body.findAll('style')]
        [x.extract() for x in body.findAll('meta')]
        [x.extract() for x in body.findAll('link')]
        body = body.text
    t2 = time.time() - start

    start = time.time()
    result = {
        'title': my_word_tokenize(title) if use_nltk else clean_text(title).split(' '),
        'body': my_word_tokenize(body) if use_nltk else clean_text(body).split(' '),
    }
    t3 = time.time() - start
    #print('{}\t{}\t{}'.format(t1, t2, t3))
    return result
def parse_item(self, response):
    title = response.css('title::text').extract_first()
    extractor = Extractor(extractor='ArticleExtractor', html=response.body)
    yield Article(title=title, text=extractor.getText(), url=response.url, field=self.name)
def extract_article(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        text_string = extractor.getText()
        text_string = htmlParser.unescape(text_string)
    except Exception:
        logger.error('Error extracting article html')
        text_string = ''
    return text_string
def get_news_by_url(url): print "Come to get_news_by_url" article = {} try: soup = BeautifulSoup(urllib2.urlopen(url)) "Get the title of News" title = "" titleElements = soup.findAll(id="disqus_title") for ele in titleElements: title = ele.getText().encode('utf-8') article["title"] = title print title "Get the posttime of News,Timezone ET" postTime = "" postTimeElements = soup.findAll(attrs={'class': "datestamp"}) for ele in postTimeElements: timeStamp = float(ele["epoch"]) postTime = datetime.fromtimestamp(timeStamp / 1000) article["post_time"] = postTime "Initiate the post date" postDay = postTime.date() article["post_date"] = postDay "Get the author information " author = "" authorElements = soup.findAll(attrs={'class': "byline"}) for ele in authorElements: author = ele.contents[0].strip().replace("By", "").replace( "-", "").replace("and", ",").strip() article["author"] = author "Get the content of article" extractor = Extractor(extractor='ArticleExtractor', url=url) content = extractor.getText().encode("utf-8") article["content"] = content "Initiate the Sources" source = "Bloomberg News" article["source"] = source "Initiate the update_time" updateTime = datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S") article["update_time"] = updateTime "Initiate the embers_id" embersId = hashlib.sha1(content).hexdigest() article["embers_id"] = embersId "settup URL" article["url"] = url except: print "Error: %s" % sys.exc_info()[0] article = {} finally: return article
def get_news_by_url(url): print "Come to get_news_by_url" article = {} try: soup = BeautifulSoup(urllib2.urlopen(url)) "Get the title of News" title = "" titleElements = soup.findAll(id="disqus_title") for ele in titleElements: title = ele.getText().encode('utf-8') article["title"] = title print title "Get the posttime of News,Timezone ET" postTime = "" postTimeElements = soup.findAll(attrs={'class':"datestamp"}) for ele in postTimeElements: timeStamp = float(ele["epoch"]) postTime = datetime.fromtimestamp(timeStamp/1000) article["post_time"] = postTime "Initiate the post date" postDay = postTime.date() article["post_date"] = postDay; "Get the author information " author = "" authorElements = soup.findAll(attrs={'class':"byline"}) for ele in authorElements: author = ele.contents[0].strip().replace("By","").replace("-","").replace("and", ",").strip(); article["author"] = author "Get the content of article" extractor=Extractor(extractor='ArticleExtractor',url=url) content = extractor.getText().encode("utf-8") article["content"] = content "Initiate the Sources" source = "Bloomberg News" article["source"] = source "Initiate the update_time" updateTime = datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S") article["update_time"] = updateTime "Initiate the embers_id" embersId = hashlib.sha1(content).hexdigest() article["embers_id"] = embersId "settup URL" article["url"] = url except: print "Error: %s" %sys.exc_info()[0] article = {} finally: return article
def html_to_text(html):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html)
    except Exception as e:
        logger.exception('\nError extracting text from html. Exception: %s, %s',
                         e.__class__.__name__, e)
        return ''

    text = extractor.getText()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    return text
def extract_article(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        text_string = extractor.getText()
        text_string = htmlParser.unescape(text_string)
        text_string = unicodedata.normalize('NFKD', text_string).encode('ascii', 'ignore')
    except Exception:
        print 'Error extracting article html'
        text_string = ''
    return text_string
def test_boilerpipe():
    your_url = "http://stackoverflow.com/questions/9352259/trouble-importing-boilerpipe-in-python"
    extractor = Extractor(extractor='ArticleExtractor', url=your_url)
    extracted_html = extractor.getHTML()
    extracted_text = extractor.getText()

    print '\nfunction: %s ' % inspect.stack()[0][3]
    print 'extracted html: %i text: %i' % (len(extracted_html), len(extracted_text))
    print ''

    n.assert_greater(len(extracted_text), min_str_length)
def extract(self, article):
    try:
        extractor = Extractor(extractor='ArticleSentencesExtractor', url=article.url)
    except Exception as e:
        return ''

    article_text = ''
    try:
        article_text = extractor.getText()
    except Exception:
        pass

    return article_text.encode('utf-8')
def main():
    contents = sys.argv[1]
    for url in listdir(contents):
        print url
        with codecs.open(url, "w", encoding="utf-8") as out:
            try:
                html = urlopen(url.replace("{", "/")).read()
                extracted = Extractor(html=html)
                out.write(extracted.getText())
            except HTTPError:
                out.write("")
def fetch_articles(self):
    greq_gen = (grequests.get(u, headers=self.header) for u in self.urls)
    responses = grequests.map(greq_gen)
    for i, res in enumerate(responses):
        if res is not None:
            extractor = Extractor(html=res.text)
            self.entries[i]['text'] = extractor.getText()
            if '...' in self.entries[i]['title']:
                self.entries[i]['title'] = extractor.getTitle()
    return True
def boiler():
    from boilerpipe.extract import Extractor
    for i in range(0, 1000):
        input_filename = 'page/' + str(i) + '.txt'
        output_filename = 'boilerpipe/' + str(i) + '.txt'

        input_file = open(input_filename, 'r')
        s = input_file.read()
        input_file.close()

        extractor = Extractor(extractor='ArticleExtractor', html=s.decode('GBK', 'ignore'))

        output_file = open(output_filename, 'wb')
        output_file.write(extractor.getText().encode('utf-8'))
        output_file.close()
def fetch_info(self):
    """
    Boilerpipe's main-text extraction rarely pulls in extraneous content,
    but it quite often returns less than the full original text.
    """
    urls = self.get_urls()
    got_infos = []
    for url in urls:
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        text = extractor.getText()
        content = requests.get(url).content
        got_infos.append([url, text, content])
    return got_infos
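# Hypothetical follow-up sketch for the caveat noted in the docstring above:
# if the extracted text looks too short, fall back to a greedier extractor.
# The helper name and min_chars threshold are illustrative assumptions, not
# part of the original project.
from boilerpipe.extract import Extractor

def text_with_fallback(url, min_chars=500):
    extractor = Extractor(extractor='ArticleExtractor', url=url)
    text = extractor.getText()
    if len(text) >= min_chars:
        return text
    # ArticleExtractor under-extracted; retry with KeepEverythingExtractor.
    fallback = Extractor(extractor='KeepEverythingExtractor', url=url)
    return fallback.getText()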