def traverse(outlet_url, num=5000):
    retrieved = 0
    page = 1
    while retrieved < num:
        current_url = "{}?page={}".format(outlet_url, page)
        extractor = Extractor(extractor='KeepEverythingExtractor', url=current_url)
        html = extractor.getHTML()
        links = set(re.findall(r"<A\shref=\"/artikel/(\d{4}/\d{2}/.*?)\"", html))
        for link in links:
            try:
                art_url = 'https://jungle.world/artikel/' + link
                extr = Extractor(extractor='ArticleExtractor', url=art_url)
                text = extr.getText()
                retrieved += 1
                write_to_files(text, art_url, retrieved)
                print("Extracted {}: {}".format(retrieved, art_url))
            except Exception as e:
                print(e)
        page += 1
def traverse(outlet_url, num=3000):
    retrieved = 0
    page = 1
    while retrieved < num:
        if page < 2:
            page_url = outlet_url
        else:
            page_url = "{}page/{}".format(outlet_url, page)
        extractor = Extractor(extractor='KeepEverythingExtractor', url=page_url)
        html = extractor.getHTML()
        links = re.findall(r"<A\shref=\".*de\/(\d{4}\/\d{2}\/\d{2}\/.*)\" rel", html)
        for link in links:
            try:
                art_url = outlet_url + link
                extr = Extractor(extractor='ArticleExtractor', url=art_url)
                text = extr.getText()
                retrieved += 1
                write_to_files(text, art_url, retrieved)
                print("Extracted {}: {}".format(retrieved, art_url))
            except Exception as e:
                print(e)
        page += 1
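# A hedged usage sketch for the two traverse() crawlers above. Both assume
# module-level imports of `re` and boilerpipe's `Extractor`, plus a
# project-specific write_to_files(text, url, n) helper; the outlet URLs below
# are only illustrative:
#
#   traverse('https://jungle.world/rubrik/inland')      # variant 1: ?page=N pagination
#   traverse('https://example-outlet.de/', num=100)     # variant 2: /page/N pagination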
def extract(args):
    if not os.path.isfile("articles.json"):
        print("File articles.json does not exist")
        print("Have you already crawled?")
        exit()
    with open("articles.json") as article_list:
        articles = [json.loads(line) for line in article_list.read().splitlines()]
    for article in articles:
        if args.html:
            with open(article['path'], "rb") as html:
                extractor = Extractor(extractor='ArticleExtractor', html=html.read())
        else:
            extractor = Extractor(extractor='ArticleExtractor', url=article['url'])
        dirname = os.path.join("articles", article['domain'], "text")
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        # sha1 needs bytes under Python 3, hence the encode.
        filename = sha1(article['url'].encode('utf-8')).hexdigest() + '.txt'
        path = os.path.join(dirname, filename)
        with open(path, "wb") as extracted_text:
            extracted_text.write(extractor.getText().encode("utf-8"))
def boilerpipe_text(cls, url_in=None, html_in=None, extractor='ArticleExtractor'):
    # Exactly one of url_in / html_in may be given, not both.
    assert (url_in is not None) != (html_in is not None)
    if url_in:
        extractor = Extractor(extractor=extractor, url=url_in)
    else:
        extractor = Extractor(extractor=extractor, html=html_in)
    return HtmlTextCleaner().spec_text_cleaner(extractor.getText())
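# Minimal usage sketch for boilerpipe_text(): the assert enforces that exactly
# one of url_in / html_in is supplied. `SomeCleaner` is a hypothetical owner
# class; HtmlTextCleaner comes from the snippet's own project:
#
#   SomeCleaner.boilerpipe_text(url_in='https://example.com/article')
#   SomeCleaner.boilerpipe_text(html_in='<html><body><p>Text.</p></body></html>',
#                               extractor='KeepEverythingExtractor')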
def extractHTML(self, url=None, html=None, extractor='ArticleExtractor'):
    cherrypy.response.headers['Content-Type'] = "text/json"
    if url:
        extractor = Extractor(extractor=extractor, url=url)
        extracted_html = extractor.getHTML()
        return json.dumps({'url': url, 'extractedHTML': extracted_html})
    elif html:
        # Bug fix: the original passed url=url here even though url is None
        # in this branch; the raw HTML must be handed to the extractor.
        extractor = Extractor(extractor=extractor, html=html)
        extracted_html = extractor.getHTML()
        return json.dumps({'html': html[:15], 'extractedHTML': extracted_html})
def extract_cleaned_text_from_url(url: str, proxy: dict = None) -> Union[str, None]:
    """Extract the main text from a URL, using boilerpipe if it is HTML and
    Tika if it is a PDF.

    Args:
        url: A URL to extract text from.
        proxy: Proxy settings for requests.

    Returns:
        The extracted text, or None if extraction failed.
    """
    if 'pdf' in url:
        tika.TikaClientOnly = True
        extracted = _extract_cleaned_text_from_pdf(url)
    else:
        kwargs = {'url': url}
        if 'promed' in url:
            html = get_html_from_promed_url(url, proxy)
            kwargs = {'html': html}
        try:
            extracted = Extractor(extractor='ArticleExtractor', **kwargs).getText()
        except Exception as e:
            print(f'{url} caused {e}')
            extracted = None
    # Guard against None so a failed extraction does not crash the cleanup step.
    return _remove_control_characters(extracted) if extracted else None
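# Illustrative call, assuming `proxy` follows the requests-style mapping that
# get_html_from_promed_url() presumably expects (both the URL and the proxy
# address are placeholders):
#
#   text = extract_cleaned_text_from_url(
#       'https://promedmail.org/promed-post/',
#       proxy={'http': 'http://proxy:8080', 'https': 'http://proxy:8080'})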
def extract_and_save(url, path):
    try:
        # urllib2.urlopen in the original Python 2 code.
        html_content = urllib.request.urlopen(url).read().decode('utf-8', 'ignore')
        extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
        text = extractor.getText()
        if text and detect_english(text):
            links = get_all_urls(html_content, url)
            for link in links:
                try:
                    # Bug fix: the original re-fetched `url` here instead of `link`.
                    link_html = urllib.request.urlopen(link).read().decode('utf-8', 'ignore')
                    encoded_url = encode(link)
                    with open(path + "/" + encoded_url, "w", encoding="utf-8") as f:
                        f.write(link_html)
                except Exception:
                    print(link)
                    traceback.print_exc()
    except Exception:
        print(url)
        traceback.print_exc()
        return None
def articles_from_feed():
    articles = []
    feed = feedparser.parse(rss_fakt)
    for item in feed["items"]:
        url = convert_url(item["link"])
        print(item["published"])
        print(url)
        try:
            extractor = Extractor(extractor="ArticleExtractor", url=url)
            date = email.utils.parsedate_tz(item["published"])
            timestamp = email.utils.mktime_tz(date)
            iso = datetime.datetime.utcfromtimestamp(timestamp).isoformat()
            filename = url.split(",")[-1].split(".")[0]
            data = {
                "text": extractor.getText(),
                "date": iso,
                "url": url,
                "filename": filename
            }
            # Bug fix: append inside the try; the original appended after the
            # except block, re-adding the previous article whenever one failed.
            articles.append(data)
        except Exception:
            print("Error downloading article from " + url)
    return articles
def ParserBoilerEverything(html_object):
    # Bug fix: run the extraction inside the try; the original called getText()
    # before the try block, so extraction failures were never caught.
    try:
        extractor = Extractor(extractor='DefaultExtractor', html=html_object)
        return extractor.getText()
    except Exception:
        return None
def extract_metadata(url):
    extractor = Extractor(extractor='KeepEverythingExtractor', url=url)
    text = extractor.getText().split("\n")
    author = None
    date = None
    keywords = []
    find_keywords = False
    for line in text:
        # Author: e.g. "Von Vorname Nachname" or "Von Vorname Nachname, ..."
        match = re.match(r"Von\s(\w+\s\w+)(,\s[\s\w]*$|$)", line)
        if match:
            author = match.group(1)
            continue
        # Date: e.g. "01.02.2018"
        match = re.match(r"([0-9]{2}\.[0-9]{2}\.[0-9]{4})$", line)
        if match:
            date = match.group(1)
            continue
        # Keywords: collect the lines between the "Schlagwörter" marker and
        # the "Hat Ihnen dieser Artikel gefallen" footer.
        if find_keywords:
            match = re.match(r"Hat\sIhnen\sdieser\sArtikel\sgefallen.*", line)
            if match:
                find_keywords = False
                continue
            else:
                keywords.append(line)
        match = re.match("Schlagwörter zu diesem Artikel:", line)
        if match:
            find_keywords = True
    return author, date, keywords
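# extract_metadata() is tailored to one German outlet's page layout: it scans
# the KeepEverythingExtractor output for a "Von Vorname Nachname" byline, a
# DD.MM.YYYY date, and the keyword block after "Schlagwörter zu diesem
# Artikel:". Illustrative call (placeholder URL):
#
#   author, date, keywords = extract_metadata('https://example.de/artikel/123')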
def extract_article_text(url):
    if url in utils.BROKEN_URLS or any(sd in url for sd in BAD_SUBDOMAINS):
        return ""
    while True:
        try:
            extractor = Extractor(extractor='ArticleExtractor', url=url)
            break
        except socket.timeout:
            print("got socket.timeout on url: {}. retrying...".format(url), file=utils.stddbg)
        except URLError as e:
            if e.reason == "timed out":
                print("got urllib 'timed out' on url {}. retrying...".format(url), file=utils.stddbg)
            elif hasattr(e.reason, "strerror") and e.reason.strerror == 'getaddrinfo failed':
                print("got urllib 'getaddrinfo failed' on url {}. retrying...".format(url), file=utils.stddbg)
            elif getattr(e, "code", None) == 503:
                # Only HTTPError carries .code; getattr avoids an AttributeError
                # on a plain URLError.
                print("got urllib 503 error on url {}. retrying...".format(url), file=utils.stddbg)
            else:
                if not hasattr(e, "url"):
                    e.url = url
                raise
        except Exception as e:
            e.url = url
            raise e
    # Normalize to ASCII and decode back to str; the original str()-ed the
    # bytes, yielding literal "b'...'" strings.
    text = unicodedata.normalize('NFKD', str(extractor.getText())).encode('ascii', 'ignore').decode('ascii')
    return filter_junk(text)
def post_index(post):
    extractor = Extractor(extractor='ArticleExtractor', url=post['href'])
    post_text = extractor.getText().replace('\n', ' ')
    url = 'http://localhost:9200/bookmarks/bookmark/%s/_create' % post['hash']
    # Build the document with json.dumps (assumes a module-level `import json`);
    # the original hand-formatted the JSON and only escaped quotes in the text
    # field, so quotes in the title produced invalid payloads.
    data = json.dumps({
        "title": post['description'],
        "url": post['href'],
        "text": post_text
    })
    r = requests.put(url, data=data)
    print(r.status_code)
def reader_processor(csv, cols=cols):
    """Process a csv that crosses stats.csv and articles.csv, saved as
    'day<date of experiment>.csv'.

    Returns:
        entities: the entity list of each individual article.
        total: the weighted entities identified across all articles.
    """
    df = pd.read_csv(csv, names=cols)
    total = []
    entities = []
    for i, row in df.iterrows():
        # Mine only articles viewed for >= 6 counts (12 seconds).
        if row["counter"] >= 6:
            # Form a new text from body and headline.
            text = row["articleBody"] + " " + row["headline"]
            # If "read more" was clicked, pull the full article with boilerpipe.
            if row["readmore"] == 1:
                try:
                    external = Extractor(extractor='ArticleExtractor', url=row["source"]).getText()
                except Exception:
                    external = ''
                text = text + " " + external
            entity, w_entity = entity_extractor(text, row["counter"])
            entities.append(entity)
            total.extend(w_entity)
    return entities, total
def extract_article_content(html, url):
    """
    Disclaimer
    ----------
    Copied from https://github.com/turi-code/how-to/blob/master/
    extract_article_content_from_HTML.py

    Description
    -----------
    Extract the primary textual content from an HTML news article.

    In many cases, the HTML source of news articles is littered with
    boilerplate text that you would not want to include when doing text
    analysis on the content of the page. Even if you could write some rules
    to extract the content from one page, it's unlikely that those rules
    would apply to an article from another site. The boilerpipe module
    allows us to solve this problem more generally.

    Parameters
    ----------
    html : str
        The source HTML from which to extract the content.
    url : str
        The url, needed for logging purposes only.

    Returns
    -------
    out : str
        The primary content of the page with all HTML and boilerplate text
        removed.

    Examples
    --------
    >>> extract_article_content(
    ...     '<html><body><p>Turi is in the business of building the best '
    ...     'machine learning platform on the planet. Our goal is to make '
    ...     'it easy for data scientists to build intelligent, predictive '
    ...     'applications quickly and at scale. Given the perplexing array '
    ...     'of tools in this space, we often get asked "Why Turi? What '
    ...     'differentiates it from tools X, Y, and Z?" This blog post aims '
    ...     'to provide some answers. I’ll go into some technical details '
    ...     'about the challenges of building a predictive application, and '
    ...     'how Turi’s ML platform can help.</p></body></html>',
    ...     'http://example.com')
    'Turi is in the business of building the best machine learning platform
    on the planet. [...] and how Turi’s ML platform can help.'

    See Also
    --------
    - `Boilerpipe project <https://code.google.com/p/boilerpipe/>`_
    - `Boilerpipe Python module <https://pypi.python.org/pypi/boilerpipe>`_
    """
    from boilerpipe.extract import Extractor
    if html and html.strip():
        try:
            extractor = Extractor(extractor='ArticleExtractor', html=html)
            return extractor.getText()
        except Exception as e:
            # Surface the failure instead of silently discarding it.
            error = "Function extract_article_content: " + url + " - " + str(e)
            print(error)
            return None
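# Usage sketch for extract_article_content(); the HTML is passed inline and
# the url argument is used only to label error messages:
#
#   content = extract_article_content(
#       '<html><body><p>Some long article text ...</p></body></html>',
#       'http://example.com/post')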
def scrape(file, split1, split2, urlName):
    links_from_RSS_feed = []
    Requests_from_RSS = requests.get('http://feeds.reuters.com/reuters/businessNews')
    Rss_soup = BeautifulSoup(Requests_from_RSS.text, "html5lib")
    with open(file, "r") as lFile:
        usedLinks = [line.strip() for line in lFile]
    for link in Rss_soup.find_all('guid'):
        links_from_RSS_feed.append(
            str(link.getText().replace('?feedType=RSS&feedName=businessNews', '')))
    with open(file, "w") as l_file:
        for item in links_from_RSS_feed:
            l_file.write(str(item) + "\n")
    no_of_links = len(links_from_RSS_feed)
    for i in range(no_of_links):
        fileName = links_from_RSS_feed[i].rsplit('/', split1)[split2]
        extractedText = Extractor(extractor='ArticleExtractor', url=urlName + fileName)
        print(fileName)
        # Write UTF-8 text directly; str(text.encode("utf-8")) would produce
        # literal "b'...'" strings under Python 3.
        with open("Data/" + str(i) + ".txt", "w", encoding="utf-8") as write_file:
            write_file.write(str(datetime.date.today()) + "\n")
            write_file.write(extractedText.getText())
    return no_of_links
def scrap_link_boilerpipe(url):
    try:
        extractor = Extractor(extractor='ArticleSentencesExtractor', url=url)
        return extractor.getText()
    except Exception:
        return False
def scrape(feed, used, excep, split1, split2, urlName, nameF):
    arrLinks = []
    req = requests.get('http://feeds.reuters.com/reuters/businessNews')
    soupRss = BeautifulSoup(req.text, "html5lib")
    with open(used, "r") as logrFile:
        usedLinks = [line.strip() for line in logrFile]
    for link in soupRss.find_all('guid'):
        arrLinks.append(str(link.getText().replace('?feedType=RSS&feedName=businessNews', '')))
    with open(used, "w") as log_file:
        for item in arrLinks:
            log_file.write(str(item) + "\n")
    for i in range(8):
        fileName = arrLinks[i].rsplit('/', split1)[split2]
        extractedText = Extractor(extractor='ArticleExtractor', url=urlName + fileName)
        print(fileName)
        with open("Data/" + str(i) + ".txt", "w", encoding="utf-8") as write_file:
            write_file.write(str(datetime.date.today()) + "\n")
            write_file.write(extractedText.getText())
def extract_body_with_boilerpipe(html):
    """Extract body text with boilerpipe.

    Extractor types:
        DefaultExtractor
        ArticleExtractor
        ArticleSentencesExtractor
        KeepEverythingExtractor
        KeepEverythingWithMinKWordsExtractor
        LargestContentExtractor
        NumWordsRulesExtractor
        CanolaExtractor

    Reference: https://github.com/misja/python-boilerpipe
    Note: set JAVA_HOME if the import fails.

    Returns
    -------
    str: extracted body text. Returns an empty string if extraction fails.
    """
    try:
        extractor = Extractor(extractor='KeepEverythingExtractor', html=html)
        extracted_text = extractor.getText()
    except Exception:
        print("Failed to extract text with boilerpipe")
        extracted_text = ""
    return extracted_text
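# A hedged comparison sketch: the same document run through two extractor
# types. ArticleExtractor targets the main article body, while
# KeepEverythingExtractor keeps every text block (navigation, footers, ...).
# `page_html` is assumed to hold the raw HTML of a fetched page:
#
#   from boilerpipe.extract import Extractor
#   article_only = Extractor(extractor='ArticleExtractor', html=page_html).getText()
#   everything = Extractor(extractor='KeepEverythingExtractor', html=page_html).getText()
#   # `everything` is typically a superset of `article_only`.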
def process_item(self, html_page):
    try:
        publish_date = examine(html_page['html'])
        from boilerpipe.extract import Extractor
        extractor = Extractor(extractor='ArticleExtractor', html=html_page['html'])
        body = str(extractor.getText())
        title = str(extractor.source.getTitle())
        art = {
            'title': title,
            'body': body,
            'lang': self.lang,
            'source': html_page['source'],
            'url': html_page['url'],
            'crawl_date': html_page['timestamp'],
            'publish_date': publish_date,
            'article_id': sha1(html_page['url'].encode('utf-8')).hexdigest(),
            'sentences': []
        }
        if self.art_ok(art['body']):
            content = art['body']
            content = content.replace(u'\xa0', u' ')
            content = content.replace('\\n', '\n')
            sents = []
            if self.lang == 'en':
                sents = sent_tokenize(content)
            else:
                for para in content.split('\n'):
                    sents += sentence_split(para, self.lang)
            sents = [sent for sent in sents if self.check_sent(sent)]
            art['sentences'] = sents
            if len(sents) >= 3:
                self.output_corpus.add_instance(art)
    except Exception:
        pass
def Text_extractor(y, page, team, team_i, counter=0):
    """Extract the text of team pages using BoilerPipe."""
    try:
        upage = urllib.parse.quote_plus(page)
        url = "http://" + y + ".igem.org/wiki/index.php?title=" + upage
        extractor = Extractor(extractor='ArticleExtractor', url=url)
    except Exception:
        counter += 1
        if counter > 10:
            print("Failed to get the text for page {}".format(page))
            return None
        # Bug fix: return the retry's result; the original recursed without
        # returning and then fell through to use an undefined `extractor`.
        return Text_extractor(y, page, team, team_i, counter=counter)
    page_slug = page.replace('/', '#')
    with open('results/%s/%s/%s_-_-_CONTENT.html' % (y, team, page_slug), 'w') as f:
        f.write(extractor.getHTML())
    path = 'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page_slug)
    with open(path, 'w') as f:
        f.write(extractor.getText())
    output = '%s\t%s\t%s\t%s\n' % (y, str(teams_id[team_i]), page, path)
    teams_pages_text_db.write(output)
def scrape(feed, used, excep, split1, split2, urlName, nameF):
    arrLinks = []
    req = requests.get('http://feeds.reuters.com/reuters/businessNews')
    soupRss = BeautifulSoup(req.text, "html5lib")
    # Check the list of already-queried links
    with open(used, "r") as logrFile:
        usedLinks = [line.strip() for line in logrFile]
    # Extract links from the initial feed, excluding non-news
    for link in soupRss.find_all('guid'):
        arrLinks.append(
            str(link.getText().replace('?feedType=RSS&feedName=businessNews', '')))
    # Store the currently extracted links so they are not repeated
    with open(used, "w") as log_file:
        for item in arrLinks:
            log_file.write(str(item) + "\n")
    # Extract stripped news content with a timestamp, omitting used links
    for item in arrLinks:
        fileName = str(item.rsplit('/', split1)[split2])
        if any(fileName in s for s in usedLinks):
            print(fileName + " has been extracted.")
        else:
            extractedText = Extractor(extractor='ArticleExtractor', url=urlName + fileName)
            print(fileName + ": New")
            with open("extractedFiles/" + nameF + fileName + ".txt", "w", encoding="utf-8") as write_file:
                write_file.write(str(datetime.date.today()) + "\n")
                write_file.write(extractedText.getText())
def get_text_boilerpipe(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        return extractor.getText()
    except Exception:
        print("Exception")
        return None
def extract_main_text(self):
    if self.res is None:
        return None
    extractor = Extractor(
        # extractor='ArticleExtractor',
        url=self.url)
    return [extractor.getText()]
def extract_text(html_content):
    try:
        extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
        return extractor.getText()
    except Exception:
        print("Exception in html extraction")
        return None
def html2text_bp(html):
    text = None
    try:
        extractor = Extractor(extractor=extractor_type, html=html)
        text = extractor.getText()
    except Exception:
        traceback.print_exc()
    return text
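# html2text_bp() reads `extractor_type` from module scope; a minimal sketch of
# how it might be configured and called (names assumed):
#
#   extractor_type = 'ArticleExtractor'
#   text = html2text_bp('<html><body><p>Hello.</p></body></html>')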
def get_text_boilerpipe(url):
    try:
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        extracted_text = extractor.getText()
    except BaseException as error:
        extracted_text = 'error: {}'.format(error)
        print('error: {}'.format(error))
    return extracted_text
def sentences_from_urls(url: str, extractor_name=EXTRACTORS[0], model=MODELS[0],
                        min_words=0, with_proba=False, return_raw=False):
    extractor = Extractor(extractor=extractor_name)
    model = models[model]
    extracted_text = extractor.getTextBlocks(url=url)
    if len(extracted_text) > 0:
        func = model.predict_proba if with_proba else model.predict
        return func(extracted_text, min_words=min_words, return_raw=return_raw)
def getBoilerPlate(url):
    try:
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        extracted_text = extractor.getText().replace('\n', '')
        return '', extracted_text
    except Exception:
        return '', ''
def ParserBoilerDefault(html_object):
    # Bug fix: extract inside the try so extraction errors are caught too.
    try:
        extractor = Extractor(extractor='DefaultExtractor', html=html_object)
        sents = extractor.getText()
        return list(nlp(sents).sents)
    except Exception:
        return None
def ParserBoilerArticle(html_object):
    # Bug fix: extract inside the try so extraction errors are caught too.
    try:
        extractor = Extractor(extractor='ArticleSentencesExtractor', html=html_object)
        sents = extractor.getText()
        return list(nlp(sents).sents)
    except Exception:
        return None
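# The three ParserBoiler* helpers above differ only in the boilerpipe
# extractor they use and in whether spaCy sentence segmentation is applied:
# ParserBoilerEverything returns raw DefaultExtractor text, while the other
# two return spaCy sentence spans. Illustrative call, assuming `nlp` is a
# loaded spaCy pipeline and `page_html` holds fetched HTML:
#
#   sentences = ParserBoilerArticle(page_html)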