def build(self, urls, docs=None, extacted_cache=False):
    """ """
    indexed_docs = []
    doc_id = 0
    # ---------------------------------
    if docs and len(urls) != len(docs):
        raise ValueError
    for i, URL in enumerate(urls):
        doc = StrictIndexDocument()
        doc.id, doc.url = doc_id, URL
        doc_id += 1

        if os.path.exists('./data/extracted-' + str(doc_id) + '-cached.txt'):
            with codecs.open('./data/extracted-' + str(doc_id) + '-cached.txt', 'r', encoding='utf-8') as f:
                extr_text = f.read()
        else:
            if docs:
                extractor = Extractor(extractor='ArticleExtractor', html=docs[i])
            else:
                extractor = Extractor(extractor='ArticleExtractor', url=URL)
            extr_text = extractor.getText()

            # ---------------------------------
            if len(extr_text) < len(docs[i]) * self.threshold_extractor_fails:
                if docs:
                    exKeepAll = Extractor(extractor='KeepEverythingExtractor', html=docs[i])
                else:
                    exKeepAll = Extractor(extractor='KeepEverythingExtractor', url=URL)
                extr_text = exKeepAll.getText()

            # ---------------------------------
            if extacted_cache:
                with codecs.open('./data/extracted-' + str(doc_id) + '-cached.txt', 'w', encoding='utf-8') as f:
                    print >>f, extr_text

        doc.words = self.extract_words(extr_text)
        doc.n_words = len(doc.words)

        paragraph = self.re_set_paragraph.sub(u' ', extr_text)
        sentences, poss = self.SS.predict(paragraph)

        # with codecs.open(u'data/outs.txt', 'a', encoding='utf-8') as f_out:
        #     print >>f_out, u'\n\n-----\n', u','.join([str(p) for p in poss])
        #     # print >>f_out, u','.join([str(p) for p in poss2])
        #     for s in sentences:
        #         print >>f_out, s, '\n'

        poss = [0] + list(poss)
        doc.sentences = [(poss[i], poss[i+1], sentences[i]) for i in xrange(len(poss)-1)]
        doc.n_sentences = len(doc.sentences)

        indexed_docs.append(doc)

    return indexed_docs
def get_articles(url):
    doc = urllib.request.urlopen(url)
    docContent = BeautifulSoup(doc, 'html.parser')
    articles = []
    for element in docContent.find_all('div'):
        try:
            if element.attrs['style'] == 'width:550px':
                article = defaultdict(str)
                article_link = 'http://www.moneycontrol.com' + element.a['href']
                for p in element.find_all('p'):
                    if 'a_10dgry' in p.attrs['class']:
                        article_time = p.contents[0].split('|')[0]
                        article_date = p.contents[0].split('|')[1][:-1]
                        article['link'] = article_link
                        article['time'] = article_time
                        article['date'] = article_date
                        extractor = Extractor(extractor='ArticleExtractor', url=article_link)
                        article['content'] = extractor.getText()
                        article['title'] = BeautifulSoup(extractor.getHTML(), 'html.parser').find_all('h1')[0].contents[0]
                        articles.append(article)
                        break
        except:
            logging.debug('div has no width attribute')
    return articles
def extract_blog_posts(url_string, PAGES=48):
    blog_posts = []
    page_count = 0
    while (page_count <= PAGES):
        page_count += 1
        url = url_string.format(page_count)  # create url
        driver.get(url)
        try:
            article = driver.find_elements_by_tag_name('article')
            articles_size = len(article)
            print 'processing ', url
        except SocketError as e:
            if e.errno != errno.ECONNRESET:
                raise  # Not error we are looking for
            continue
        for i in xrange(articles_size):
            headers = article[i].find_elements_by_tag_name("header")
            for header in headers:
                article_a = header.find_elements_by_xpath("//h1/a[@title]")
                print 'extracting ...'
                for e in article_a:
                    extractor = Extractor(extractor='ArticleExtractor', url=e.get_attribute('href'))
                    texts = extractor.getText()
                    blog_posts.append({'title': e.text,
                                       'content': clean_html(texts),
                                       'link': e.get_attribute('href')})
    return blog_posts
def parse_item(self, response):
    response_news = NewsItem()
    response_news['url'] = response.url
    response_news['html'] = Binary(zlib.compress(response.body, 9))
    extractor = Extractor(extractor='ArticleExtractor', html=response.body)
    response_news['content'] = extractor.getText()
    return response_news
def process_text(self, text):
    if text == "":
        return text
    extractor = Extractor(extractor='ArticleExtractor', html=text)
    new_val = extractor.getText()
    return new_val
def parse(self, response):
    hxs = Selector(response)
    item = ArticleItem()
    item["title"] = hxs.xpath('//title/text()').extract()
    item["link"] = response.url
    item["source"] = hxs.xpath('//p').extract()

    extractor = Extractor(extractor='ArticleExtractor', url=item["link"])
    source = extractor.getHTML()
    item["text"] = extractor.getText()
    item["html"] = source

    page = html.fromstring(source)
    links = page.xpath("//p//a/@href")
    linkPattern = re.compile("^(?:ftp|http|https):\/\/(?:[\w\.\-\+]+:{0,1}[\w\.\-\+]*@)?(?:[a-z0-9\-\.]+)(?::[0-9]+)?(?:\/|\/(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+)|\?(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+))?$")

    for link in links:
        if linkPattern.match(link) and not link in self.crawled_links:
            self.crawled_links.append(link)
            yield Request(link, self.parse)

    yield item
def run(self):
    count = 0
    docCount = self.doc_cursor.count()
    for doc in self.doc_cursor:
        url = doc['url']
        if (self.keepText(url)):
            try:
                extractor = Extractor(extractor='ArticleExtractor', url=url)
                extracted_text = extractor.getText()
                if (len(extracted_text) > 0):
                    title = extractor.getTitle()
                    if title != None:
                        doc['title'] = title
                        doc['extracted_text'] = title + " " + extracted_text
                    else:
                        doc['extracted_text'] = extracted_text
                    self.db_collection.save(doc)
                    print 'OK -' + url
            except IOError, err:
                print "IOError with url " + url
                print str(err)
            except (LookupError):
                print "LookupError - Maybe not text or weird encoding " + url
            except (UnicodeDecodeError, UnicodeEncodeError):
                print "UnicodeDecodeError or UnicodeEncodeError- " + url
def get_text(url):
    from boilerpipe.extract import Extractor
    try:
        extractor = Extractor(extractor='DefaultExtractor', url=url)
        return extractor.getText(), extractor.getHTML()
    except:
        return "", ""
def extract_article(url):
    r = requests.get(url)

    # if the url exists, continue
    if r.status_code == 200:

        # extract and parse response url
        url = parse_url(r.url)

        # extract html
        html = r.content.decode('utf-8', errors='ignore')

        # run boilerpipe
        BP = Extractor(html=html)

        # run readability
        Rdb = Document(html)
        html = Rdb.summary()

        # return article data
        return {
            'extracted_title': Rdb.short_title().strip(),
            'extracted_content': strip_tags(BP.getText()),
        }

    # otherwise return an empty dict
    else:
        return {}
def GOOGLE_get_data(company):
    google_news_rss_url = "https://news.google.com/news/?q=%s&output=rss" % company
    rss_feed = feedparser.parse(google_news_rss_url)
    content_list = list()
    for entry in rss_feed['entries']:
        title = entry['title']
        link = entry['link']
        try:
            news_page = urllib2.urlopen(link).read()
            extractor = Extractor(extractor='ArticleExtractor', html=news_page)
        except:
            continue
        content = extractor.getText()
        now = datetime.datetime.now()
        content_list.append({"title": title,
                             "article": content,
                             "link": link,
                             "source": "GOOGLE",
                             "target": company,
                             "date": "%04d%02d%02d" % (now.year, now.month, now.day),
                             "hash": hashlib.sha224(title.encode("UTF-8")).hexdigest()})
    DBOperation.save_db(content_list)
def extract_and_save(url, path):
    try:
        handle = urllib2.urlopen(url)
        html_content = handle.read()
        extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
        text = extractor.getText()
        if text:
            if detect_english(text):
                links = get_all_urls(html_content, url)
                for link in links:
                    try:
                        # fetch each discovered link before saving it
                        handle = urllib2.urlopen(link)
                        html_content = handle.read()
                        #extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
                        #text_content = extractor.getText()
                        #if text_content:
                        #    if detect_english(text_content):
                        encoded_url = encode(link)
                        f = open(path + "/" + encoded_url, "w")
                        f.write(html_content)
                        f.close()
                    except:
                        print url
                        traceback.print_exc()
                        return None
    except:
        print url
        traceback.print_exc()
        return None
def extractor(URL):
    extractor = Extractor(extractor='ArticleExtractor', url=URL)
    data = extractor.getText()
    file = open("data.txt", "w")
    file.write(data.encode('UTF-8'))
    file.close()

    # Split the content into sentences
    with open('data.txt', 'r') as f:
        s = f.read()
    sentences = s.split('.')

    # Empty list of words
    w = []

    # Split the sentences into words
    for sentence in sentences:
        w.extend(sentence.split(' '))
    print w

    # Return the list of words
    return w
def download_article_file(articleURL, articleFileDirectory, code):
    articleFilePath = articleFileDirectory + code

    # Download the article and save as file
    if (articleURL == ""):
        print "ERROR: Empty URL detected! File not created"
        return None
    else:
        # If a directory for files doesn't exist, create it
        dir = os.path.dirname(articleFileDirectory)
        if not os.path.isdir(dir):
            #print "Created directory: " + dir
            os.makedirs(dir)

        try:
            #fullArticle = urllib2.urlopen(articleURL)
            #fullArticleText = fullArticle.read()

            # Use boilerpipe to remove boilerplate and formatting
            extractor = Extractor(extractor='ArticleExtractor', url=articleURL)
            fullArticleText = extractor.getText()

            # Test to see if article is in English. If not, then return None
            top_language = cld.detect(fullArticleText.encode('utf-8'))[0]
            if (top_language != 'ENGLISH'):
                print "SKIPPED: Article is in " + top_language
                return None

            outfile = open(articleFilePath, 'w+')
            outfile.write(fullArticleText.encode('ascii', 'ignore'))
            outfile.close()

            # Use lxml's HTML cleaner to remove markup
            #htmltree = lxml.html.fromstring(fullArticleText)
            #cleaner = lxml.html.clean.Cleaner(remove_unknown_tags=True)
            #cleaned_tree = cleaner.clean_html(htmltree)
            #return cleaned_tree.text_content()
            return fullArticleText

        except urllib2.HTTPError:
            print "ERROR: HTTPError. Article file download skipped: " + articleURL
            return None
        except urllib2.URLError:
            print "ERROR: URLError. Article file download skipped: " + articleURL
            return None
        except LookupError:
            print "ERROR: LookupError. Article file download skipped: " + articleURL
            return None
        except UnicodeDecodeError:
            print "ERROR: UnicodeDecodeError. Article file download skipped: " + articleURL
            return None
        except:
            print "ERROR: ", sys.exc_info()[0]
            return None
def detag_html_file(infile, outfile, id):
    from boilerpipe.extract import Extractor

    if not USE_BOILERPLATE:
        return detag_html_file_bs(infile, outfile, id)

    tempfile = "%s.tmp.html" % (infile,)  # boilerplate seems to need an html extension
    try:
        copyfile(infile, tempfile)
        extractor = Extractor(extractor='ArticleExtractor', url="file://" + tempfile)
        os.unlink(tempfile)

        extracted_text = extractor.getText()
        extracted_html = extractor.getHTML()

        soup = BeautifulSoup(extracted_html)
        output = codecs.open(outfile, encoding='utf-8', mode='w')
        output.write(u"<DOC>\n<DOCNO>" + unicode(id) + u"</DOCNO>\n<DOCHDR>\n</DOCHDR>\n")
        head = soup.find('head')
        if head:
            title_tag = head.find('title')
            if title_tag and title_tag.string:
                output.write(u"<TITLE>" + title_tag.string.replace('\n', ' ') + u"</TITLE>\n")
        extract_para(soup, output)
        output.write(u"</DOC>\n")
        output.close()
    except Exception, exc:
        try:
            os.unlink(tempfile)
        except:
            pass
        return detag_html_file_bs(infile, outfile, id)
def post_index(post):
    extractor = Extractor(extractor='ArticleExtractor', url=post['href'])
    post_text = extractor.getText().replace('\n', ' ')
    url = 'http://localhost:9200/bookmarks/bookmark/%s/_create' % post['hash']
    data = '{"title":"%s", "url":"%s", "text":"%s"}' % (post['description'], post['href'], post_text.replace('"', '\\"'))
    r = requests.put(url, data=data)
    print r.status_code
def parse_page(self, response):
    if response.meta.has_key('crawldepth'):
        depth = response.meta['crawldepth']
    else:
        # Set search depth here
        depth = 1
    log.msg('Depth = %s' % str(depth), level=log.INFO)
    if not isinstance(response, HtmlResponse):
        log.msg('Not an HTML file: %s' % response.url, level=log.WARNING)
        return

    log.msg('Response from: %s' % response.url, level=log.INFO)
    url_bf.add(response.url)

    # TODO: Extract page title
    extractor = Extractor(extractor='ArticleExtractor', html=response.body_as_unicode())
    cleaned_text = extractor.getText()

    # Eliminate duplicates
    keywordset = set(keywordlist)

    found_list = []
    for keyword in keywordset:
        # TODO: Is there a more efficient way to do this?
        # Look at word boundaries to match entire words only
        if (re.search(r'\b' + re.escape(keyword) + r'\b', cleaned_text)):
            found_list.append(keyword)

    # Parse this page
    item = BiffleItem()
    if (len(found_list) > 0):
        item['url'] = response.url
        item['body'] = cleaned_text
        item['keywords'] = ', '.join(found_list)
        item['process_date'] = datetime.today()
        log.msg("Keyword(s) found: %s" % ', '.join(found_list), level=log.INFO)
        self.map_keyword_count(found_list)
        yield item

    if (depth > 0):
        # Find the next requests and yield those
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//a/@href').extract()
        log.msg('Links on page: %s' % len(links), level=log.INFO)
        depth -= 1
        log.msg('Depth has been decremented, new value = %s' % str(depth), level=log.INFO)
        for l in links:
            l = urlparse.urljoin(response.url, l)
            if (l in url_bf):
                pass
                #log.msg('Duplicate URL found: %s' % l, level=log.INFO)
            else:
                url_bf.add(l)
                #log.msg('Found link: %s | From URL: %s' % (l, response.url), level=log.INFO)
                # Decrement depth for next layer of links
                #callback = lambda response, depth = depth: self.parse_page(response, depth)
                callback = lambda response: self.parse_page(response)
                request = Request(l, callback=callback)
                request.meta['crawldepth'] = depth
                yield request
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("raw_dir_path")
    parser.add_argument("out_file_path")
    args = parser.parse_args()

    f_names = [(int(f), f) for f in listdir(args.raw_dir_path)]
    f_names = sorted(f_names)
    fout = open(args.out_file_path, 'w')
    for int_f_name, f_name in f_names:
        trec_reader = TrecReader(join(args.raw_dir_path, f_name))
        empty_cnt = 0
        err_cnt = 0
        for docno, html_text in trec_reader:
            if not html_text:
                empty_cnt += 1
            try:
                extractor = Extractor(extractor='ArticleExtractor', html=html_text)
                text = extractor.getText()
                text = text.replace('\n', ' ').replace('\t', ' ')
                text = text.encode('ascii', 'ignore')
                text = text_clean(text)
                if text:
                    fout.write(docno + '\t' + text + '\n')
                else:
                    empty_cnt += 1
            except Exception as e:
                err_cnt += 1

    fout.close()
    print empty_cnt, err_cnt
def get_text_boilerpipe(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        return extractor.getText()
    except:
        print "Exception"
        return None
def Process(DocIn, OutName):
    out = open(OutName, 'w')
    logging.info('reading [%s]', DocIn)
    ErrCnt = 0
    EmptyCnt = 0
    for cnt, line in enumerate(open(DocIn)):
        vCol = line.strip().split('\t')
        DocNo = vCol[0]
        RawHtml = ' '.join(vCol[1:])
        RawHtml = DiscardHTMLHeader(RawHtml)
        if "" == RawHtml:
            EmptyCnt += 1
            continue
        try:
            extractor = Extractor(extractor='ArticleExtractor', html=RawHtml)
            text = extractor.getText()
            text = text.replace('\n', ' ').replace('\t', ' ')
            text = text.encode('ascii', 'ignore')
            text = TextClean(text)
            if "" != text:
                print >>out, DocNo + '\t' + text
            else:
                EmptyCnt += 1
            # print DocNo + '\t' + text.encode('ascii','ignore')
        except Exception as e:
            ErrCnt += 1
        if 0 == (cnt % 100):
            logging.info('parsed [%d] doc [%d] Err [%d] Empty', cnt, ErrCnt, EmptyCnt)
    out.close()
    logging.info('finished [%d] doc [%d] Err', cnt, ErrCnt)
def dehydrate(self, bundle):
    """GET Method"""
    #print bundle.data['content']
    if bundle.data['content']:
        extractor = Extractor(extractor='ArticleExtractor', html=bundle.data['content'])
        bundle.data['content'] = extractor.getText()

    try:
        article_stats = ArticleStat.objects.filter(article_id=bundle.obj.id)
        bundle.data['stat'] = {
            'reads': sum(map(lambda x: x.reads, article_stats)),
            'likes': sum(map(lambda x: x.likes, article_stats)),
            'dislikes': sum(map(lambda x: x.dislikes, article_stats)),
            'shares': sum(map(lambda x: x.shares, article_stats)),
        }
    except ObjectDoesNotExist:
        bundle.data['stat'] = {
            'reads': 0,
            'likes': 0,
            'dislikes': 0,
            'shares': 0,
        }

    # no cookies or no sessionid field in cookies, then just send normal
    # newsfeed to anonymous user
    #if not bundle.request.COOKIES or not bundle.request.COOKIES['sessionid']:
    if not bundle.request.COOKIES or not 'sessionid' in bundle.request.COOKIES:
        return bundle

    try:
        # even if there is a cookie, sessionid field might be not exist,
        # then it is also anonymous user
        s = get_current_session(bundle.request.COOKIES['sessionid'])
        if s is None or 'user_id' not in s:
            return bundle

        # get activity information whether user has already
        # read/liked/shared
        activity = Activities.objects.get(user_id=s['user_id'], \
                                          article_id=bundle.obj.id)

        # assign information
        bundle.data['activity'] = {
            'read': activity.like or activity.share,
            'like': activity.like,
            'dislike': activity.dislike,
            'share': activity.share
        }
    except ObjectDoesNotExist:
        # assign False if the news has never been opened
        bundle.data['activity'] = {
            'read': False,
            'like': False,
            'dislike': False,
            'share': False
        }

    return bundle
def extract_text(html_content):
    try:
        extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
        #print extractor.getText()
        return extractor.getText()
    except:
        print "Exception in html extraction"
        return None
def extract_article(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        text_string = extractor.getText()
        text_string = htmlParser.unescape(text_string)
    except Exception:
        logger.error('Error extracting article html')
        text_string = ''
    return text_string
def get_news_by_url(url):
    print "Come to get_news_by_url"
    article = {}
    try:
        soup = BeautifulSoup(urllib2.urlopen(url))

        "Get the title of News"
        title = ""
        titleElements = soup.findAll(id="disqus_title")
        for ele in titleElements:
            title = ele.getText().encode('utf-8')
        article["title"] = title
        print title

        "Get the posttime of News,Timezone ET"
        postTime = ""
        postTimeElements = soup.findAll(attrs={'class': "datestamp"})
        for ele in postTimeElements:
            timeStamp = float(ele["epoch"])
            postTime = datetime.fromtimestamp(timeStamp / 1000)
        article["post_time"] = postTime

        "Initiate the post date"
        postDay = postTime.date()
        article["post_date"] = postDay

        "Get the author information"
        author = ""
        authorElements = soup.findAll(attrs={'class': "byline"})
        for ele in authorElements:
            author = ele.contents[0].strip().replace("By", "").replace("-", "").replace("and", ",").strip()
        article["author"] = author

        "Get the content of article"
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        content = extractor.getText().encode("utf-8")
        article["content"] = content

        "Initiate the Sources"
        source = "Bloomberg News"
        article["source"] = source

        "Initiate the update_time"
        updateTime = datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S")
        article["update_time"] = updateTime

        "Initiate the embers_id"
        embersId = hashlib.sha1(content).hexdigest()
        article["embers_id"] = embersId

        "settup URL"
        article["url"] = url
    except:
        print "Error: %s" % sys.exc_info()[0]
        article = {}
    finally:
        return article
def extract_article(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        text_string = extractor.getText()
        text_string = htmlParser.unescape(text_string)
        text_string = unicodedata.normalize('NFKD', text_string).encode('ascii', 'ignore')
    except Exception:
        print 'Error extracting article html'
        text_string = ''
    return text_string
def test_boilerpipe():
    your_url = "http://stackoverflow.com/questions/9352259/trouble-importing-boilerpipe-in-python"
    extractor = Extractor(extractor='ArticleExtractor', url=your_url)
    extracted_html = extractor.getHTML()
    extracted_text = extractor.getText()

    print '\nfunction: %s ' % inspect.stack()[0][3]
    print 'extracted html: %i text: %i' % (len(extracted_html), len(extracted_text))
    print ''

    n.assert_greater(len(extracted_text), min_str_length)
def html_to_text(html):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html)
    except Exception as e:
        logger.exception('\nError extracting text from html. Exception: %s, %s',
                         e.__class__.__name__, e)
        return ''
    text = extractor.getText()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    return text
def fetch_articles(self):
    greq_gen = (grequests.get(u, headers=self.header,) for u in self.urls)
    responses = grequests.map(greq_gen)
    for i, res in enumerate(responses):
        if res is not None:
            extractor = Extractor(html=res.text)
            self.entries[i]['text'] = extractor.getText()
            if '...' in self.entries[i]['title']:
                self.entries[i]['title'] = extractor.getTitle()
    return True
def extract(self, article):
    try:
        extractor = Extractor(extractor='ArticleSentencesExtractor', url=article.url)
    except Exception as e:
        return ''
    article_text = ''
    try:
        article_text = extractor.getText()
    except Exception:
        pass
    return article_text.encode('utf-8')
def main():
    contents = sys.argv[1]
    for url in listdir(contents):
        print url
        with codecs.open(url, "w", encoding="utf-8") as out:
            try:
                html = urlopen(url.replace("{", "/")).read()
                extracted = Extractor(html=html)
                out.write(extracted.getText())
            except HTTPError:
                out.write("")
def boiler():
    from boilerpipe.extract import Extractor
    for i in range(0, 1000):
        input_filename = 'page/' + str(i) + '.txt'
        output_filename = 'boilerpipe/' + str(i) + '.txt'
        input_file = open(input_filename, 'r')
        s = input_file.read()
        input_file.close()
        extractor = Extractor(extractor='ArticleExtractor', html=s.decode('GBK', 'ignore'))
        output_file = open(output_filename, 'wb')
        output_file.write(extractor.getText().encode('utf-8'))
        output_file.close()
def summarize(url=None, html=None, n=100, cluster_threshold=5, top_sentences=5):
    # Adapted from "The Automatic Creation of Literature Abstracts" by H.P. Luhn
    #
    # Parameters:
    # * n - Number of words to consider
    # * cluster_threshold - Distance between words to consider
    # * top_sentences - Number of sentences to return for a "top n" summary

    # Begin - nested helper function
    def score_sentences(sentences, important_words):
        scores = []
        sentence_idx = -1

        for s in [nltk.tokenize.word_tokenize(s) for s in sentences]:
            sentence_idx += 1
            word_idx = []

            # For each word in the word list...
            for w in important_words:
                try:
                    # Compute an index for important words in each sentence
                    word_idx.append(s.index(w))
                except ValueError as e:  # w not in this particular sentence
                    pass

            word_idx.sort()

            # It is possible that some sentences may not contain any important words
            if len(word_idx) == 0:
                continue

            # Using the word index, compute clusters with a max distance threshold
            # for any two consecutive words
            clusters = []
            cluster = [word_idx[0]]
            i = 1
            while i < len(word_idx):
                if word_idx[i] - word_idx[i - 1] < cluster_threshold:
                    cluster.append(word_idx[i])
                else:
                    clusters.append(cluster[:])
                    cluster = [word_idx[i]]
                i += 1
            clusters.append(cluster)

            # Score each cluster. The max score for any given cluster is the score
            # for the sentence.
            max_cluster_score = 0
            for c in clusters:
                significant_words_in_cluster = len(c)
                total_words_in_cluster = c[-1] - c[0] + 1
                score = 1.0 * significant_words_in_cluster * significant_words_in_cluster / total_words_in_cluster

                if score > max_cluster_score:
                    max_cluster_score = score

            scores.append((sentence_idx, score))

        return scores
    # End - nested helper function

    extractor = Extractor(extractor='ArticleExtractor', url=url, html=html)

    # It's entirely possible that this "clean page" will be a big mess. YMMV.
    # The good news is that the summarize algorithm inherently accounts for handling
    # a lot of this noise.
    txt = extractor.getText()

    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]

    words = [w.lower() for sentence in normalized_sentences
             for w in nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    top_n_words = [w[0] for w in fdist.items()
                   if w[0] not in nltk.corpus.stopwords.words('english')][:n]

    scored_sentences = score_sentences(normalized_sentences, top_n_words)

    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter
    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-top_sentences:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # Decorate the post object with summaries
    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])
def bp_extract(url):
    extr = Extractor(extractor='ArticleExtractor', url=url)
    text = extr.getText()
    print(text)
def textify(html_text, extractor="raw", encoding="UTF8"):
    if not isinstance(html_text, unicode):
        try:
            html_text_unicode = unicode(html_text, encoding)
        except UnicodeDecodeError:
            try:
                html_text_unicode = unicode(html_text, 'utf-8')
            except UnicodeDecodeError:
                try:
                    html_text_unicode = unicode(html_text, 'iso-8859-1')
                except UnicodeDecodeError:
                    try:
                        html_text_unicode = unicode(html_text, 'cp1252')
                    except UnicodeDecodeError as e:
                        print "ERROR conv to unicode", e
    else:
        html_text_unicode = html_text
    if not html_text_unicode:
        return ""

    if extractor.lower() != "raw":
        try:
            from boilerpipe.extract import Extractor
            bp = Extractor(extractor=extractor, html=html_text_unicode)
            return bp.getText()
        except Exception:
            try:
                bp = Extractor(extractor=extractor, html=html_text_unicode)
                return bp.getText()
            except Exception as e:
                sys.stderr.write("ERROR running %s boilerpipe on %s:\n%s: %s\n"
                                 % (extractor, html_text, type(e), e))
                return ""
        del bp
    else:
        text = html_text_unicode

        ### Entity Nonsense from A. Swartz's html2text http://www.aaronsw.com/2002/html2text/html2text.py ###
        def name2cp(k):
            if k == 'apos':
                return ord("'")
            if hasattr(htmlentitydefs, "name2codepoint"):  # requires Python 2.3
                return htmlentitydefs.name2codepoint[k]
            else:
                k = htmlentitydefs.entitydefs[k]
                if k.startswith("&#") and k.endswith(";"):
                    return int(k[2:-1])  # not in latin-1
                return ord(codecs.latin_1_decode(k)[0])

        def charref(name):
            if name[0] in ['x', 'X']:
                c = int(name[1:], 16)
            else:
                c = int(name)
            try:
                return unichr(c)
            except NameError:  # Python3
                return chr(c)

        def entityref(c):
            try:
                name2cp(c)
            except KeyError:
                return "&" + c + ';'
            else:
                try:
                    return unichr(name2cp(c))
                except NameError:  # Python3
                    return chr(name2cp(c))

        def replaceEntities(s):
            s = s.group(1)
            if s[0] == "#":
                return charref(s[1:])
            else:
                return entityref(s)

        r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")

        def unescape(s):
            s = s.replace('&nbsp;', ' ')
            return r_unescape.sub(replaceEntities, s)
        ### End Entity Nonsense ###

        re_clean_comments = re.compile(r'<!--.*?-->', re.I | re.DOTALL)
        re_clean_javascript = re.compile(r'<script[^>]*/?>.*?</script>', re.I | re.DOTALL)
        re_clean_style = re.compile(r'<style[^>]*/?>.*?</style>', re.I | re.DOTALL)
        re_clean_balises = re.compile(r'<[/!?]?\[?[a-z0-9\-]+[^>]*>', re.I | re.DOTALL)
        #re_clean_blanks = re.compile(r'[ \s]+')
        re_clean_blanks = re.compile(r'[ \t\f\v]+')
        re_clean_multiCR = re.compile(r'( ?[\n\r]+)+', re.M)
        try:
            text = unescape(text)
            text = re_clean_blanks.sub(' ', text)
            text = re_clean_comments.sub(' ', text)
            text = re_clean_javascript.sub(' ', text)
            text = re_clean_style.sub(' ', text)
            text = re_clean_balises.sub(' ', text)
            text = re_clean_blanks.sub(' ', text).strip()
            text = re_clean_multiCR.sub('\n\r', text)
        except:
            pass
        return text
def unpack_line(line):
    line = string.replace(line, " ", " ")
    els = string.split(line, " ")
    url = els[2]
    categories = els[1]
    number = els[0]
    return number, categories, url


file_base = open('Main_base.txt', 'r')
Line = file_base.readlines()
file_extract = open('Dbase_exctract.txt', 'w')

for line in Line:
    try:
        number1, adress1, sites1 = unpack_line(line)
        number = number1.strip("\n")
        adress = adress1.strip("\n")
        sites = sites1.strip("\n")
        extractor = Extractor(extractor='ArticleExtractor', url=sites)
        file_ex = open('ExtractSites/' + adress + '/' + number + 'forclass.txt', 'w')
        file_ex.write(extractor.getText().encode("UTF-8"))
        file_ex.close()
        file_extract.write(line)
    except:
        print line

file_base.close()
file_extract.close()
with open('finalurls.txt') as fp:
    createDirectories()
    with open('output/hashedUrls.csv', 'w') as csvfile:
        fieldnames = ['url', 'key']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for line in fp:
            md5value = hashlib.md5(line.strip().encode('utf-8')).hexdigest()
            writer.writerow({'url': line.strip(), 'key': md5value})
            print(count, ":", md5value)
            count = count + 1
            try:
                rawhtml = urllib.request.urlopen(line.strip()).read()
                with open('output/rawHtml/%s.html' % md5value, 'w+', encoding='utf-8') as rawf:
                    print(rawhtml, file=rawf)
                extractor = Extractor(extractor='ArticleExtractor', html=rawhtml)
                htmlText = extractor.getText()
                with open('output/processedHtml/%s.txt' % md5value, 'w+', encoding='utf-8') as processedf:
                    print(htmlText, file=processedf)
                # print(htmlText)
            except KeyboardInterrupt:
                exit()
            except:
                pass
def extract_rss_articles(rss):
    new_entries_inserted = 0
    try:
        # rss parser
        rss_feed = feedparser.parse(rss)
    except:
        logging.warn('Warn: Parsing failed for rss source={}'.format(rss))
        return 0

    for entry in rss_feed['entries']:
        # title extracted
        if 'title' in entry.keys():
            title = entry.title
        else:
            continue

        # link extracted
        if 'link' in entry.keys():
            link = entry.link
            source = link.split("//")[-1].split("/")[0]
        else:
            continue

        id = hashlib.md5((title + link).encode("utf-8")).hexdigest()
        if new_id(id):
            # date of publish of article extracted
            if 'published_parsed' in entry.keys():
                published_date = entry.published_parsed
                published_date = datetime.fromtimestamp(
                    mktime(published_date)).isoformat()
                published_date = published_date.split("T")[0]
            else:
                published_date = "0000-00-00"
            #print(published_date)

            # summary of article extracted
            if 'summary' in entry.keys():
                summary = entry['summary']
            else:
                summary = ""
            TAG_RE = re.compile(r'<[^>]+>')
            summary = TAG_RE.sub('', summary)

            # extract full content of article
            content = ""
            if rss != "https://services.india.gov.in/feed/rss?cat_id=12&ln=en":
                if rss == "http://goidirectory.nic.in/rss/minstry_rss.php?categ_id=1":
                    try:
                        response = requests.get(link)
                        paragraphs = justext.justext(
                            response.content, justext.get_stoplist("English"))
                        for paragraph in paragraphs:
                            if not paragraph.is_boilerplate:
                                content = content + paragraph.text
                    except:
                        content = ""
                else:
                    try:
                        extractor = Extractor(
                            extractor='ArticleSentencesExtractor', url=link)
                        content = extractor.getText()
                    except:
                        content = ""
            else:
                content = summary

            if content == "" or content == "unknown":
                continue

            # insert article into database
            try:
                cursor.execute('use main_database')
                cursor.execute(
                    'insert english_database values (%s,%s,%s,%s,%s,%s,%s)',
                    (id, published_date, title, link, source, summary, content))
                logging.info(
                    'Info: New Article pushed into database from {}'.format(source))
                conn.commit()
                print("Article Fetched")
            except Exception as error:
                logging.info(
                    'Warn: Article cannot be pushed from source {}, error={}'.format(source, error))
                continue

            # insert the link of this article into viewed_links.txt, since it has been viewed
            with open('viewed_articles_ids.txt', 'a') as f:
                f.write('{}\n'.format(id))
            new_entries_inserted = new_entries_inserted + 1

    print("rss source processed")
    # return count of new entries inserted into database
    return new_entries_inserted
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import with_statement

import sys
import os

from boilerpipe.extract import Extractor

sys.path.insert(0, os.path.abspath('..'))
from clint import args

if __name__ == '__main__':
    html_file = args.get(0)
    html = open(html_file).read()
    extractor = Extractor(extractor='ArticleExtractor', html=html)
    print extractor.getText().encode('utf-8')
def parse_readings():
    """
    Reads from list generated by parse_course().
    Reads each readings page from scrape_readings().
    Parses the HTML.
    Writes to JSON.
    """
    # Use the json list of readings if it exists
    try:
        with open('%s/%s' % (data_dir, links_file), 'r') as jsonfile:
            readings = json.loads(jsonfile.read())
    # otherwise, generate it
    except FileNotFoundError:
        parse_course()
        with open('%s/%s' % (data_dir, links_file), 'r') as jsonfile:
            readings = json.loads(jsonfile.read())

    # Create lists to hold readings
    reading_list = []
    pdf_list = []
    error_list = []

    for reading in readings:
        # Skip pdf files
        if '.pdf' in reading['url']:
            pdf_list.append(reading)
        else:
            # Container for parsed data
            reading_item = {}

            # Use goldfinch to make a valid filename from the URL
            filename = vfn(reading['url'], initCap=False).decode()

            # Initialize a newspaper article
            # url is empty because we don't need newspaper to do any scraping
            # but it's a required property
            article = Article(url='')

            # Open the local version of the HTML file
            try:
                with open('%s/%s/%s' % (data_dir, readings_html_dir, filename), 'r') as htmlfile:
                    # Save both the raw html and add it to the article
                    raw_html = htmlfile.read()
                    article.set_html(raw_html)
            except FileNotFoundError:
                print('Error reading saved html file')

            # Use newspaper to do the parsing
            article.parse()
            reading_item['title'] = article.title
            reading_item['authors'] = article.authors

            # Set iso string version of date if it exists.
            # It needs to be a string because we'll be exporting to JSON
            reading_item['pub_date'] = article.publish_date.isoformat() \
                if article.publish_date else None

            # Usually newspaper's extractor works best
            reading_item['n_text'] = article.text

            # But when it fails, we may want to use boilerpipe extraction as
            # a fallback
            extractor = Extractor(extractor='ArticleExtractor', html=raw_html)
            reading_item['b_text'] = extractor.getText()

            # print('Newspaper words: %s' % len(reading_item['n_text'].split()))
            # print('Boilerpipe words: %s' % len(reading_item['b_text'].split()))

            # if(reading_item['text'] == ''):
            #     extractor = Extractor(extractor='ArticleExtractor', html=raw_html)
            #     reading_item['text'] = extractor.getText()

            # Add the parsed data to our existing reading data
            reading['page'] = reading_item

            # Note failed parses
            if (reading_item['n_text'] == '' and reading_item['b_text'] == ''):
                print('Could not parse text for %s' % reading['url'])
                error_list.append(reading)
            else:
                reading_list.append(reading)

    print('Successfully parsed readings: %s' % len(reading_list))
    print('Skipped PDF readings: %s' % len(pdf_list))
    print('Articles without parseable text: %s' % len(error_list))
    # print('Articles without authors: %s' % len([
    #     reading for reading in reading_list
    #     if reading['page']['authors'] == []]))
    # print('Articles without dates: %s' % len([
    #     reading for reading in reading_list
    #     if reading['page']['pub_date'] is None]))

    # Write to json file
    with open('%s/%s' % (data_dir, readings_file), 'w') as jsonfile:
        jsonfile.write(json.dumps(reading_list))
class WebStatic:
    def __init__(self):
        self.URL = ''
        self.extractor = ''

    def setUrl(self, URL):
        self.URL = URL

    def getTextWeb(self):
        self.extractor = Extractor(extractor='KeepEverythingExtractor', url=self.URL)
        return self.extractor.getText()

    def getArticleText(self):
        self.extractor = Extractor(extractor='ArticleExtractor', url=self.URL)
        return self.extractor.getText()

    def getNews(self):
        self.extractor = Extractor(extractor='KeepEverythingExtractor', url=self.URL)
        buffer = list(self.extractor.getText().split(' '))
        buffer_two = []
        isnews = False
        pattern = '.\s\d\d.\d\d.\d{4}'
        for item in list(buffer):
            item = str(item).split()
            item = ' '.join(item)
            if re.search(pattern, item):
                isnews = True
                item = str(item).split(' ')
                buffer_two.append(item[0])
                item = item[1]
            if item == '':
                isnews = False
            if isnews:
                buffer_two.append(item)
        buffer_two.pop(0)

        pattern_year = '\d\d.\d\d.\d{4}'
        self.news = []
        newses = ''
        isnew = False
        for item in buffer_two:
            if re.search(pattern_year, item):
                newses = newses.replace('!', '').replace(',', '').replace(
                    '«', '').replace('»', '').replace(':', '').replace('–', ' ')
                self.news.append(newses)
                newses = ''
                isnew = True
                continue
            if isnew:
                newses = newses + '' + item
                isnew = False
            else:
                newses = newses + ' ' + item
        self.news.pop(0)
        return self.news

    def getRelevantNews(self):
        # Define your query here
        QUERY_TERMS = ['стол', 'кубка', 'регион']

        # get the list of news items
        self.news = self.getNews()

        # TextCollection provides the tf, idf and tf_idf abstractions,
        # so we don't need to define our own versions
        tc = nltk.TextCollection(self.news)

        relevant = []
        for idx in range(len(self.news)):
            score = 1
            for term in [t.lower() for t in QUERY_TERMS]:
                score += tc.tf_idf(term, self.news[idx])
            if score > 0:
                relevant.append({'score': score, 'title': self.news[idx]})

        # Sort the results by relevance and print them
        relevants = sorted(relevant, key=lambda p: p['score'], reverse=True)
        for post in relevants:
            print('{0}'.format(post['title']))
        return relevants

    def getCollocation(self):
        # Number of collocations to look for
        N = 10
        all_tokens = [
            token for post in self.news for token in post.lower().split()
        ]
        for word in self.news:
            all_tokens.append(word.lower())
        finder = nltk.BigramCollocationFinder.from_words(all_tokens)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(
            lambda w: w in nltk.corpus.stopwords.words('english'))
        scorer = association.BigramAssocMeasures.jaccard
        collocations = finder.nbest(scorer, N)
        for collocation in collocations:
            c = ' '.join(collocation)
            print(c)

    def getMatrixDiag(self):
        vector = TfidfVectorizer(analyzer='word',
                                 norm=None,
                                 use_idf=True,
                                 smooth_idf=True)
        tfIdf = vector.fit_transform(self.news)
        sim = cosine_similarity(tfIdf, tfIdf)
        newsList = []
        x = 1
        for i in self.news:
            newsList.append(str(x))
            x = x + 1
        simDf = pd.DataFrame(sim, index=sorted(newsList), columns=sorted(newsList))
        f = plt.figure(figsize=(19, 15))
        plt.matshow(simDf.corr(), fignum=f.number)
        plt.xticks(range(simDf.shape[1]), simDf.columns, fontsize=14, rotation=45)
        plt.yticks(range(simDf.shape[1]), simDf.columns, fontsize=14)
        cb = plt.colorbar()
        cb.ax.tick_params(labelsize=14)
        plt.title('Cosine similarity of news items', fontsize=16)
        plt.show()
        print(simDf)
def bpLargGetText(self):
    extractor = Extractor(extractor='LargestContentExtractor', url=self.url)
    extracted = extractor.getText()
    return extracted
def bpArtGetText(self):
    extractor = Extractor(extractor='ArticleExtractor', url=self.url)
    extracted = extractor.getText()
    return extracted
linksFile = open('1000TwitterLinks.txt', 'r')
for link in linksFile:
    if (link == ''):
        pass
    else:
        try:
            curlCommand = 'curl ' + link
            hash_object = hashlib.md5(link)
            print(hash_object.hexdigest() + '.html')
            htmlFile = hash_object.hexdigest() + ':htmlFile'
            textFile = hash_object.hexdigest() + ':txt'
            f = open(htmlFile, "w")
            raw_html = subprocess.call(curlCommand, shell=True, stdout=f)
            extractor = Extractor(extractor='ArticleExtractor', url=link)
            with open(textFile, 'w') as the_file:
                the_file.write(str(extractor.getText()))
            linksDict[textFile] = link
            print str(extractor.getText())
        except KeyboardInterrupt:
            exit()
        except:
            pass

with open('textURLFile', 'w') as file:
    for key, value in linksDict.items():
        file.write('%s:%s\n' % (key, value))
def extract_main_text(html_text):
    extractor = Extractor(extractor='ArticleExtractor', html=html_text)
    extracted_text = extractor.getText()
    return extracted_text
            print(hash_object.hexdigest() + '.html')
            htmlFile = os.path.join("sourceHTML_data", hash_object.hexdigest() + "':html'")
            textFile = os.path.join("sourceTXT_data", hash_object.hexdigest() + "':txt'")
            #file_to_open = os.path.join(data_folder, "raw_data.txt")
            #htmlFile = hash_object.hexdigest() + ':html'
            #textFile = hash_object.hexdigest() + ':txt'
            extractor = Extractor(extractor='ArticleExtractor', url=link)
            #print (str(extractor.getText()))
            if (len(str(extractor.getText())) > 0):
                #open(htmlFile, "w")
                f = open(htmlFile, "w")
                raw_html = subprocess.call(curlCommand, shell=True, stdout=f)
                #htmlFile.write(str(extractor.getHTML()))
                with open(textFile, 'w') as the_file:
                    the_file.write(str(extractor.getText()))
                linksDict[textFile] = link
                print(str(extractor.getText()))
                #linksDict[html] = link
            else:
                print("yes")
        except KeyboardInterrupt:
            exit()
def main():
    data = get_train() + get_test()

    f = file('generated/extracted_text', 'w')

    for i, item in enumerate(data):
        # status update
        if (i % 500) == 0:
            print i, datetime.datetime.now().time()

        # parse file
        data = {}
        soup = boil_soup(item['urlid'])

        # given boilerplate
        data['boilerplate'] = [item['title'], item['body']]

        # extract text
        extractor = Extractor(extractor='ArticleExtractor', html=unicode(soup))
        data['boilerpipe'] = [extractor.getText()]

        # remove non-text tags
        for tag in ['script', 'style']:
            for el in soup.find_all(tag):
                el.extract()

        # extract text for each tag
        for tag in TAGS:
            items = []
            for el in soup.find_all(tag):
                el.extract()

                if tag == 'img':
                    try:
                        items.append(el['alt'])
                    except KeyError:
                        pass
                    try:
                        items.append(el['title'])
                    except KeyError:
                        pass
                else:
                    items.append(el.text)
            data[tag] = items

        # extract meta tags
        meta = soup.find_all('meta')
        for el in meta:
            prop = el.get('property') if el.get('property') else el.get('name')
            if not prop:
                continue
            prop = prop.lower()
            try:
                s = unicode(el['content'])
            except:
                continue
            data['meta-' + prop] = s.split(u',') if prop == 'keywords' else [s]

        # preprocess string
        for item in data:
            data[item] = map(clean_string, data[item])
            data[item] = filter(None, data[item])

        print >> f, json.dumps(data)

    f.close()
from boilerpipe.extract import Extractor
from urllib.parse import urlparse
import glob
import os

path = 'C:\\RawHtmls/*.txt'
urlCounter = 0
files = glob.glob(path)
for file in files:
    try:
        urlCounter = urlCounter + 1
        f = open(file, 'r')
        fileName = "C:\\ProcessedText\\" + os.path.basename(f.name)
        currentHtml = f.read()
        f.close()
        extractor = Extractor(extractor='ArticleExtractor', html=currentHtml)
        currentText = extractor.getText()
        output_file = open(fileName, "w")
        output_file.write(str(currentText.encode("utf-8")))
        output_file.close()
        print("Download Completed : " + fileName)
    except:
        print('Error :', urlCounter)
if count == END:
    break
if line[0] == "=":
    if found == False and count >= START:
        print("Critical Error:" + title + " has no url that passed the filter!")
        log.write(title + "\n")
    title = line.strip('\n').strip("=")
    count += 1
    found = False
else:
    if not found and count >= START:
        if check(line, KEY_WORDS, NEGATIVE_KEY_WORDS):
            output = open(OUTPUT_KEYWORD + title + ".txt", 'w')
            try:
                extractor = Extractor(extractor='DefaultExtractor', url=line)
                txt = extractor.getText().encode('utf-8')
                print(len(txt))
                if len(txt) > 2500:
                    output.write(txt)
                    output.close()
                    if len(txt) < 4000:
                        print("Succeed,collecting another policy:" + title)
                        title += "*"
                    else:
                        found = True
                        print("Succeeded:" + title)
            except:
                print("Error:" + title + " request failed")
#f = open(html, 'r')
#html = f.read()
#
#print html
#
#DefaultExtractor = Extractor(extractor='DefaultExtractor', html=html)
#print "DefaultExtractor:\n" + DefaultExtractor.getText() + "\n"
#
#ArticleSentencesExtractor = Extractor(extractor='ArticleSentencesExtractor', html=html)
#print "ArticleSentencesExtractor:\n" + ArticleSentencesExtractor.getText() + "\n"

#DefaultExtractor = Extractor(extractor='DefaultExtractor', url=url)
#print "DefaultExtractor:\n" + DefaultExtractor.getText() + "\n"
#
ArticleExtractor = Extractor(extractor='ArticleExtractor', url=url)
print "ArticleExtractor:\n" + ArticleExtractor.getText() + "\n"

ArticleSentencesExtractor = Extractor(extractor='ArticleSentencesExtractor', url=url)
print "ArticleSentencesExtractor:\n" + ArticleSentencesExtractor.getText() + "\n"

#KeepEverythingExtractor = Extractor(extractor='KeepEverythingExtractor', url=url)
#print "KeepEverythingExtractor:\n" + KeepEverythingExtractor.getText() + "\n"
#
##KeepEverythingWithMinKWordsExtractor = Extractor(extractor='KeepEverythingWithMinKWordsExtractor', url=url)
##print "KeepEverythingWithMinKWordsExtractor:\n" + KeepEverythingWithMinKWordsExtractor.getText() + "\n"
#
#LargestContentExtractor = Extractor(extractor='LargestContentExtractor', url=url)
#print "LargestContentExtractor:\n" + LargestContentExtractor.getText() + "\n"
#
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)


path = 'content/'
texts = {}
print('Start: ' + parser_type)
train_data = pd.read_csv('train_groups.csv', dtype=np.int16)
for filename in tqdm(listdir(path)):
    doc_id = int(filename.strip('.dat'))
    if doc_id not in train_data.doc_id.values:
        continue
    with codecs.open(path + filename, 'r', 'utf-8') as f:
        url = f.readline().strip()
        html = f.read()
    extractor = Extractor(extractor=parser_type, html=html)
    s = extractor.getText()
    s = s.replace('\n', " ")
    s = s.replace('\t', " ")
    s = s.replace('\r', " ")
    texts[doc_id] = s

train_data['text'] = train_data.apply(lambda row: texts[row.doc_id], axis=1)
save_obj(train_data, 'train_data' + parser_type)
def test_extraction():
    extractor = Extractor(extractor='ArticleExtractor', url='http://paulgraham.com/startupideas.html')
    print 'extractor created'
    print extractor.getText()
def extraction(link):
    extractor = Extractor(extractor='ArticleExtractor', url=link)
    extracted_text = extractor.getText()
    if extracted_text != "" and extracted_text is not None:
        news_text.append(extracted_text)
from boilerpipe.extract import Extractor

if __name__ == '__main__':
    URL = 'http://programmingisterrible.com/post/112612689998/san-francisco-for-londoners'
    extractor = Extractor(extractor='ArticleExtractor', url=URL)
    print extractor.getText()
import os
from boilerpipe.extract import Extractor

# creating directory using os library
os.mkdir("processed")

count = 1
while (count < 1001):
    # reading the collected html files from previous step
    with open('raw_html/%s.html' % count, 'r+', encoding='utf-8') as fp:
        extractor = Extractor(extractor='ArticleExtractor', html=fp.read())
        # extracting non-html content
        processed = extractor.getText()
        with open('processed/%s.txt' % count, 'w', encoding='utf-8') as outfile1:
            outfile1.write(processed)
    count = count + 1
    - DefaultExtractor
    - ArticleExtractor
    - ArticleSentencesExtractor
    - KeepEverythingExtractor
    - KeepEverythingWithMinKWordsExtractor
    - LargestContentExtractor
    - NumWordsRulesExtractor
    - CanolaExtractor
"""

url = 'https://techcrunch.com/2017/02/13/mit-speech-chip/'  # BadStatusLine from boilerpipe
url = "http://www.forbes.com/sites/trevorclawson/2017/02/23/finding-a-voice-can-a-uk-startup-compete-with-its-heavy-hitters-in-the-speech-recognition-market/"
url = "https://nakedsecurity.sophos.com/2017/03/03/researcher-uses-googles-speech-tools-to-skewer-google-recaptcha/"
url = "http://www.natureworldnews.com/articles/32595/20161123/microsoft-officially-makes-first-humanly-accurate-speech-recognition-tech.htm"
url = "http://www.businessinsider.com/ibm-edges-closer-to-human-speech-recognition-2017-3"

#ArticleExtractor = Extractor(extractor='ArticleExtractor', url=url)
#print "ArticleExtractor:\n" + ArticleExtractor.getText() + "\n"

ArticleSentencesExtractor = Extractor(extractor='ArticleSentencesExtractor', url=url)
print ArticleSentencesExtractor.getText()

article = Goose().extract(url=url)
print article.cleaned_text

document = Document(requests.get(url))
document.content()
import q
import requests
from readability import Document

url = 'https://news.cnblogs.com/n/624615/'
url = 'https://tech.sina.com.cn/i/2019-04-29/doc-ihvhiqax5802337.shtml'
url = 'http://forthxu.com/blog/article/73.html'
url = 'http://forthxu.com/blog/article/91.html'
url = 'http://forthxu.com/blog/article/gmail-sub-account.html'

response = requests.get(url)
doc = Document(response.content)
print(doc.title())
s_html = doc.summary(True)
print("s_html:", s_html)

extractor = Extractor(extractor='ArticleExtractor', html=s_html)
# extractor = Extractor(extractor='ArticleExtractor', url=url)
extracted_text = extractor.getText()
print("extracted_text:", extracted_text)
# extracted_html = extractor.getHTML()

q.d()
import asyncio

from boilerpipe.extract import Extractor

from helpers.extractors import extractors

url = "https://dpstele.com"
extractor = extractors["article_sentences"]

ext = Extractor(extractor=extractor, url=url)
print(ext.getText())
successful_text_list = []
unsuccessful_url_list = []
for url in url_list:
    try:
        r = requests.get(url, timeout=timeout)
        if r.status_code != 200:
            unsuccessful_url_list.append(url)
            continue
        html = r.text
    except:
        unsuccessful_url_list.append(url)
        continue
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html)
        text = extractor.getText().replace('\\', '').strip().replace(
            '\r', ' ').replace('\n', ' ')
    except:
        unsuccessful_url_list.append(url)
        continue
    successful_text_list.append(text)
    successful_url_list.append(url)

#
# write files
#

print()
print('There were ' + str(len(unsuccessful_url_list)) +
      ' unsuccessful webpage downloads. These URLs are listed in ' +
      output_directory + '/unsuccessful_url_list.txt')
f = open(output_directory + '/unsuccessful_url_list.txt', 'w')
def update_content_by_url(self):
    from boilerpipe.extract import Extractor
    extractor = Extractor(extractor='ArticleExtractor', url=self.url)
    self.content_html = extractor.getHTML()
    self.content_text = extractor.getText()
def getArticleProcItem(link):
    # request the url
    extractor = Extractor(extractor='ArticleExtractor', url=link)
    text = extractor.getText()
    return ProcessingItem(text)
def remove_boiler(htmlD):
    extractor = Extractor(extractor='DefaultExtractor', html=htmlD)
    text = extractor.getText().encode('ascii', 'ignore').decode('ascii')
    return text