def extractHTML(self, url=None, html=None, extractor='ArticleExtractor'):
    cherrypy.response.headers['Content-Type'] = "text/json"
    if url:
        extractor = Extractor(extractor=extractor, url=url)
        extracted_html = extractor.getHTML()
        return json.dumps({'url': url, 'extractedHTML': extracted_html})
    elif html:
        # pass the raw HTML in this branch (url is None here)
        extractor = Extractor(extractor=extractor, html=html)
        extracted_html = extractor.getHTML()
        return json.dumps({'html': html[:15], 'extractedHTML': extracted_html})

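# A minimal wiring sketch for the CherryPy handler above. The service class name and
# example request are illustrative assumptions, not taken from the original source;
# the handler also needs json, cherrypy and boilerpipe's Extractor in scope.
import json

import cherrypy
from boilerpipe.extract import Extractor


class ExtractorService(object):
    # bind the module-level handler defined above as an exposed page method
    extractHTML = cherrypy.expose(extractHTML)


if __name__ == '__main__':
    cherrypy.quickstart(ExtractorService())
    # e.g. GET http://127.0.0.1:8080/extractHTML?url=https://example.com
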
def traverse(outlet_url, num=3000):
    retrieved = 0
    page = 1
    while retrieved < num:
        if page < 2:
            page_url = outlet_url
        else:
            page_url = "{}page/{}".format(outlet_url, page)
        extractor = Extractor(extractor='KeepEverythingExtractor', url=page_url)
        html = extractor.getHTML()
        links = re.findall(r"<A\shref=\".*de\/(\d{4}\/\d{2}\/\d{2}\/.*)\" rel", html)
        for link in links:
            try:
                art_url = outlet_url + link
                extr = Extractor(extractor='ArticleExtractor', url=art_url)
                text = extr.getText()
                retrieved += 1
                write_to_files(text, art_url, retrieved)
                print("Extracted {}: {}".format(retrieved, art_url))
            except Exception as e:
                print(e)
        page += 1

def detag_html_file(infile, outfile, id):
    from boilerpipe.extract import Extractor

    if not USE_BOILERPLATE:
        return detag_html_file_bs(infile, outfile, id)

    tempfile = "%s.tmp.html" % (infile,)  # boilerpipe seems to need an html extension
    try:
        copyfile(infile, tempfile)
        extractor = Extractor(extractor='ArticleExtractor', url="file://" + tempfile)
        os.unlink(tempfile)

        extracted_text = extractor.getText()
        extracted_html = extractor.getHTML()

        soup = BeautifulSoup(extracted_html)
        output = codecs.open(outfile, encoding='utf-8', mode='w')
        output.write(u"<DOC>\n<DOCNO>" + unicode(id) + u"</DOCNO>\n<DOCHDR>\n</DOCHDR>\n")
        head = soup.find('head')
        if head:
            title_tag = head.find('title')
            if title_tag and title_tag.string:
                output.write(u"<TITLE>" + title_tag.string.replace('\n', ' ') + u"</TITLE>\n")
        extract_para(soup, output)
        output.write(u"</DOC>\n")
        output.close()
    except Exception, exc:
        try:
            os.unlink(tempfile)
        except:
            pass
        return detag_html_file_bs(infile, outfile, id)

def Text_extractor(y, page, team, team_i, counter=0):
    """Extract the text of team pages using BoilerPipe."""
    try:
        upage = urllib.parse.quote_plus(page)
        url = "http://" + y + ".igem.org/wiki/index.php?title=" + upage
        extractor = Extractor(extractor='ArticleExtractor', url=url)
    except:
        counter += 1
        if counter > 10:
            print("Failed to get the text for page {}".format(page))
            return None
        # return the retry's result so we don't fall through with an undefined extractor
        return Text_extractor(y, page, team, team_i, counter=counter)
    f = open(
        'results/%s/%s/%s_-_-_CONTENT.html' % (y, team, page.replace('/', '#')), 'w')
    f.write(extractor.getHTML())
    f.close()
    f = open(
        'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#')), 'w')
    f.write(extractor.getText())
    f.close()
    path = 'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#'))
    # text = text.replace('\\n', '\\\\n')
    output = '%s\t%s\t%s\t%s\n' % (y, str(teams_id[team_i]), page, path)
    teams_pages_text_db.write(output)

def get_articles(url):
    doc = urllib.request.urlopen(url)
    docContent = BeautifulSoup(doc, 'html.parser')
    articles = []
    for element in docContent.find_all('div'):
        try:
            if element.attrs['style'] == 'width:550px':
                article = defaultdict(str)
                article_link = 'http://www.moneycontrol.com' + element.a['href']
                for p in element.find_all('p'):
                    if 'a_10dgry' in p.attrs['class']:
                        article_time = p.contents[0].split('|')[0]
                        article_date = p.contents[0].split('|')[1][:-1]
                        article['link'] = article_link
                        article['time'] = article_time
                        article['date'] = article_date
                        extractor = Extractor(extractor='ArticleExtractor', url=article_link)
                        article['content'] = extractor.getText()
                        article['title'] = BeautifulSoup(extractor.getHTML(), 'html.parser').find_all('h1')[0].contents[0]
                        articles.append(article)
                        break
        except:
            logging.debug('div has no width attribute')
    return articles

def parse(self, response):
    hxs = Selector(response)
    item = ArticleItem()
    item["title"] = hxs.xpath('//title/text()').extract()
    item["link"] = response.url
    item["source"] = hxs.xpath('//p').extract()
    extractor = Extractor(extractor='ArticleExtractor', url=item["link"])
    source = extractor.getHTML()
    item["text"] = extractor.getText()
    item["html"] = source
    page = html.fromstring(source)
    links = page.xpath("//p//a/@href")
    linkPattern = re.compile(r"^(?:ftp|http|https):\/\/(?:[\w\.\-\+]+:{0,1}[\w\.\-\+]*@)?(?:[a-z0-9\-\.]+)(?::[0-9]+)?(?:\/|\/(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+)|\?(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+))?$")
    for link in links:
        if linkPattern.match(link) and link not in self.crawled_links:
            self.crawled_links.append(link)
            yield Request(link, self.parse)
    yield item

def get_text(url):
    from boilerpipe.extract import Extractor
    try:
        extractor = Extractor(extractor='DefaultExtractor', url=url)
        return extractor.getText(), extractor.getHTML()
    except:
        return "", ""

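# A short usage sketch for get_text() above; the URL is illustrative. The helper
# returns a (text, html) pair and falls back to two empty strings on any failure.
if __name__ == '__main__':
    text, html = get_text("https://example.com/some-article")
    print(len(text), len(html))
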
def traverse(outlet_url, num=5000):
    retrieved = 0
    page = 1
    while retrieved < num:
        current_url = "{}?page={}".format(outlet_url, page)
        # print(current_url)
        extractor = Extractor(extractor='KeepEverythingExtractor', url=current_url)
        html = extractor.getHTML()
        # print(html)
        link1 = re.findall(r"<A\shref=\"/artikel/(\d{4}/\d{2}/.*?)\"", html)
        # print(link1)
        links = set(link1)
        # print(links)
        for link in links:
            try:
                art_url = 'https://jungle.world/artikel/' + link
                # print(art_url)
                extr = Extractor(extractor='ArticleExtractor', url=art_url)
                text = extr.getText()
                # print(text)
                retrieved += 1
                write_to_files(text, art_url, retrieved)
                print("Extracted {}: {}".format(retrieved, art_url))
            except Exception as e:
                print(e)
        page += 1

def get_contenthtml_by_html(html):
    """
    Helper that calls the open-source content extractor boilerpipe to pull the
    main text out of news, forum, and government web pages (recommended variant).

    :param html: page source to parse
    :returns: str -- page source with the main content highlighted
    """
    extractor = Extractor(extractor='ArticleExtractor', html=html)
    highlighted_html = extractor.getHTML()
    return highlighted_html

def get_contenthtml_by_url(url):
    """
    Helper that calls the open-source content extractor boilerpipe to pull the
    main text out of news, forum, and government web pages (not recommended:
    encoding detection is occasionally wrong).

    :param url: URL of the page to parse
    :returns: str -- page source with the main content highlighted
    """
    extractor = Extractor(extractor='ArticleExtractor', url=url)
    highlighted_html = extractor.getHTML()
    return highlighted_html

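# A short usage sketch for the two helpers above. The URL and the use of requests are
# illustrative assumptions, not from the original source. Fetching the page yourself
# and passing the HTML in is the recommended path, since it sidesteps boilerpipe's
# own encoding detection.
import requests
from boilerpipe.extract import Extractor  # needed by the helpers above

page_url = "https://example.com/news/article.html"
raw_html = requests.get(page_url).text
highlighted = get_contenthtml_by_html(raw_html)   # recommended
# highlighted = get_contenthtml_by_url(page_url)  # convenient, but encoding may be off
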
def test_boilerpipe():
    your_url = "http://stackoverflow.com/questions/9352259/trouble-importing-boilerpipe-in-python"
    extractor = Extractor(extractor='ArticleExtractor', url=your_url)
    extracted_html = extractor.getHTML()
    extracted_text = extractor.getText()
    print '\nfunction: %s ' % inspect.stack()[0][3]
    print 'extracted html: %i text: %i' % (len(extracted_html), len(extracted_text))
    print ''
    n.assert_greater(len(extracted_text), min_str_length)

def bp_treatement(input_file, output_file):
    """
    Defines the specific BoilerPipe treatment to perform from the input file
    to the output file.
    """
    if input_file.read():
        input_file.seek(0)
        extractor = Extractor(extractor="ArticleExtractor", html=input_file.read())
        output_file.write(extractor.getHTML())
    else:
        output_file.write(" ")

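# A minimal usage sketch for bp_treatement() above (file names are illustrative).
# Both arguments are expected to be already-open text file handles.
from boilerpipe.extract import Extractor  # needed by bp_treatement above

with open("page.html", "r", encoding="utf8", errors="ignore") as fin, \
        open("page.clean.html", "w", encoding="utf8") as fout:
    bp_treatement(fin, fout)
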
def category_extract(category_url, category, start_num=0, end_num=100):
    num = start_num
    while True:
        print(num)
        url = category_url + "?s=" + str(num)
        extractor = Extractor(extractor='KeepEverythingExtractor', url=url)
        html = extractor.getHTML()
        links = re.findall(r"<A\shref=\"(/artikel/.*?)\">", html)
        for link in links:
            try:
                article_url = "https://www.neues-deutschland.de" + link
                text, metadata = extract_article(article_url)
                match = re.match(r"/artikel/(.*?)\.html", link)
                text_file_name = os.path.join("nd_texts", match.group(1))
                with open(text_file_name, "w") as textfile:
                    textfile.write(text)
                with open("metadata_nd.csv", "a") as metafile:
                    author, date, keywords = metadata
                    if date:
                        date = date.split(".")
                        date_str = date[2] + "-" + date[1] + "-" + date[0]
                    else:
                        date_str = "None"
                    line = text_file_name + \
                        " " + \
                        article_url + \
                        " " + \
                        date_str + \
                        " " + \
                        "|" + str(author) + "|" + \
                        " " + \
                        category + \
                        " " + \
                        "|radically left|" + \
                        " |"
                    for keyword in keywords:
                        line += keyword + " "
                    if keywords != []:
                        line = line[:-1]
                    line += "|\n"
                    metafile.write(line)
            except Exception as e:
                print(e)
        num += 25
        if links == [] or num == end_num:
            break

def Text_extractor(y, page, team, team_i):
    """Extract the text of team pages using BoilerPipe."""
    upage = urllib.quote_plus(page)
    url = "http://" + y + ".igem.org/wiki/index.php?title=" + upage
    extractor = Extractor(extractor='ArticleExtractor', url=url)
    f = open('results/%s/%s/%s_-_-_CONTENT.html' % (y, team, page.replace('/', '#')), 'w')
    f.write(extractor.getHTML())
    f.close()
    f = open('results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#')), 'w')
    f.write(extractor.getText())
    f.close()
    path = 'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#'))
    # text = text.replace('\\n', '\\\\n')
    output = '%s\t%s\t%s\t%s\n' % (y, str(teams_id[team_i]), page, path)
    teams_pages_text_db.write(output)

def parse(self, response):
    for article in response.xpath('//channel/item'):
        item = ArticleItem()
        # Grab the title and the link to the article
        item["title"] = article.xpath("title/text()").extract()
        item["link"] = article.xpath("link/text()").extract()
        item["date"] = article.xpath("pubDate/text()").extract()
        link = item["link"][0]
        extractor = Extractor(extractor='ArticleExtractor', url=link)
        item["text"] = extractor.getText()
        item["html"] = extractor.getHTML()
        # Grab the source of the page by making another Request
        yield Request(link, callback=self.parse_link, meta=dict(item=item))

def extract_boilerpipe(self, html):
    """
    Extract an article with Boilerpipe.

    NOTE: This is an optional method as boilerpipe is dependency-heavy
    and will be potentially cumbersome to run on manta.
    """
    try:
        from boilerpipe.extract import Extractor
    except ImportError:
        return

    bp_extract = Extractor(html=html)
    return bp_extract.getHTML()

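# Illustrative sketch only: extract_boilerpipe() above is written as a method of a
# larger extractor class that is not shown here, and self is unused, so for a quick
# check it can be invoked directly. Because of the guarded import, callers must
# handle the None returned when boilerpipe is not installed.
html_doc = "<html><body><p>Some article text to keep.</p></body></html>"
result = extract_boilerpipe(None, html_doc)
if result is None:
    print("boilerpipe not installed; falling back to another extractor")
else:
    print(result)
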
def extract_content(page_id, ext_id, htmlReturn=False):
    # htmlReturn=False: by default returns text content
    if page_id in (None, "") or ext_id in (None, ""):
        return badrequest()

    page = Page.get_page(page_id)
    if page is None:
        return documentnotfound()

    extraction = Extraction.get_extraction(ext_id)
    if extraction is None:
        return documentnotfound()

    original_content = page.content
    if original_content is None or original_content == "":
        return nocontent()

    if not jpype.isThreadAttachedToJVM():
        jpype.attachThreadToJVM()

    extractor = Extractor(extractor='DefaultExtractor', html=original_content)
    if not htmlReturn:
        bp_content = extractor.getText()
    else:
        bp_content = extractor.getHTML()

    if bp_content is None:
        return nocontent()

    extraction.update(bp_content=bp_content)
    return success()

def build_news_article_from_url(source_url, sNLP):
    """Build a news article object from a source URL; returns None if the build fails."""
    try:
        print('start to scrape from url: ', source_url)
        # pre-process the news page with the Newspaper3k and Boilerpipe libraries
        article = Article(source_url, keep_article_html=True)
        article.build()
        article.nlp()
        e = Extractor(extractor='DefaultExtractor', html=article.html)
        article.text = e.getText()
        article.article_html = e.getHTML()
        news_article = NewsArticle(article, sNLP)
        print('success to scrape from url: ', source_url)
        return news_article
    except Exception as e:
        print('fail to scrape from url: ', source_url)
        print('reason:', e)
        return None

# coding: utf-8
import sys

import jieba
from boilerpipe.extract import Extractor

reload(sys)
sys.setdefaultencoding('utf-8')

extractor = Extractor(
    extractor='ArticleExtractor',
    url="http://news.scut.edu.cn/s/22/t/3/82/0a/info33290.htm")
processed_plaintext = extractor.getText()
highlighted_html = extractor.getHTML()

segList = jieba.cut(processed_plaintext, cut_all=False)
print "/".join(segList)
print processed_plaintext

from boilerpipe.extract import Extractor
import os

directoryEntree = r'/home/romaric/PycharmProjects/scrapping/Corpus_detourage/html/'
outputDirectory = r'/home/romaric/PycharmProjects/scrapping/BP/'

for f in os.listdir(directoryEntree):
    completeName = os.path.join(outputDirectory, f)
    fichierEntree = open(directoryEntree + f, "r", encoding="utf8", errors="ignore")
    fichierSortie = open(outputDirectory + f, "w", encoding="utf8", errors="ignore")
    extracteur = Extractor(extractor='ArticleExtractor', html=fichierEntree.read())
    fichierSortie.write(extracteur.getHTML())

except:
    continue
tree = etree.tostring(document)
cleantree = tree.decode("utf8").replace(" ", " ")
cleantree = cleantree.replace("\t", " ")
# lang id
lang = guess_lang_from_data2(cleantree)
if len(languages) > 0 and lang not in languages:
    logging.info("Language of document " + url + ": " + lang + ". Not among searched languages.")
else:
    # If enabled, remove boilerplate HTML
    if options.boilerpipe:
        extractor = Extractor(extractor='ArticleExtractor', html=cleantree)
        deboiled = extractor.getHTML()
    else:
        deboiled = cleantree

    # We compute MD5 on the HTML (either normalized one or after boilerpipe if enabled):
    # if we get duplicate files we discard them
    c = hashlib.md5()
    c.update(deboiled.encode())
    # print("hash", c.hexdigest(), url)

    # checking for duplicate content (duplicates are discarded)
    if c.hexdigest() in seen_md5:
        logging.info("Repeated file:\t" + url + "\tfirst occurrence\t" + seen_md5[c.hexdigest()])
        pass
    else:
        # If enabled get text with Alcazar library

def update_content_by_url(self):
    from boilerpipe.extract import Extractor
    extractor = Extractor(extractor='ArticleExtractor', url=self.url)
    self.content_html = extractor.getHTML()
    self.content_text = extractor.getText()

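# Illustrative sketch only: the method above expects an object carrying url,
# content_html and content_text attributes (e.g. an ORM model). The Page class and
# URL below are hypothetical stand-ins, not from the original source.
class Page(object):
    def __init__(self, url):
        self.url = url
        self.content_html = None
        self.content_text = None

    update_content_by_url = update_content_by_url  # bind the function defined above


page = Page("https://example.com/some-article")
page.update_content_by_url()
print(len(page.content_text or ""))
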
def ExtractPolicyTextWithBoilerpipe(policyUrl, extractorType='ArticleExtractor',
                                    verbose=False, minLinesPerPolicy=30):
    if verbose:
        if policyUrl == '-':
            print 'ExtractPolicyTextWithBoilerpipe called with policyUrl = {0}. do nothing.'.format(policyUrl)
        else:
            print 'extracting policy text from {0} using {1}'.format(policyUrl, extractorType)

    # trivial return
    if policyUrl == '-':
        return (None, None)

    try:
        if policyUrl.startswith('http'):
            extractor = Extractor(extractor=extractorType, url=policyUrl)
        # the policyUrl may also be a local file path
        else:
            contentFile = open(policyUrl, 'r')
            extractor = Extractor(extractor=extractorType, html=contentFile.read().decode('utf8'))
        html = extractor.getHTML()
        text = extractor.getText()
        if len(text.split(u'\n')) > minLinesPerPolicy:
            if verbose:
                print 'OK'
            text = text.replace(u'\n', u' ')
            return (text, html)
        elif len(text) > 0 and len(html) > 0:
            print 'Policy {1} ignored. Number of paragraphs in extracted policy is less than {0}.'.format(minLinesPerPolicy, policyUrl)
            return (None, None)
        else:
            print 'boilerpipe extracted nothing from {0}'.format(policyUrl)
            return (None, None)
    except TypeError as e:
        print 'TypeError thrown while using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except socket.error as e:
        print 'socket.error thrown while using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except BadStatusLine as e:
        print 'httplib.BadStatusLine thrown while using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except IncompleteRead as e:
        print 'httplib.IncompleteRead thrown while using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except LookupError as e:
        print 'LookupError using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except UnicodeDecodeError as e:
        print 'UnicodeDecodeError using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except ValueError as e:
        print 'ValueError using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except urllib2.HTTPError as e:
        print 'HTTPError using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except urllib2.URLError as e:
        print 'URLError using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except socket.timeout as e:
        print 'socket.timeout thrown while using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)

mimeFile = open_xz_or_gzip(options.outDir + "/" + lang + "/mime." + options.compression, "w")
normHtmlFile = open_xz_or_gzip(options.outDir + "/" + lang + "/normalized_html." + options.compression, "w")
plainTextFile = open_xz_or_gzip(options.outDir + "/" + lang + "/plain_text." + options.compression, "w")
if options.boilerpipe:
    deboilFile = open_xz_or_gzip(options.outDir + "/" + lang + "/" + "deboilerplate_html." + options.compression, "w")
    files_dict[lang] = {"urlFile": urlFile, "encodingFile": encodingFile, "mimeFile": mimeFile,
                        "normHtmlFile": normHtmlFile, "plainTextFile": plainTextFile,
                        "deboilFile": deboilFile}
else:
    if not os.path.exists(options.outDir + "/" + lang + "/" + "deboilerplate_html." + options.compression) \
            and not os.path.islink(options.outDir + "/" + lang + "/" + "deboilerplate_html." + options.compression):
        os.symlink("normalized_html." + options.compression,
                   options.outDir + "/" + lang + "/" + "deboilerplate_html." + options.compression)
    files_dict[lang] = {"urlFile": urlFile, "encodingFile": encodingFile, "mimeFile": mimeFile,
                        "normHtmlFile": normHtmlFile, "plainTextFile": plainTextFile}

# If enabled, remove boilerplate HTML
if options.boilerpipe:
    logging.info(url + ": deboiling html")
    extractor = ExtrB(extractor='ArticleExtractor', html=text)
    deboiled = str(extractor.getHTML())
else:
    deboiled = text

# We compute a hash on the HTML (either normalized one or after boilerpipe if enabled):
# if we get duplicate files we discard them
html_hash = mmh3.hash(deboiled, signed=False)
# checking for duplicate content (duplicates are discarded)
if html_hash in seen_html:
    logging.info("Repeated file:\t" + url)
    continue

# get text with Alcazar library
if options.parser == "alcazar":
    logging.info(url + ": Getting text with Alcazar")
    btext = alcazar.bodytext.parse_article(deboiled)

def get_basic():
    url = request.args.get('url')
    extractor = Extractor(extractor='ArticleExtractor', url=url)
    return extractor.getHTML()

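# A minimal wiring sketch for the handler above. The use of request.args.get suggests
# Flask, but the framework, route path and port are not shown in the original, so all
# of them are illustrative assumptions.
from flask import Flask, request
from boilerpipe.extract import Extractor

app = Flask(__name__)
app.add_url_rule('/basic', 'get_basic', get_basic)  # expose the handler defined above

if __name__ == '__main__':
    app.run(port=5000)
    # e.g. GET http://127.0.0.1:5000/basic?url=https://example.com/article
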
# -*- coding: utf-8 -*-
"""
@Time: 2019/6/26 14:33
"""
import time
import os
import sys

from boilerpipe.extract import Extractor

url = "http://www.sohu.com/a/299667318_501931"
extractor = Extractor(extractor='ArticleExtractor', url=url)
# extractor = Extractor(extractor='ArticleExtractor', html=html)
# extractor = Extractor(url=url)

extracted_text = extractor.getText()
extracted_html = extractor.getHTML()
print(extracted_html)

    html = urllib.urlopen(eachurl).read()
    content = Document(html).summary()
    title = Document(html).short_title()
except:
    print 'Failed URl %s' % eachurl
    content = '_'
    title = '_'
body_score[-1].append(fscore(word_tokenize(content), data))
title_score[-1].append(fscore(word_tokenize(title), title_true))

############################################################################################
print 'Boilerpipe...'
try:
    article = Extractor(url=eachurl)
    title = '_'
    # title = article.getTitle()
    content = article.getHTML()
except:
    print 'Failed URl %s' % eachurl
    content = '_'
    title = '_'
body_score[-1].append(fscore(word_tokenize(content), data))
title_score[-1].append(fscore(word_tokenize(title), title_true))

######################################################################################
print 'libextract...'
# html = urllib.urlopen(eachurl).read()
textnodes = list(extract(html))
try:
    content = ' '.join(each.text_content() for each in textnodes[:5])
except:
    print 'Not combining unicode %s' % eachurl
    content = '_'

from boilerpipe.extract import Extractor
import argparse
import re

parser = argparse.ArgumentParser(description='Text content extractor')
parser.add_argument('--urls', type=str, required=True,
                    help='A new-line separated list of URLs to extract content from')
parser.add_argument('--outputdirhtml', type=str, required=True,
                    help='The HTML output dir to store the content from each URL')
parser.add_argument('--outputdirtxt', type=str, required=True,
                    help='The text output dir to store the content from each URL')
args = parser.parse_args()

with open(args.urls, 'r') as url_file:
    for url in url_file:
        url = url.strip()
        url_base_filename = re.sub(r'[^a-zA-Z0-9_\.\-]', '_', url)
        url_txt_filename = url_base_filename + ".txt"
        url_html_filename = url_base_filename + ".html"
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        url_txt_contents = extractor.getText()
        url_html_contents = extractor.getHTML()
        with open(args.outputdirtxt + '/' + url_txt_filename, 'w') as txt_content_file:
            txt_content_file.write(url_txt_contents)
        with open(args.outputdirhtml + '/' + url_html_filename, 'w') as html_content_file:
            html_content_file.write(url_html_contents)

cleanhtml = cleaner.clean_html(re.sub(r'encoding *= *"[^"]+"', '', text, flags=re.IGNORECASE))
document = html5lib.parse(ftfy.fix_text(cleanhtml), treebuilder="lxml", namespaceHTMLElements=False)
tree = etree.tostring(document)
cleantree = tree.decode("utf8")
cleantree = cleantree.replace("\t", " ")

file = open(options.normhtml, "w")
file.write(cleantree)
file.close()

extractor = Extractor(extractor='ArticleExtractor', html=cleantree)
extracted_text = extractor.getHTML()

file = open(options.deboiled, "w")
file.write(extracted_text)
file.close()

deboiledFile = open(options.deboiled, "r")
html = deboiledFile.read()
deboiledFile.close()

# get text
if options.alcazar:
    text = alcazar.bodytext.parse_article(cleantree)
    if text.body_text:
        text = text.body_text
    else:
        text = ""

elif obj == 'num' and args.result != "":
    query += source[obj]
    query += args.result
elif obj == 'lang' and args.lang != "":
    query += source[obj]
    query += args.lang
# elif obj == 'sortby' and args.sortby != "":
#     query += source[obj]
#     query += args.sortby
else:
    query += source[obj]
    query += source[obj + '_def']

# retrieve the HTML page of the URL source
try:
    extractor = Extractor(extractor='ArticleExtractor', url=query)
    extracted_html = extractor.getHTML()
except:
    e = sys.exc_info()[0]
    print("\n***ERROR (in main.py, extractor 1): " + str(e))
    # sleep for 4 seconds before trying to crawl again, otherwise you will be identified and blocked
    time.sleep(4)
    continue

# retrieve URLs from the HTML page
doc = lxml.html.document_fromstring(extracted_html)
urlList = list()
for url in doc.xpath(XPATH):
    url_tmp = str(url.attrib.get('href'))
    if 'http' not in url_tmp:
        url_tmp = source['url'] + url_tmp
    urlList.append(url_tmp)
