Example #1
    def extractHTML(self, url=None, html=None, extractor='ArticleExtractor'):
        cherrypy.response.headers['Content-Type'] = "text/json"

        if url:
            extractor = Extractor(extractor=extractor, url=url)
            extracted_html = extractor.getHTML()
            return json.dumps({'url': url, 'extractedHTML': extracted_html})
        elif html:
            extractor = Extractor(extractor=extractor, html=html)
            extracted_html = extractor.getHTML()
            return json.dumps({
                'html': html[:15],
                'extractedHTML': extracted_html
            })
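Nearly every example on this page uses the same small boilerpipe API: an Extractor is constructed either from a url or from an html string, and the result is read back with getText() or getHTML(). A minimal standalone sketch of that pattern (the URL below is only a placeholder):

from boilerpipe.extract import Extractor

# Build the extractor from a URL ...
extractor = Extractor(extractor='ArticleExtractor', url='https://example.com/some-article')
# ... or, equivalently, from HTML already held in memory:
# extractor = Extractor(extractor='ArticleExtractor', html=raw_html)

print(extractor.getText())   # plain text of the extracted main content
print(extractor.getHTML())   # HTML with only the main content kept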
Example #2
def traverse(outlet_url, num=3000):
    retrieved = 0
    page = 1
    while retrieved < num:

        if page < 2:
            page_url = outlet_url
        else:
            page_url = "{}page/{}".format(outlet_url, page)

        extractor = Extractor(extractor='KeepEverythingExtractor',
                              url=page_url)
        html = extractor.getHTML()
        links = re.findall(r"<A\shref=\".*de\/(\d{4}\/\d{2}\/\d{2}\/.*)\" rel",
                           html)

        for link in links:
            try:
                art_url = outlet_url + link
                extr = Extractor(extractor='ArticleExtractor', url=art_url)
                text = extr.getText()
                retrieved += 1
                write_to_files(text, art_url, retrieved)
                print("Extracted {}: {}".format(retrieved, art_url))

            except Exception as e:
                print(e)
        page += 1
Example #3
def detag_html_file(infile, outfile, id):
    from boilerpipe.extract import Extractor

    if not USE_BOILERPLATE:
        return detag_html_file_bs(infile, outfile, id)

    tempfile = "%s.tmp.html" % (infile,) # boilerplate seems to need an html extension
    try:
        copyfile(infile, tempfile)
        extractor = Extractor(extractor='ArticleExtractor', url="file://"+tempfile)
        os.unlink(tempfile)

        extracted_text = extractor.getText()
        extracted_html = extractor.getHTML()

        soup = BeautifulSoup(extracted_html)
        output = codecs.open(outfile, encoding='utf-8', mode='w')
        output.write(u"<DOC>\n<DOCNO>" + unicode(id) + u"</DOCNO>\n<DOCHDR>\n</DOCHDR>\n");
        head = soup.find('head')
        if head:
            title_tag = head.find('title')
            if title_tag and title_tag.string:
                output.write(u"<TITLE>" + title_tag.string.replace('\n', ' ') + u"</TITLE>\n")

        extract_para(soup, output)
        output.write(u"</DOC>\n")
        output.close()
    except Exception, exc:
        try:
            os.unlink(tempfile)
        except:
            pass

        return detag_html_file_bs(infile, outfile, id)
Example #4
def Text_extractor(y, page, team, team_i, counter=0):
    """Extract the text of team pages using BoilerPipe."""
    try:
        upage = urllib.parse.quote_plus(page)
        url = "http://" + y + ".igem.org/wiki/index.php?title=" + upage
        extractor = Extractor(extractor='ArticleExtractor', url=url)
    except:
        counter += 1
        if counter > 10:
            print("Failed to get the text for page {}".format(page))
            return None
        return Text_extractor(y, page, team, team_i, counter=counter)
    f = open(
        'results/%s/%s/%s_-_-_CONTENT.html' %
        (y, team, page.replace('/', '#')), 'w')
    f.write(extractor.getHTML())
    f.close()
    f = open(
        'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#')),
        'w')
    f.write(extractor.getText())
    f.close()
    path = 'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#'))
    # text = text.replace('\\n', '\\\\n')
    output = '%s\t%s\t%s\t%s\n' % (y, str(teams_id[team_i]), page, path)
    teams_pages_text_db.write(output)
Example #5
def get_articles(url):
    doc = urllib.request.urlopen(url)
    docContent = BeautifulSoup(doc, 'html.parser')
    articles = []
    for element in docContent.find_all('div'):
        try:
            if element.attrs['style'] == 'width:550px':
                article = defaultdict(str)
                article_link = 'http://www.moneycontrol.com' + element.a['href']
                for p in element.find_all('p'):
                    if 'a_10dgry' in p.attrs['class']:
                        article_time = p.contents[0].split('|')[0]
                        article_date = p.contents[0].split('|')[1][:-1]
                        article['link'] = article_link
                        article['time'] = article_time
                        article['date'] = article_date
                        extractor = Extractor(extractor='ArticleExtractor',
                                              url=article_link)
                        article['content'] = extractor.getText()
                        article['title'] = BeautifulSoup(extractor.getHTML(),
                                                         'html.parser').find_all('h1')[0].contents[0]
                        articles.append(article)
                        break
        except:
            logging.debug('div has no width attribute')
    return articles
Example #6
    def parse(self, response):
        hxs = Selector(response)
        
        item = ArticleItem()
        item["title"] = hxs.xpath('//title/text()').extract()
        item["link"] = response.url
        item["source"] = hxs.xpath('//p').extract()
        
        extractor = Extractor(extractor='ArticleExtractor', url=item["link"])
        
        source = extractor.getHTML()
        item["text"] = extractor.getText()
        item["html"] = source
        
        page = html.fromstring(source)
        links = page.xpath("//p//a/@href")

        linkPattern = re.compile("^(?:ftp|http|https):\/\/(?:[\w\.\-\+]+:{0,1}[\w\.\-\+]*@)?(?:[a-z0-9\-\.]+)(?::[0-9]+)?(?:\/|\/(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+)|\?(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+))?$")
        
        for link in links:
            if linkPattern.match(link) and not link in self.crawled_links:
                self.crawled_links.append(link)
                yield Request(link, self.parse)
        

        yield item
Example #7
def get_text(url):
    from boilerpipe.extract import Extractor
    try :
        extractor = Extractor(extractor='DefaultExtractor', url=url)
        return extractor.getText(), extractor.getHTML()
    except:
        return "",""
Example #8
def traverse(outlet_url, num=5000):
    retrieved = 0
    page = 1
    while retrieved < num:
        current_url = "{}?page={}".format(outlet_url, page)
        #print(current_url)
        extractor = Extractor(extractor='KeepEverythingExtractor',
                              url=current_url)
        html = extractor.getHTML()
        #print(html)
        link1 = re.findall(r"<A\shref=\"/artikel/(\d{4}/\d{2}/.*?)\"", html)
        #print(link1)
        links = set(link1)
        #print(links)
        for link in links:
            try:
                art_url = 'https://jungle.world/artikel/' + link
                #print(art_url)
                extr = Extractor(extractor='ArticleExtractor', url=art_url)
                text = extr.getText()
                #print(text)
                retrieved += 1
                write_to_files(text, art_url, retrieved)
                print("Extracted {}: {}".format(retrieved, art_url))
            except Exception as e:
                print(e)
        page += 1
Example #9
def get_contenthtml_by_html(html):
    """
        调用开源的正文抽取工具boilerpipe,实现新闻类、论坛类、政府网站类
            网页的正文抽取的工具函数(推荐)
        :param url:待解析网页源码
        :returns:str— —正文信息得到突出的网页源码
    """
    extractor = Extractor(extractor='ArticleExtractor', html=html)
    highlighted_html = extractor.getHTML()
    return highlighted_html
Example #10
def get_contenthtml_by_url(url):
    """
        调用开源的正文抽取工具boilerpipe,实现新闻类、论坛类、政府网站类
            网页的正文抽取的工具函数(不推荐、编码识别有时出问题)
        :param url:待解析网页链接
        :returns:str— —正文信息得到突出的网页源码
    """
    extractor = Extractor(extractor='ArticleExtractor', url=url)
    highlighted_html = extractor.getHTML()
    return highlighted_html
Example #11
def test_boilerpipe():
    your_url = "http://stackoverflow.com/questions/9352259/trouble-importing-boilerpipe-in-python"
    extractor = Extractor(extractor='ArticleExtractor', url=your_url)
    extracted_html = extractor.getHTML()
    extracted_text = extractor.getText()

    print '\nfunction: %s ' % inspect.stack()[0][3]
    print 'extracted  html: %i text: %i' % (len(extracted_html), len(extracted_text))
    print ''
    n.assert_greater(len(extracted_text), min_str_length)
Example #12
def bp_treatement(input_file, output_file):
    """
    Defines the specific BoilerPipe treatment to perform from the input file to the output file.
    """
    if input_file.read():
        input_file.seek(0)
        extractor = Extractor(extractor="ArticleExtractor", html=input_file.read())
        output_file.write(extractor.getHTML())
    else:
        output_file.write(" ")
Example #13
def category_extract(category_url, category, start_num=0, end_num=100):
    num = start_num

    while True:
        print(num)

        url = category_url + "?s=" + str(num)
        extractor = Extractor(extractor='KeepEverythingExtractor', url=url)
        html = extractor.getHTML()
        links = re.findall(r"<A\shref=\"(/artikel/.*?)\">", html)
        for link in links:
            try:
                article_url = "https://www.neues-deutschland.de" + link
                text, metadata = extract_article(article_url)
                match = re.match(r"/artikel/(.*?)\.html", link)
                text_file_name = os.path.join("nd_texts", match.group(1))
                with open(text_file_name, "w") as textfile:
                    textfile.write(text)
                with open("metadata_nd.csv", "a") as metafile:
                    author, date, keywords = metadata
                    if date:
                        date = date.split(".")
                        date_str = date[2] + "-" + date[1] + "-" + date[0]
                    else:
                        date_str = "None"
                    line = text_file_name +\
                        " " +\
                        article_url +\
                        " " +\
                        date_str +\
                        " " +\
                        "|" + str(author) + "|" +\
                        " " +\
                        category +\
                        " " +\
                        "|radically left|" +\
                        " |"
                    for keyword in keywords:
                        line += keyword + " "
                    if keywords != []:
                        line = line[:-1]
                    line += "|\n"
                    metafile.write(line)
            except Exception as e:
                print(e)

        num += 25

        if links == [] or num == end_num:
            break
Example #14
def Text_extractor(y, page, team, team_i):
    """Extract the text of team pages using BoilerPipe."""
    upage = urllib.quote_plus(page)
    url = "http://" + y + ".igem.org/wiki/index.php?title=" + upage
    extractor = Extractor(extractor='ArticleExtractor', url=url)
    f = open('results/%s/%s/%s_-_-_CONTENT.html' % (y, team, page.replace('/', '#')), 'w')
    f.write(extractor.getHTML())
    f.close()
    f = open('results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#')), 'w')
    f.write(extractor.getText())
    f.close()
    path = 'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#'))
    # text = text.replace('\\n', '\\\\n')
    output = '%s\t%s\t%s\t%s\n' % (y, str(teams_id[team_i]), page, path)
    teams_pages_text_db.write(output)
Example #15
 def parse(self, response):
     for article in response.xpath('//channel/item'):
         item = ArticleItem()
         # Grab the title and the link to the article
         item ["title"] = article.xpath("title/text()").extract()
         item ["link"] = article.xpath("link/text()").extract()
         item ["date"] = article.xpath("pubDate/text()").extract()
         
         link = item["link"][0]
         
         extractor = Extractor(extractor='ArticleExtractor', 
                 url=link)
         item ["text"] = extractor.getText()
         item ["html"] = extractor.getHTML()
         # Grab the source of the page by making another Request
         yield Request(link,callback = self.parse_link, meta = dict(item = item))
Example #16
  def extract_boilerpipe(self, html):
    """ 
    Extract an article with Boilerpipe 
    
    NOTE: This is an optional method as 
    boilerpipe is dependency-heavy and
    will be potentially cumbersome 
    to run on manta.
    """
    try:
      from boilerpipe.extract import Extractor
    except ImportError:
      return 

    bp_extract = Extractor(html=html)
    return bp_extract.getHTML()
Example #17
 def extract_content(page_id, ext_id, htmlReturn=False): # htmlReturn=False: by default returns text content
     if page_id in (None, "") or ext_id in (None, ""): return badrequest()
     page = Page.get_page(page_id)
     if page is None: return documentnotfound()
     extraction = Extraction.get_extraction(ext_id)
     if extraction is None: return documentnotfound()
     original_content = page.content
     if original_content is None or original_content == "": return nocontent()
     
     if not jpype.isThreadAttachedToJVM():
         jpype.attachThreadToJVM()
     extractor = Extractor(extractor='DefaultExtractor', html=original_content)
     if not htmlReturn:
         bp_content = extractor.getText()
     else:
         bp_content = extractor.getHTML()
     if bp_content is None: return nocontent()
     
     extraction.update(bp_content=bp_content)
     return success()
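Because boilerpipe runs inside a JVM started through JPype, the example above attaches the current thread to the JVM before constructing the Extractor; calling boilerpipe from a worker thread that was never attached can otherwise fail. A minimal sketch of the same guard, reusing only the calls already shown above (the wrapper function name is hypothetical):

import jpype
from boilerpipe.extract import Extractor

def extract_in_worker_thread(html):
    # Attach this (non-main) thread to the JVM before touching boilerpipe,
    # mirroring the guard used in extract_content() above.
    if not jpype.isThreadAttachedToJVM():
        jpype.attachThreadToJVM()
    return Extractor(extractor='DefaultExtractor', html=html).getText()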
Example #18
    def build_news_article_from_url(source_url, sNLP):
        """build new article object from source url, if build fail would return None
        """
        try:
            print('start to scrape from url: ', source_url)

            # pre-process news by NewsPaper3k and Boilerpipe library
            article = Article(source_url, keep_article_html=True)
            article.build()
            article.nlp()
            e = Extractor(extractor='DefaultExtractor', html=article.html)
            article.text = e.getText()
            article.article_html = e.getHTML()

            news_article = NewsArticle(article, sNLP)
            print('success to scrape from url: ', source_url)
            return news_article
        except Exception as e:
            print('fail to scrape from url: ', source_url)
            print('reason:', e)
            return None
Example #20
#coding: utf-8
import sys
import jieba
from boilerpipe.extract import Extractor
reload(sys)
sys.setdefaultencoding('utf-8')
extractor = Extractor(
    extractor='ArticleExtractor',
    url="http://news.scut.edu.cn/s/22/t/3/82/0a/info33290.htm")
processed_plaintext = extractor.getText()
highlighted_html = extractor.getHTML()
segList = jieba.cut(processed_plaintext, cut_all=False)
print "/".join(segList)
print processed_plaintext
Example #21
from boilerpipe.extract import Extractor
import os

directoryEntree = r'/home/romaric/PycharmProjects/scrapping/Corpus_detourage/html/'
outputDirectory = r'/home/romaric/PycharmProjects/scrapping/BP/'

for f in os.listdir(directoryEntree):

    completeName = os.path.join(outputDirectory, f)
    fichierEntree = open(directoryEntree + f,
                         "r",
                         encoding="utf8",
                         errors="ignore")
    fichierSortie = open(outputDirectory + f,
                         "w",
                         encoding="utf8",
                         errors="ignore")
    extracteur = Extractor(extractor='ArticleExtractor',
                           html=fichierEntree.read())
    fichierSortie.write(extracteur.getHTML())
Example #22
        except:
            continue

        tree = etree.tostring(document)
        cleantree = tree.decode("utf8").replace("&#160;", " ")
        cleantree = cleantree.replace("\t", " ")

        # lang id
        lang = guess_lang_from_data2(cleantree)
        if len(languages) > 0 and lang not in languages:
            logging.info("Language of document " + url + ": " + lang + ". Not among searched languages.")
        else:
            # If enabled, remove boilerplate HTML
            if options.boilerpipe:
                extractor = Extractor(extractor='ArticleExtractor', html=cleantree)
                deboiled = extractor.getHTML()
            else:
                deboiled = cleantree

            # We compute MD5 on the HTML (either normalized one or after boilerpipe if enabled): if we get duplicate
            # files we discard them
            c = hashlib.md5()
            c.update(deboiled.encode())
            # print("hash", c.hexdigest(), url)

            # checking for duplicate content (duplicates are discarded)
            if c.hexdigest() in seen_md5:
                logging.info("Repeated file:\t" + url + "\tfirst occurrence\t" + seen_md5[c.hexdigest()])
                pass
            else:
                # If enabled get text with Alcazar library
Example #23
 def update_content_by_url(self):
     from boilerpipe.extract import Extractor
     extractor = Extractor(extractor='ArticleExtractor', url=self.url)
     self.content_html = extractor.getHTML()
     self.content_text = extractor.getText()
Example #24
def ExtractPolicyTextWithBoilerpipe(policyUrl, extractorType = 'ArticleExtractor', verbose = False, minLinesPerPolicy = 30):
  if verbose:
    if policyUrl == '-':
      print 'ExtractPolicyTextWithBoilerpipe called with policyUrl = {0}. do nothing.'.format(policyUrl)
    else:
      print 'extracting policy text from {0} using {1}'.format(policyUrl, extractorType)

  # trivial return
  if policyUrl == '-':
    return (None, None)
  
  try:
    if policyUrl.startswith('http'):
      extractor = Extractor(extractor=extractorType, url=policyUrl)
        
    # the policyUrl may also be a local file path
    else:
      contentFile = open(policyUrl, 'r')
      extractor = Extractor(extractor=extractorType, html=contentFile.read().decode('utf8'))
    html = extractor.getHTML()
    text = extractor.getText()
    
    if len(text.split(u'\n')) > minLinesPerPolicy:
      if verbose:
        print 'OK'
      text = text.replace(u'\n', u'  ')
      return (text, html)
    elif len(text) > 0 and len(html) > 0:
      print 'Policy {1} ignored. Number of paragraphs in extracted policy is less than {0}.'.format(minLinesPerPolicy, policyUrl)
      return (None, None)
    else:
      print 'boilerpipe extracted nothing from {0}'.format(policyUrl)
      return (None, None)
  except TypeError as e:
    print 'TypeError thrown while using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
    return (None, None)
  except socket.error as e:
    print 'socket.error thrown while using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
    return (None, None)
  except BadStatusLine as e:
    print 'httplib.BadStatusLine thrown while using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
    return (None, None)
  except IncompleteRead as e:
    print 'httplib.IncompleteRead thrown while using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
    return (None, None)
  except LookupError as e:
    print 'LookupError using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
    return (None, None)
  except UnicodeDecodeError as e:
    print 'UnicodeDecodeError using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
    return (None, None)
  except ValueError as e:
    print 'ValueError using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
    return (None, None)
  except urllib2.HTTPError as e:
    print 'HTTPError using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
    return (None, None)
  except urllib2.URLError as e:
    print 'URLError using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
    return (None, None)
  except socket.timeout as e:
    print 'socket.timeout thrown while using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
    return (None, None)
Example #26
        mimeFile = open_xz_or_gzip(options.outDir + "/" + lang + "/mime." + options.compression, "w")
        normHtmlFile = open_xz_or_gzip(options.outDir + "/" + lang + "/normalized_html." + options.compression, "w")
        plainTextFile = open_xz_or_gzip(options.outDir + "/" + lang + "/plain_text." + options.compression, "w")
        if options.boilerpipe:
            deboilFile = open_xz_or_gzip(options.outDir + "/" + lang + "/" + "deboilerplate_html." + options.compression, "w")
            files_dict[lang] = {"urlFile": urlFile, "encodingFile": encodingFile, "mimeFile": mimeFile, "normHtmlFile": normHtmlFile, "plainTextFile": plainTextFile, "deboilFile": deboilFile}
        else:
            if not os.path.exists(options.outDir + "/" + lang + "/" + "deboilerplate_html." + options.compression) and not os.path.islink(options.outDir + "/" + lang + "/" + "deboilerplate_html." + options.compression):
                os.symlink("normalized_html." + options.compression, options.outDir + "/" + lang + "/" + "deboilerplate_html." + options.compression)
            files_dict[lang] = {"urlFile": urlFile, "encodingFile": encodingFile, "mimeFile": mimeFile, "normHtmlFile": normHtmlFile, "plainTextFile": plainTextFile}
    
    # If enabled, remove boilerplate HTML
    if options.boilerpipe:
        logging.info(url + ": deboiling html")
        extractor = ExtrB(extractor='ArticleExtractor', html=text)
        deboiled = str(extractor.getHTML())
    else:
        deboiled = text

    # We compute a hash on the HTML (either normalized one or after boilerpipe if enabled):
    # if we get duplicate files we discard them
    html_hash = mmh3.hash(deboiled, signed=False)
    # checking for duplicate content (duplicates are discarded)
    if html_hash in seen_html:
        logging.info("Repeated file:\t" + url)
        continue

    # get text with Alcazar library
    if options.parser == "alcazar":
        logging.info(url + ": Getting text with Alcazar")
        btext = alcazar.bodytext.parse_article(deboiled)
Example #27
def get_basic():
    url = request.args.get('url')
    extractor = Extractor(extractor='ArticleExtractor', url=url)
    return extractor.getHTML()
Example #28
# -*- coding: utf-8 -*-
"""
 @Time: 2019/6/26 14:33
"""

import time
import os
import sys

from boilerpipe.extract import Extractor

url = "http://www.sohu.com/a/299667318_501931"
extractor = Extractor(extractor='ArticleExtractor', url=url)

# extractor = Extractor(extractor='ArticleExtractor', html=html)

# extractor = Extractor(url=url)

extracted_text = extractor.getText()

extracted_html = extractor.getHTML()

print(extracted_html)
Example #29
     html = urllib.urlopen(eachurl).read()
     content = Document(html).summary()
     title = Document(html).short_title()
 except:
     print 'Failed URl %s' % eachurl
     content = '_'
     title = '_'
 body_score[-1].append(fscore(word_tokenize(content), data))
 title_score[-1].append(fscore(word_tokenize(title), title_true))
 ############################################################################################
 print 'Boilerpipe...'
 try:
     article = Extractor(url=eachurl)
     title = '_'
     #title = article.getTitle()
     content = article.getHTML()
 except:
     print 'Failed URl %s' % eachurl
     content = '_'
     title = '_'
 body_score[-1].append(fscore(word_tokenize(content), data))
 title_score[-1].append(fscore(word_tokenize(title), title_true))
 ######################################################################################
 print 'libextract...'
 # html = urllib.urlopen(eachurl).read()
 textnodes = list(extract(html))
 try:
     content = ' '.join(each.text_content() for each in textnodes[:5])
 except:
     print 'Not combining unicode %s' % eachurl
     content = '_'
Example #30
from boilerpipe.extract import Extractor
import argparse
import re

parser = argparse.ArgumentParser(description = 'Text content extractor')
parser.add_argument('--urls', type=str, required=True,
    help='A new-line separated list of URLS to extract content from')
parser.add_argument('--outputdirhtml', type=str, required=True,
    help='The HTML output dir to store the content from each URL')
parser.add_argument('--outputdirtxt', type=str, required=True,
    help='The text output dir to store the content from each URL')
args = parser.parse_args()

with open(args.urls, 'r') as url_file:
    for url in url_file:
        url = url.strip()
        url_base_filename = re.sub('[^a-zA-Z0-9_\.\-]', '_', url)
        url_txt_filename = url_base_filename + ".txt"
        url_html_filename = url_base_filename + ".html"
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        url_txt_contents = extractor.getText()
        url_html_contents = extractor.getHTML()
        with open(args.outputdirtxt + '/' + url_txt_filename, 'w') as txt_content_file:
            txt_content_file.write(url_txt_contents)
        with open(args.outputdirhtml + '/' + url_html_filename, 'w') as html_content_file:
            html_content_file.write(url_html_contents)

Example #31
        cleanhtml = cleaner.clean_html(
            re.sub(r'encoding *= *"[^"]+"', '', text, flags=re.IGNORECASE))
        document = html5lib.parse(ftfy.fix_text(cleanhtml),
                                  treebuilder="lxml",
                                  namespaceHTMLElements=False)
        tree = etree.tostring(document)
        cleantree = tree.decode("utf8")
        cleantree = cleantree.replace("\t", " ")

        file = open(options.normhtml, "w")
        file.write(cleantree)
        file.close()

        extractor = Extractor(extractor='ArticleExtractor', html=cleantree)
        extracted_html = extractor.getHTML()
        file = open(options.deboiled, "w")
        file.write(extracted_html)
        file.close()

        deboiledFile = open(options.deboiled, "r")
        html = deboiledFile.read()
        deboiledFile.close()

        # get text
        if options.alcazar:
            text = alcazar.bodytext.parse_article(cleantree)
            if text.body_text:
                text = text.body_text
            else:
                text = ""
Example #32
				elif obj =='num' and args.result !="":
					query += source[obj]
					query += args.result
				elif obj =='lang' and args.lang !="":
					query += source[obj]
					query += args.lang
				#elif obj =='sortby' and args.sortby !="":
				#	query += source[obj]
				#	query += args.sortby 
				else:
					query += source[obj]
					query += source[obj+'_def']
			#retrieve HTML page of the URL source
			try:
				extractor		= Extractor(extractor='ArticleExtractor', url=query)
				extracted_html	= extractor.getHTML()
			except:
				e = sys.exc_info()[0]
				print("\n***ERROR (in main.py, extractor 1): "+str(e))
				# sleep for 4 seconds before trying to crawl again, otherwise you will be identified and blocked
				time.sleep(4)
				continue

			#retrieve URLs from the HTML page
			doc	= lxml.html.document_fromstring(extracted_html)
			urlList = list()
			for url in doc.xpath(XPATH):
				url_tmp = str(url.attrib.get('href'))
				if not 'http' in url_tmp:
					url_tmp = source['url']+url_tmp
				urlList.append(url_tmp)