Example #1
def traverse(outlet_url, num=5000):
    retrieved = 0
    page = 1
    while retrieved < num:
        current_url = "{}?page={}".format(outlet_url, page)
        #print(current_url)
        extractor = Extractor(extractor='KeepEverythingExtractor',
                              url=current_url)
        html = extractor.getHTML()
        #print(html)
        link1 = re.findall(r"<A\shref=\"/artikel/(\d{4}/\d{2}/.*?)\"", html)
        #print(link1)
        links = set(link1)
        #print(links)
        for link in links:
            try:
                art_url = 'https://jungle.world/artikel/' + link
                #print(art_url)
                extr = Extractor(extractor='ArticleExtractor', url=art_url)
                text = extr.getText()
                #print(text)
                retrieved += 1
                write_to_files(text, art_url, retrieved)
                print("Extracted {}: {}".format(retrieved, art_url))
            except Exception as e:
                print(e)
        page += 1
Example #2
def traverse(outlet_url, num=3000):
    retrieved = 0
    page = 1
    while retrieved < num:

        if page < 2:
            page_url = outlet_url
        else:
            page_url = "{}page/{}".format(outlet_url, page)

        extractor = Extractor(extractor='KeepEverythingExtractor',
                              url=page_url)
        html = extractor.getHTML()
        links = re.findall(r"<A\shref=\".*de\/(\d{4}\/\d{2}\/\d{2}\/.*)\" rel",
                           html)

        for link in links:
            try:
                art_url = outlet_url + link
                extr = Extractor(extractor='ArticleExtractor', url=art_url)
                text = extr.getText()
                retrieved += 1
                write_to_files(text, art_url, retrieved)
                print("Extracted {}: {}".format(retrieved, art_url))

            except Exception as e:
                print(e)
        page += 1
Example #3
def extract(args):
    if not os.path.isfile("articles.json"):
        print("File articles.json does not exist")
        print("Have you already crawled?")
        exit()

    with open("articles.json") as article_list:
        articles = [
            json.loads(line) for line in article_list.read().splitlines()
        ]

    for article in articles:
        if args.html:
            with open(article['path'], "rb") as html:
                extractor = Extractor(extractor='ArticleExtractor',
                                      html=html.read())
        else:
            extractor = Extractor(extractor='ArticleExtractor',
                                  url=article['url'])

        dirname = os.path.join("articles", article['domain']) + "/text"
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        filename = sha1(article['url'].encode("utf-8")).hexdigest() + '.txt'
        path = os.path.join(dirname, filename)

        with open(path, "wb+") as extracted_text:
            extracted_text.write(extractor.getText().encode("utf-8"))
Example #4
 def boilerpipe_text(cls,url_in=None,html_in=None,extractor='ArticleExtractor'):
     assert (url_in!=None) != (html_in!=None) # one, not both
     inp=url_in or html_in
     if url_in:
         extractor = Extractor(extractor=extractor, url=inp)
     else:
         extractor = Extractor(extractor=extractor, html=inp)
     return HtmlTextCleaner().spec_text_cleaner(extractor.getText())
Example #5
    def extractHTML(self, url=None, html=None, extractor='ArticleExtractor'):
        cherrypy.response.headers['Content-Type'] = "text/json"

        if url:
            extractor = Extractor(extractor=extractor, url=url)
            extracted_html = extractor.getHTML()
            return json.dumps({'url': url, 'extractedHTML': extracted_html})
        elif html:
            extractor = Extractor(extractor=extractor, html=html)
            extracted_html = extractor.getHTML()
            return json.dumps({
                'html': html[:15],
                'extractedHTML': extracted_html
            })
Example #6
def extract_cleaned_text_from_url(url: str,
                                  proxy: dict = None) -> Union[str, None]:
    """Extracts the main text from a URL, using boilerpipe if it is HTML and tika if it is a PDF.

    Args:
        url: A URL to extract text from
        proxy: Proxy settings for requests

    Returns:
        Extracted text
    """

    if 'pdf' in url:
        tika.TikaClientOnly = True
        extracted = _extract_cleaned_text_from_pdf(url)
    else:
        kwargs = {'url': url}
        if 'promed' in url:
            html = get_html_from_promed_url(url, proxy)
            kwargs = {'html': html}
        try:
            extracted = Extractor(extractor='ArticleExtractor',
                                  **kwargs).getText()
        except Exception as e:
            print(f'{url} caused {e}')
            extracted = None
    return _remove_control_characters(extracted)
Example #7
def extract_and_save(url, path):
    try:
        handle = urllib2.urlopen(url)
        html_content = handle.read()
        extractor = Extractor(extractor='KeepEverythingExtractor',
                              html=html_content)
        text = extractor.getText()
        if text:
            if detect_english(text):
                links = get_all_urls(html_content, url)
                for link in links:
                    try:
                        handle = urllib2.urlopen(link)
                        html_content = handle.read()
                        #extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
                        #text_content = extractor.getText()
                        #if text_content:
                        #	if detect_english(text_content):
                        encoded_url = encode(link)
                        f = open(path + "/" + encoded_url, "w")
                        f.write(html_content)
                        f.close()
                    except:
                        print(url)
                        traceback.print_exc()
                        return None
    except:
        print(url)
        traceback.print_exc()
        return None
Example #8
def articles_from_feed():
    articles = []

    feed = feedparser.parse(rss_fakt)
    for item in feed["items"]:
        url = convert_url(item["link"])
        print(item["published"])
        print(url)
        try:
            extractor = Extractor(extractor="ArticleExtractor", url=url)

            date = email.utils.parsedate_tz(item["published"])
            timestamp = email.utils.mktime_tz(date)
            iso = datetime.datetime.utcfromtimestamp(timestamp).isoformat()
            filename = url.split(",")[-1].split(".")[0]

            data = {
                "text": extractor.getText(),
                "date": iso,
                "url": url,
                "filename": filename
            }
        except Exception as e:
            print("Error downloading article from " + url)
            continue
        articles.append(data)
    return articles
Example #9
 def ParserBoilerEverything(html_object):
     extractor = Extractor(extractor='DefaultExtractor', html=html_object)
     sents = extractor.getText()
     try:
         return sents
     except Exception as e:
         return
Example #10
def extract_metadata(url):
    extractor = Extractor(extractor='KeepEverythingExtractor', url=url)
    text = extractor.getText().split("\n")
    author = None
    date = None
    keywords = []
    find_keywords = False
    for line in text:
        #author
        match = re.match(r"Von\s(\w+\s\w+)(,\s[\s\w]*$|$)", line)
        if match:
            author = match.group(1)
            continue

        #date
        match = re.match(r"([0-9]{2}\.[0-9]{2}\.[0-9]{4})$", line)
        if match:
            date = match.group(1)
            continue

        #keywords
        if find_keywords:
            match = re.match(r"Hat\sIhnen\sdieser\sArtikel\sgefallen.*", line)
            if match:
                find_keywords = False
                continue
            else:
                keywords.append(line) 
            
        match = re.match("Schlagwörter zu diesem Artikel:", line)
        if match:
            find_keywords = True
        
    return author, date, keywords
Example #11
def extract_article_text(url):
    if url in utils.BROKEN_URLS or any([True for sd in BAD_SUBDOMAINS if sd in url]):
        return ""

    while True:
        try:
            extractor = Extractor(extractor='ArticleExtractor', url=url)
            break
        except socket.timeout:
            print("got socket.timeout on url: {}. retrying...".format(url), file=utils.stddbg)
        except URLError as e:
            if e.reason == "timed out":
                print("got urllib 'timed out' on url {}. retrying...".format(url), file=utils.stddbg)
            elif hasattr(e.reason, "strerror") and e.reason.strerror == 'getaddrinfo failed':
                print("got urllib 'getaddrinfo failed' on url {}. retrying...".format(url), file=utils.stddbg)
            elif e.code == 503:
                print("got urllib 503 error on url {}. retrying...".format(url), file=utils.stddbg)
            else:
                if not hasattr(e, "url"):
                    e.url = url
                raise
        except Exception as e:
            e.url = url
            raise e

    text = str(unicodedata.normalize('NFKD', (str(extractor.getText()))).encode('ascii', 'ignore'))
    return filter_junk(text)
Example #12
def post_index(post):
    extractor = Extractor(extractor='ArticleExtractor', url=post['href'])
    post_text = extractor.getText().replace('\n', ' ')
    url = 'http://localhost:9200/bookmarks/bookmark/%s/_create' % post['hash']
    data = '{"title":"%s", "url":"%s", "text":"%s"}' % (post['description'], post['href'], post_text.replace('"', '\\"'))
    r = requests.put(url, data=data)
    print(r.status_code)
Example #13
def reader_processor(csv, cols=cols):
    """
    Takes as input a CSV that joins stats.csv and articles.csv,
    named 'day<date of experiment>.csv'.

    entities: the list of entities for each individual article.
    total: all of the entities identified, across all articles.
    """
    df = pd.read_csv(csv, names=cols)
    total = []
    entites = []
    for i, row in df.iterrows():
        #MINE ONLY SPECIFIC ARTICLES [>=6 (12 seconds)]
        if (row["counter"] >= 6):
            ## FORM A NEW TEXT
            text = row["articleBody"] + " " + row["headline"]
            #### if readmore has been clicked
            if (row["readmore"] == 1):
                try:
                    #use of boilerpipe
                    external = Extractor(extractor='ArticleExtractor',
                                         url=row["source"]).getText()
                except:
                    external = ''
                text = text + " " + external
            entity, w_entity = entity_extractor(text, row["counter"])
            entites.append(entity)
            total.extend(w_entity)

    return entites, total
Example #14
def extract_article_content(html, url):
    """
    Disclaimer
    ----------
    Copied from
    https://github.com/turi-code/how-to/blob/master/
            extract_article_content_from_HTML.py
    Description
    ----------
    Extract the primary textual content from an HTML news article.
    In many cases, the HTML source of news articles is littered with
    boilerplate text that you would not want to include when doing text
    analysis on the content of the page. Even if you could write some rules to
    extract the content from one page, it's unlikely that those rules would
    apply to an article from another site. The boilerpipe module allows us to
    solve this problem more generally.
    Parameters
    ----------
    html : str
        The source HTML from which to extract the content.
    url : str
        The url, needed for logging purposes only
    Returns
    -------
    out : str
        The primary content of the page with all HTML and boilerplate text
        removed.
    Examples
    --------
    >>> extract_article_content(
            "<html><body><p>Turi is in the business of building the best " \
            "machine learning platform on the planet. Our goal is to make " \
            "it easy for data scientists to build intelligent, predictive " \
            "applications quickly and at scale. Given the perplexing array " \
            "of tools in this space, we often get asked "Why Turi? What " \
            "differentiates it from tools X, Y, and Z?" This blog post aims " \
            "to provide some answers. I’ll go into some technical details " \
            "about the challenges of building a predictive application, and " \
            "how Turi’s ML platform can help.</p></body></html>")
    >>> Turi is in the business of building the best " \
            "machine learning platform on the planet. Our goal is to make " \
            "it easy for data scientists to build intelligent, predictive " \
            "applications quickly and at scale. Given the perplexing array " \
            "of tools in this space, we often get asked "Why Turi? What " \
            "differentiates it from tools X, Y, and Z?" This blog post aims " \
            "to provide some answers. I’ll go into some technical details " \
            "about the challenges of building a predictive application, and " \
            "how Turi’s ML platform can help.
    See Also
    --------
    - `Boilerpipe project <https://code.google.com/p/boilerpipe/>`_
    - `Boilerpipe Python module <https://pypi.python.org/pypi/boilerpipe>`_
    """
    from boilerpipe.extract import Extractor
    if html and html.strip():
        try:
            extractor = Extractor(extractor='ArticleExtractor', html=html)
            return extractor.getText()
        except Exception as e:
            error = "Function extract_article_content: " + url + " - " + str(e)
Example #15
def scrape(file, split1, split2, urlName):
    links_from_RSS_feed = []
    Requests_from_RSS = requests.get(
        'http://feeds.reuters.com/reuters/businessNews')
    Rss_soup = BeautifulSoup(Requests_from_RSS.text, "html5lib")

    lFile = open(file, "r")
    usedLinks = [line.strip() for line in lFile]
    lFile.close()

    for link in Rss_soup.find_all('guid'):
        links_from_RSS_feed.append(
            str(link.getText().replace('?feedType=RSS&feedName=businessNews',
                                       '')))

    l_file = open(file, "w")
    for item in links_from_RSS_feed:
        l_file.write(str(item) + "\n")
    l_file.close()

    no_of_links = len(links_from_RSS_feed)

    for i in range(0, no_of_links):
        fileName = links_from_RSS_feed[i].rsplit('/', split1)[split2]
        extractedText = Extractor(extractor='ArticleExtractor',
                                  url=urlName + fileName)
        print(fileName)
        write_file = open("Data/" + str(i) + ".txt", "w")
        write_file.write(str(datetime.date.today()) + "\n")
        write_file.write(str(extractedText.getText().encode("utf-8")))
        write_file.close()
    return no_of_links
Example #16
 def scrap_link_boilerpipe(url):
     try:
         extractor = Extractor(extractor='ArticleSentencesExtractor',
                               url=url)
         return extractor.getText()
     except:
         return False
Example #17
def scrape(feed, used, excep, split1, split2, urlName, nameF):
	arrLinks = []
	req = requests.get('http://feeds.reuters.com/reuters/businessNews')
	soupRss = BeautifulSoup(req.text, "html5lib")

	logrFile = open(used,"r")
	usedLinks = [line.strip() for line in logrFile]
	logrFile.close()

	for link in soupRss.find_all('guid'):
		arrLinks.append(str(link.getText().replace('?feedType=RSS&feedName=businessNews', '')))

	log_file = open(used,"w")
	for item in arrLinks:
		log_file.write(str(item)+"\n")
	log_file.close()

	for i in range(0, 8):
		fileName = arrLinks[i].rsplit('/', split1)[split2]
		#if any(fileName in s for s in usedLinks):
		#	print fileName +" has been extracted."
		#else:
		extractedText = Extractor(extractor='ArticleExtractor', url=urlName+fileName)
		print fileName
		write_file = open("Data/"+str(i)+".txt","w")
		write_file.write(str(datetime.date.today()) + "\n")
		write_file.write(str(extractedText.getText().encode("utf-8")))
		write_file.close()
Example #18
    def extract_body_with_boilerpipe(html):
        """
        Extractor types:
                DefaultExtractor
                ArticleExtractor
                ArticleSentencesExtractor
                KeepEverythingExtractor
                KeepEverythingWithMinKWordsExtractor
                LargestContentExtractor
                NumWordsRulesExtractor
                CanolaExtractor
        Reference: https://github.com/misja/python-boilerpipe
        Note: set JAVA_HOME if import fails

        Returns
        --------
        str: extracted body text. Return empty string if extraction fails
        """
        try:
            extractor = Extractor(extractor='KeepEverythingExtractor',
                                  html=html)
            extracted_text = extractor.getText()
        except:
            print("Failed to extract text with boilerpipe")
            extracted_text = ""

        return extracted_text
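
The docstring above enumerates the extractor strategies that python-boilerpipe exposes and notes the JAVA_HOME requirement. As a minimal standalone sketch (assuming python-boilerpipe is installed and a JVM is available; the sample HTML string here is invented purely for illustration), the same constructor pattern can be used to compare a few of those strategies side by side:

from boilerpipe.extract import Extractor

SAMPLE_HTML = (
    "<html><body>"
    "<nav>Home | About | Contact</nav>"
    "<p>Some article text surrounded by navigation and footer boilerplate.</p>"
    "<footer>Copyright notice</footer>"
    "</body></html>"
)

# Run a few of the extractor strategies listed in the docstring above on the
# same document and report how much text each strategy keeps.
for name in ('DefaultExtractor', 'ArticleExtractor', 'KeepEverythingExtractor'):
    try:
        text = Extractor(extractor=name, html=SAMPLE_HTML).getText()
        print("{}: {} characters kept".format(name, len(text)))
    except Exception as e:
        print("{} failed: {}".format(name, e))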
Example #19
 def process_item(self, html_page):
     try:
         publish_date = examine(html_page['html'])
         from boilerpipe.extract import Extractor
         extractor = Extractor(extractor='ArticleExtractor',
                               html=html_page['html'])
         body = str(extractor.getText())
         title = str(extractor.source.getTitle())
         art = {
             'title': title,
             'body': body,
             'lang': self.lang,
             'source': html_page['source'],
             'url': html_page['url'],
             'crawl_date': html_page['timestamp'],
             'publish_date': publish_date,
             'article_id': sha1(html_page['url'].encode('utf-8')).hexdigest(),
             'sentences': []
         }
         if self.art_ok(art['body']):
             content = art['body']
             content = content.replace(u'\xa0', u' ')
             content = content.replace('\\n', '\n')
             sents = []
             if self.lang == 'en':
                 sents = sent_tokenize(content)
             else:
                 for para in content.split('\n'):
                     sents += sentence_split(para, self.lang)
                 sents = [sent for sent in sents if self.check_sent(sent)]
             art['sentences'] = sents
             if len(sents) >= 3:
                 self.output_corpus.add_instance(art)
     except Exception as e:
         pass
Example #20
def Text_extractor(y, page, team, team_i, counter=0):
    """Extract the text of team pages using BoilerPipe."""
    try:
        upage = urllib.parse.quote_plus(page)
        url = "http://" + y + ".igem.org/wiki/index.php?title=" + upage
        extractor = Extractor(extractor='ArticleExtractor', url=url)
    except:
        counter += 1
        if counter > 10:
            print("Failed to get the text for page {}".format(page))
            return None
        return Text_extractor(y, page, team, team_i, counter=counter)
    f = open(
        'results/%s/%s/%s_-_-_CONTENT.html' %
        (y, team, page.replace('/', '#')), 'w')
    f.write(extractor.getHTML())
    f.close()
    f = open(
        'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#')),
        'w')
    f.write(extractor.getText())
    f.close()
    path = 'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#'))
    # text = text.replace('\\n', '\\\\n')
    output = '%s\t%s\t%s\t%s\n' % (y, str(teams_id[team_i]), page, path)
    teams_pages_text_db.write(output)
Example #21
def scrape(feed, used, excep, split1, split2, urlName, nameF):
    arrLinks = []
    req = requests.get('http://feeds.reuters.com/reuters/businessNews')
    soupRss = BeautifulSoup(req.text, "html5lib")
    # Checks list of already queried links
    logrFile = open(used, "r")
    usedLinks = [line.strip() for line in logrFile]
    logrFile.close()
    # Extracts links from inital feed, excluding non-news
    for link in soupRss.find_all('guid'):
        arrLinks.append(
            str(link.getText().replace('?feedType=RSS&feedName=businessNews',
                                       '')))
    # Store currently extracted links as not to repeat
    log_file = open(used, "w")
    for item in arrLinks:
        log_file.write(str(item) + "\n")
    log_file.close()
    # Extracts stripped news content with timestamp, omitting used links
    for item in arrLinks:
        fileName = str(item.rsplit('/', split1)[split2])
        if any(fileName in s for s in usedLinks):
            print(fileName + " has been extracted.")
        else:
            extractedText = Extractor(extractor='ArticleExtractor',
                                      url=urlName + fileName)
            print(fileName + ": New")
            write_file = open("extractedFiles/" + nameF + fileName + ".txt",
                              "w")
            write_file.write(str(datetime.date.today()) + "\n")
            write_file.write(str(extractedText.getText().encode("utf-8")))
            write_file.close()
Example #22
File: extract.py  Project: remram44/memex
def get_text_boilerpipe(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        return extractor.getText()
    except:
        print("Exception")
        return None
Example #23
    def extract_main_text(self):
        if self.res is None:
            return None

        extractor = Extractor(  # extractor='ArticleExtractor',
            url=self.url)
        return [extractor.getText()]
Example #24
File: scrap.py  Project: remram44/memex
def extract_text(html_content):
  try:
    extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
    #print extractor.getText()
    return extractor.getText()
  except:
    print("Exception in html extraction")
    return None
Example #25
def html2text_bp(html):
    text = None
    try:
        extractor = Extractor(extractor=extractor_type, html=html)
        text = extractor.getText()
    except:
        traceback.print_exc()
    return text
Example #26
def get_text_boilerpipe(url):
    try:
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        extracted_text = extractor.getText()
    except BaseException as error:
        extracted_text = 'error: {}'.format(error)
        print('error: {}'.format(error))
    return extracted_text
Example #27
def sentences_from_urls(url: str, extractor_name=EXTRACTORS[0], model=MODELS[0],
                        min_words=0, with_proba=False, return_raw=False):
    extractor = Extractor(extractor=extractor_name)
    model = models[model]
    extracted_text = extractor.getTextBlocks(url=url)
    if len(extracted_text) > 0:
        func = model.predict_proba if with_proba else model.predict
        return func(extracted_text, min_words=min_words, return_raw=return_raw)
Example #28
def getBoilerPlate(url):
    #url = 'http://cnn.com/2016/07/17/health/south-africa-meerkat-telescope-galaxies/index.html'
    try:
        extractor = Extractor(extractor='ArticleExtractor', url=url)
        extracted_text = extractor.getText().replace('\n', '')
        return '', extracted_text
    except Exception:
        return '', ''
Example #29
 def ParserBoilerDefault(html_object):
     extractor = Extractor(extractor='DefaultExtractor', html=html_object)
     sents = extractor.getText()
     try:
         sents = list(nlp(sents).sents)
         return sents
     except Exception as e:
         return
Example #30
 def ParserBoilerArticle(html_object):
     extractor = Extractor(extractor='ArticleSentencesExtractor', html=html_object)
     sents = extractor.getText()
     try:
         sents = list(nlp(sents).sents)
         return sents
     except Exception as e:
         return