Example #1
def show_article():
    url_to_clean = request.args.get('url_to_clean')
    text_to_clean = request.args.get('text_to_clean')
    # if not url_to_clean:
    #     return redirect(url_for('index'))

    article = Article(url_to_clean)
    article.download()
    article.parse()

    try:
        html_string = ElementTree.tostring(article.clean_top_node)
    except Exception:
        html_string = "Error converting html to string."

    try:
        # Overwrite article.text with the submitted text before running NLP
        article.text = text_to_clean
        article.nlp()
    except Exception:
        log.error("Couldn't process with NLP")

    a = {
        'html': html_string,
        'authors': ', '.join(article.authors),
        'title': article.title,
        'text': article.text,
        'top_image': article.top_image,
        'videos': ', '.join(article.movies),
        'keywords': ', '.join(article.keywords),
        'summary': article.summary
    }
    return render_template('article/index.html', article=a, url=url_to_clean)
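This view assumes a Flask application, a logger, and newspaper3k's Article class; a minimal wiring sketch under those assumptions (the route and registration call are illustrative, not part of the original example):

import logging
from xml.etree import ElementTree

from flask import Flask, request, render_template, redirect, url_for
from newspaper import Article

app = Flask(__name__)
log = logging.getLogger(__name__)

# show_article() above could then be registered as a route, e.g.:
# app.add_url_rule('/article', 'show_article', show_article)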
Example #2
def extract_data(url, bert_summary):
    article = Article(url)
    print("article object created")
    article.download()
    print("download completed")
    article.parse()
    print("parsing completed")

    # newspaper does not always extract the text correctly from the HTML.
    # If the text was not extracted, fall back to an alternative extractor.
    if not article.text or len(article.text) < 100:
        print("looks like article text is not extracted")
        article.text = extractor.get_content(article.html)

    top_image = article.top_image
    title = article.title

    if bert_summary:
        print("extracting bert summary")
        summary = extract_bert_summary(article.text)
    else:
        print("extracting short summary")
        summary = extract_short_summary(article)

    return summary, top_image, title
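The helpers extract_bert_summary and extract_short_summary, and the fallback extractor object, are not part of this snippet. A minimal sketch of the non-BERT branch, assuming newspaper3k's built-in summarizer is acceptable (only the function name comes from the example above; the body is an assumption):

def extract_short_summary(article):
    # Assumed implementation: use newspaper3k's built-in keyword/summary step
    article.nlp()
    return article.summary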
Example #3
    def loadSummaryPage(self, url):

        article = Article(url)
        article.download()
        article.parse()

        article.nlp()
        article.text = article.summary

        article.text = article.text.replace("\n", "<br>")

        authors = ""
        for author in article.authors:
            authors = authors + author + ' '

        # Recreate the temporary HTML file from scratch
        with open('/tmp/somefile.html', 'w') as the_file:
            the_file.write('<!DOCTYPE html><html><head><meta ')
            the_file.write('charset="utf-8"><meta ')
            the_file.write(
                'name="viewport" content="width=device-width, initial-scale=1"><title>'
                + article.title.encode('ascii', 'ignore').decode('ascii') + '</')
            the_file.write(
                'title><style type="text/css">body{margin:40px auto;')
            the_file.write(
                'max-width:650px;line-height:1.6;font-size:18px;color:#444;padding:0 '
            )
            the_file.write(
                '10px}h1,h2,h3{line-height:1.2}</style></head><body><header><h1>'
            )
            the_file.write(article.title.encode('ascii', 'ignore').decode('ascii') + '</h1>')
            the_file.write('<aside>' + str(authors) + '</aside>')
            the_file.write('</header><br />')
            the_file.write('<img src="' + article.top_image +
                           '" width="100%">')

            the_file.write('<p><strong>Keywords: </strong><i>')
            # Write up to the first seven keywords, comma-separated
            the_file.write(', '.join(str(k) for k in article.keywords[:7]))
            the_file.write('</i></p>')
            the_file.write('<p>' + article.text.encode('ascii', 'ignore').decode('ascii') +
                           '</p>')
            the_file.write('</body>')

            url = QUrl('file:///tmp/somefile.html')
            self.webView.load(url)
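loadSummaryPage reads like a method of a Qt window that owns a webView widget. A minimal surrounding class, assuming PyQt5 with QWebEngineView (the original could equally target QWebView; the class and window names are assumptions):

import sys
from PyQt5.QtCore import QUrl
from PyQt5.QtWidgets import QApplication, QMainWindow
from PyQt5.QtWebEngineWidgets import QWebEngineView

class SummaryWindow(QMainWindow):
    def __init__(self):
        super().__init__()
        self.webView = QWebEngineView()
        self.setCentralWidget(self.webView)

    # loadSummaryPage(self, url) from the example above would be defined here

if __name__ == '__main__':
    app = QApplication(sys.argv)
    window = SummaryWindow()
    window.show()
    sys.exit(app.exec_())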
Example #4
def get_article_info(memento_url, dt, uri_id, base_dir):
    print(memento_url)
    article = Article(memento_url)
    html = get_uri_offline_data(dt, uri_id, "html", base_dir)
    article.download(html)
    article.parse()
    text = get_uri_offline_data(dt, uri_id, "txt", base_dir)
    if text is not None:
        article.text = text
    article.nlp()
    return article
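get_uri_offline_data is an external helper that returns previously archived HTML or text for a memento. A hypothetical sketch of such a cache reader (the on-disk layout is an assumption):

import os

def get_uri_offline_data(dt, uri_id, ext, base_dir):
    # Assumed cache layout: <base_dir>/<dt>/<uri_id>.<ext>
    path = os.path.join(base_dir, str(dt), '{}.{}'.format(uri_id, ext))
    if not os.path.isfile(path):
        return None
    with open(path, encoding='utf-8') as f:
        return f.read()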
Example #6
def make_content(url_list, news_content_list,  content_summarize_list, title_list):
    for url in url_list:
        try:
            kkma = Kkma()
            news = Article(url, language='ko')
            news.download()
            news.parse()
            title_list.append(news.title)
            # Re-segment the article text into sentences with Kkma, then rejoin
            news.text = " ".join(kkma.sentences(news.text))
            news_content_list.append(news.text)
            # print(news.text)
            # print(type(news.text))
            summary_content = summarize(news.text, word_count=100, ratio=0.5)
            if summary_content:
                content_summarize_list.append(summary_content)
            else:
                content_summarize_list.append("요약 할 기사의 내용이 없습니다.")
        except Exception as e:
            print("exception is", e)
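A usage sketch for make_content, assuming konlpy (Kkma), gensim < 4.0 (for gensim.summarization.summarize), and newspaper3k are installed; the URL is a placeholder:

from konlpy.tag import Kkma
from newspaper import Article
from gensim.summarization import summarize

urls = ['https://news.example.com/some-korean-article']  # placeholder
titles, contents, summaries = [], [], []
make_content(urls, contents, summaries, titles)
print(titles)
print(summaries)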
Example #7
def getSummary(url):
    article = Article(url)
    article.download()
    article.parse()
    doc = ''.join(
        [s for s in article.text.splitlines(True) if s.strip('\r\n')])
    # doc = re.sub('[^A-Za-z .-]+', ' ' , doc)
    doc = ' '.join(doc.split())
    doc = doc.replace('\n', ' ')
    doc = coref_resol(doc)
    doc = doc.replace('\n', ' ')
    article.text = doc
    article.nlp()
    return (article.summary.replace('\n', ' '))
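coref_resol is not defined in this snippet; one plausible implementation resolves coreferences with the neuralcoref library (which requires spaCy 2.x). This is an assumption about what the author intended:

import spacy
import neuralcoref

_nlp = spacy.load('en_core_web_sm')
neuralcoref.add_to_pipe(_nlp)

def coref_resol(text):
    # Replace pronouns and other mentions with the entities they refer to
    return _nlp(text)._.coref_resolved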
Example #8
def get_article_summary(url):
    article = Article(url)

    try:
        article.download()
    except Exception:
        return ''

    article.parse()

    # Strip characters that cannot be represented in ASCII
    article.text = article.text.encode('ascii', 'ignore').decode('ascii')

    return text_summarizer(article.text)
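text_summarizer is external to this snippet; a hypothetical stand-in using gensim's extractive summarizer (gensim < 4.0):

from gensim.summarization import summarize

def text_summarizer(text):
    # Hypothetical: return an extractive summary of roughly 20% of the input
    try:
        return summarize(text, ratio=0.2)
    except ValueError:
        # summarize() raises ValueError on input too short to sentence-split
        return text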
Example #9
def get_articles(urls):
    global site
    global error_list
    error_list = []
    for x in range(len(urls)):
        url1 = urls[x]
        # Simple text progress bar and percentage
        print('|{}{}|'.format(
            int(x / len(urls) * 10) * '-',
            (10 - int(x / len(urls) * 10)) * ' '))
        print('({}%)'.format(x / len(urls) * 100))
        try:
            art = Article(url1, language='fr')
            print('Downloading...')
            art.download()
            art.parse()
            print('Processing...')
            # The author name sits between the first two '|' separators in the page HTML
            html = str(art.html)
            tag1 = html.find('|')
            tag2 = html.find('|', tag1 + 1)
            name = html[tag1 + 2:tag2 - 1]
            if '<' in name:
                name = 'NaN'
            if ',' in name:
                name = name[:name.find(',')]

            # A leading "(City)" prefix, if present, is split off from the body text
            if art.text.startswith('('):
                city = art.text[art.text.find('(') + 1:art.text.find(')')]
                art.text = art.text[art.text.find(')') + 2:]
            else:
                city = 'NaN'
            print('Appending')
            site1 = {
                'LEN_TXT': len(art.text.split()),
                'JOURNAL': 'La Presse',
                'CITY': city,
                'URL': urls[x],
                'TEXTE': art.text,
                'TITRE': art.title,
                'DATE': '{}'.format(art.publish_date),
                'AUTHORS': name,
                'META-KEYWORDS': art.meta_keywords,
                'META-DESCRIPTION': art.meta_description
            }
            site['ID {}-{}'.format(art.publish_date, x)] = site1
        except Exception:
            # Keep track of URLs that failed to download or parse
            error_list.append(url1)
            continue
    return site
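get_articles expects a module-level site dict to exist before it is called; a minimal usage sketch (the URL is a placeholder):

from newspaper import Article

site = {}
urls = ['https://www.lapresse.ca/exemple-article.php']  # placeholder
result = get_articles(urls)
print(len(result), 'articles scraped,', len(error_list), 'errors')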
Example #10
def home(url):

    data = {}
    data['url'] = url

    # Validate url
    if urlparse.urlparse(url).scheme not in ('http', 'https'):
        data['error'] = 'Invalid URL'
        return json.dumps(data)

    a = Article(url)
    a.download()
    a.parse()

    data['title'] = a.title
    data['authors'] = a.authors
    data['text'] = a.text

    try:
        a.nlp()
    except UnicodeDecodeError:
        # Strip non-ascii characters
        a.title = to_ascii(a.title)
        a.text = to_ascii(a.text)
        a.nlp()

    # NLP
    data['summary'] = a.summary
    data['keywords'] = a.keywords
    data['tags'] = list(a.tags)

    # Media
    data['top_image'] = a.top_image
    data['images'] = list(a.images)  # ensure the image URLs are JSON-serializable
    data['movies'] = a.movies

    # Meta
    data['source_url'] = a.source_url
    data['published_date'] = str(a.publish_date) if a.publish_date else None

    data['meta_img'] = a.meta_img
    data['meta_keywords'] = a.meta_keywords
    data['meta_lang'] = a.meta_lang

    return json.dumps(data)
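The imports and the to_ascii helper this route relies on are not shown; a sketch, assuming Python 3 (to_ascii is hypothetical):

import json
from urllib import parse as urlparse  # makes urlparse.urlparse(...) work as written above
from newspaper import Article

def to_ascii(s):
    # Hypothetical helper: drop characters that cannot be represented in ASCII
    return s.encode('ascii', 'ignore').decode('ascii')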
Example #11
def get_article(url, config=Config()):
    pdf_defaults = {
        "application/pdf": "%PDF-",
        "application/x-pdf": "%PDF-",
        "application/x-bzpdf": "%PDF-",
        "application/x-gzpdf": "%PDF-"
    }
    article = Article(url,
                      request_timeout=20,
                      ignored_content_types_defaults=pdf_defaults,
                      config=config)
    article.download()
    # uncomment this if 200 is desired in case of bad url
    # article.set_html(article.html if article.html else '<html></html>')
    article.parse()
    if article.text == "" and article.html != "%PDF-":
        paper = build(url, memoize_articles=False, fetch_images=False)
        article.text = paper.description
    return article
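With the pdf_defaults mapping above, download() is meant to substitute the literal string "%PDF-" for PDF responses instead of fetching the document body, which is what the check after parse() relies on. The imports and a usage call, assuming newspaper3k (the URL is a placeholder):

from newspaper import Article, Config, build

article = get_article('https://example.com/whitepaper.pdf')  # placeholder URL
print(article.title, len(article.text))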
Example #12
    def build_news_article_from_url(source_url, sNLP):
        """build new article object from source url, if build fail would return None
        """
        try:
            print('starting to scrape url:', source_url)

            # pre-process news by NewsPaper3k and Boilerpipe library
            article = Article(source_url, keep_article_html=True)
            article.build()
            article.nlp()
            e = Extractor(extractor='DefaultExtractor', html=article.html)
            article.text = e.getText()
            article.article_html = e.getHTML()

            news_article = NewsArticle(article, sNLP)
            print('successfully scraped url:', source_url)
            return news_article
        except Exception as e:
            print('failed to scrape url:', source_url)
            print('reason:', e)
            return None
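The Extractor used here comes from the boilerpipe Python wrapper (a JPype bridge to the Java Boilerpipe library), and NewsArticle / sNLP are project-specific objects not shown in this snippet. The imports assumed above:

from newspaper import Article
from boilerpipe.extract import Extractor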
Example #13
def parse_article(url):
  '''
  Responsible for parsing a single article.
  '''
  article = Article(url)

  print("Download data of URL: {}".format(url))

  article.download()

  # Fallback, otherwise the program would exit on the first invalid URL
  try:
    article.parse()
  except newspaper.article.ArticleException:
    print("Oops! The URL '{}' seems inaccessible!".format(url))

    article.authors = ['<UNK>']
    article.text = '<UNK>'

    return article

  return article
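A usage sketch for parse_article over several URLs (the URLs are placeholders):

import newspaper
from newspaper import Article

urls = ['https://example.com/story-1', 'https://example.com/story-2']
for a in (parse_article(u) for u in urls):
    print(a.title or '<UNK>', '-', ', '.join(a.authors))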
Example #14
    df['url'][i] = all_articles['articles'][i]['url']
    df['imgurl'][i] = all_articles['articles'][i]['urlToImage']
    url = all_articles['articles'][i]['url']
    res = requests.get(url)
    html_page = res.content
    soup = BeautifulSoup(html_page, 'html.parser')
    text = soup.find_all(text=True)
    output = ''
    for t in text:
        if t.parent.name not in blacklist:
            output += '{} '.format(t)
    #translated = translator.translate(output,src='it',dest='en')
    article = Article(url)
    article.download()
    article.parse()
    article.text = output.strip()
    article.nlp()
    df['text'][i] = article.text
    df['summary'][i] = article.summary

for i in range(len(all_articles1['articles'])):
    df1['title'][i] = all_articles1['articles'][i]['title']
    df1['source'][i] = all_articles1['articles'][i]['source']['name']
    df1['url'][i] = all_articles1['articles'][i]['url']
    df1['imgurl'][i] = all_articles1['articles'][i]['urlToImage']
    url = all_articles1['articles'][i]['url']
    res = requests.get(url)
    html_page = res.content
    soup = BeautifulSoup(html_page, 'html.parser')
    text = soup.find_all(text=True)
    output = ''