Example #1
    def wrap_newspaper(self, web_page):
        parser = NewspaperArticle(url=web_page.final_url)
        parser.html = web_page.html
        parser.is_downloaded = True
        parser.parse()

        return parser
Example #3
def parse_article(url, min_words_count=jg.MIN_WORDS_TO_SCRAPE):
    """ We download an article by ourselves so that we do it behind the Tor
    network and with a random user agent (Don't let Newspaper do it!).
    Then we fool Newspaper to think that it was the one who downloaded it so we
    can parse it and return the article.

    Returns None if the article is smaller than min_words_count.
    """

    try:
        response = get_page(url)
    except Exception as err:
        update_log.error('Error in get_page()')
        update_log.error(err)
        return None

    if response is not None:
        article = ArticleParser(url="http://something")
        article.html = response.content
        article.download_state = 2

        try:
            article.parse()
        except Exception as err:
            update_log.error('Error in article.parse()')
            update_log.error(err)
            return None
        else:
            add_url_to_blacklist(url)
            if len(article.text.split(' ')) >= min_words_count:
                return article

    return None
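
The get_page() helper used above is not part of this snippet; per the docstring it fetches the page through Tor with a random user agent. A minimal sketch of what such a helper could look like, assuming a local Tor SOCKS proxy on 127.0.0.1:9050 and a hand-made user-agent list (both are assumptions, and socks5h proxies require requests[socks]/PySocks):

import random
import requests

# Assumed values: the real project's proxy settings and user-agent pool are not shown.
TOR_PROXIES = {
    "http": "socks5h://127.0.0.1:9050",
    "https": "socks5h://127.0.0.1:9050",
}
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0",
]

def get_page(url, timeout=10):
    """Fetch a page through Tor with a random user agent."""
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    response = requests.get(url, headers=headers, proxies=TOR_PROXIES, timeout=timeout)
    response.raise_for_status()
    return response
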
Example #4
def today_helper(article_num, body, article_total):
    try:
        article_name = "article_{:04d}".format(article_num)
        print(">>> Processing: {}/{}".format(article_num, article_total),
              end='\r')

        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(body), parser).getroot()
        result = ''

        for p in tree.iter('p'):
            result += etree.tostring(p, method="html").decode("utf-8")

        # feed the extracted paragraphs to newspaper directly instead of downloading
        article = Article('')
        article.html = result
        article.download_state = 2  # mark as downloaded so parse() accepts the manual html
        article.parse()

        body = article.text
        words = body.split()
        text = ' '.join(words)

    except Exception:
        print(">>> Broken link, skipping URL {}".format(article_num))
        return

    out_dir = data_path + config['scraping']['folder']
    filename = "{}{}.txt".format(out_dir, article_name)

    with open(filename, 'w') as f:
        f.write(text)
Example #5
    def _get_content_from_url(self, url):
        """Takes a single url and returns the article content and title"""
        #r = requests.get(url)

        try:
            r = requests.get(url, timeout=6)
            # print('successful!')
        except requests.exceptions.Timeout as e:
            # Maybe set up for a retry
            print(e)
            return ' ', ' '
        except requests.exceptions.RequestException as e:
            print(e)
            return ' ', ' '

        # save to file
        with open('file.html', 'wb') as fh:
            fh.write(r.content)
        #print('Running Article...')
        a = Article(url)

        # set html manually
        with open("file.html", 'rb') as fh:
            a.html = fh.read()
        #print('Done opening Article.html...')
        # need to set download_state to 2 for this to work
        a.download_state = 2

        a.parse()
        
        title = a.title
        content = re.sub("\n\n"," ",a.text)
        # Now the article should be populated
        return content, title
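
The literal 2 assigned to download_state throughout these examples is the value of ArticleDownloadState.SUCCESS in newspaper3k; the named constant reads better where the import is available. A minimal, self-contained illustration:

from newspaper import Article
from newspaper.article import ArticleDownloadState

a = Article('')
a.html = "<html><body><p>Some pre-fetched text.</p></body></html>"
a.download_state = ArticleDownloadState.SUCCESS  # same effect as a.download_state = 2
a.parse()
print(a.text)
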
Example #6
def run_newspaper(htmlstring):
    '''try with the newspaper module'''
    ## does not work!
    myarticle = Article('https://www.example.org/test/')
    myarticle.html = htmlstring
    myarticle.download_state = ArticleDownloadState.SUCCESS
    myarticle.parse()
    if myarticle.publish_date is None:
        return None
    date = convert_date(myarticle.publish_date, '%Y-%m-%d %H:%M:%S', '%Y-%m-%d')
    return date
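
The convert_date() helper used here (and in Example #8) is not shown. Judging only from the call site, it reformats a publish date into '%Y-%m-%d'; a minimal sketch under that assumption:

from datetime import datetime

def convert_date(date_value, in_format, out_format):
    """Guess at the missing helper: reformat a date string or datetime object."""
    if isinstance(date_value, datetime):
        return date_value.strftime(out_format)
    return datetime.strptime(date_value, in_format).strftime(out_format)
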
Example #7
def newspaper_test():
    from newspaper import fulltext, Article
    for i in range(0, 1000):
        input_filename = 'page/' + str(i) + '.txt'
        output_filename = 'newspaper/' + str(i) + '.txt'
        input_file = open(input_filename, 'r')
        s = input_file.read()
        input_file.close()
        a = Article('', language='zh')  # Article() requires a url argument
        a.html = s
        a.download_state = 2  # mark as downloaded so parse() accepts the manual html
        a.parse()
        # print(a.text)
        input('wait')
Example #8
def run_newspaper(htmlstring):
    '''try with the newspaper module'''
    # throws error on the eval_default dataset
    try:
        myarticle = Article(htmlstring)
    except (TypeError, UnicodeDecodeError):
        return None
    myarticle.html = htmlstring
    myarticle.download_state = ArticleDownloadState.SUCCESS
    myarticle.parse()
    if myarticle.publish_date is None or myarticle.publish_date == '':
        return None
    return convert_date(myarticle.publish_date, '%Y-%m-%d %H:%M:%S',
                        '%Y-%m-%d')
Example #9
def extract_data(url, bert_summary):
    article = Article(url)
    print("article object created")
    article.download()
    if article.download_state != ArticleDownloadState.SUCCESS:
        article.html = urllib.request.urlopen(url).read()
        # Hacking the library
        article.download_state = ArticleDownloadState.SUCCESS
    print("download completed")
    article.parse()
    print("parsing completed")

    top_image = article.top_image
    title = article.title

    if bert_summary:
        print("extracting bert summary")
        summary = extract_bert_summary(article.text)
    else:
        print("extracting short summary")
        summary = extract_short_summary(article)

    return summary, top_image, title
Example #10
def cna_helper(article_num, url, article_total):
    try:
        article_name = "article_{:04d}".format(article_num)
        print(">>> Processing: {}/{}".format(article_num, article_total),
              end='\r')

        article = Article(url, "english")
        article.download()

        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(article.html), parser).getroot()
        result = ''

        for div in tree.iter('div'):
            if 'class' in div.attrib and 'c-rte--article' in div.attrib[
                    'class']:
                for p in div.iter('p'):
                    result += etree.tostring(p, method="html").decode("utf-8")
                break

        article.html = result
        article.parse()

        body = article.text
        words = body.split()
        text = ' '.join(words)

    except Exception:
        print(">>> Broken link, skipping URL {}".format(article_num))
        return

    out_dir = data_path + config['scraping']['folder']
    filename = "{}{}.txt".format(out_dir, article_name)

    with open(filename, 'w') as f:
        f.write(text)
Example #11
def get_text_date(url):
    try:
        article = Article(url)
        article.download()
        article.html = re.sub(r"\n+", " ", article.html)
        article.html = re.sub(
            r"<blockquote class=\"twitter-tweet\".+?</blockquote>", "",
            article.html)
        article.html = re.sub(
            r"<blockquote class=\"instagram-media\".+?</blockquote>", "",
            article.html)
        article.html = re.sub(
            r"<blockquote class=\"tiktok-embed\".+?</blockquote>", "",
            article.html)
        article.html = re.sub(r"<blockquote cite=\".+?</blockquote>", "",
                              article.html)
        #article.html = re.sub(r"<h2 class=\"mce\">&middot.+?</p>", "", article.html) # subtitulares de vertele
        article.html = re.sub(r"<figcaption.+?</figcaption>", "", article.html)
        article.parse()
        return article.text, article.publish_date
    except newspaper.article.ArticleException:
        return None, None
Example #12
def get_text_date(url):
    try:
        article = Article(url)
        article.download()
        if "Noticia servida automáticamente por la Agencia EFE" in article.html:
            return None, None
        article.html = re.sub(r"\n+", " ", article.html)
        article.html = re.sub(
            r"<blockquote class=\"twitter-tweet\".+?</blockquote>", "",
            article.html)
        article.html = re.sub(
            r"<blockquote class=\"instagram-media\".+?</blockquote>", "",
            article.html)
        article.html = re.sub(
            r"<blockquote class=\"tiktok-embed\".+?</blockquote>", "",
            article.html)
        article.html = re.sub(r"<blockquote cite=\".+?</blockquote>", "",
                              article.html)
        #article.html = re.sub(r"<h2 class=\"mce\">&middot.+?</p>", "", article.html) # subtitulares de vertele
        article.html = re.sub(r"<figcaption.+?</figcaption>", "", article.html)
        article.html = re.sub(
            r"<p><em>Si alguien te ha reenviado esta carta.+?</em></p>", "",
            article.html)  # Matrioska de verne
        article.html = re.sub(
            r"<p class=\"\">(<b>)?Información sobre el coronavirus(</b>)?.+?ante la enfermedad</a></p>",
            "", article.html)  # El Pais nuevo pie coronavirus
        article.html = re.sub(
            r"<p class=\"\">(<b>)?Información sobre el coronavirus(</b>)?.+?sobre la pandemia.*?</p>",
            "", article.html)  # El Pais viejo pie coronavirus
        article.html = re.sub(r"<p class=\"\">.*?Suscríbase aquí.*?</p>", "",
                              article.html)  # newsletter El País
        article.html = re.sub(r"<a[^>]+>Apúntate a .*?</a>", "",
                              article.html)  # newsletter 20 minutos
        article.html = re.sub(r"<p[^>]+>Apúntate a .*?</p>", "",
                              article.html)  # newsletter 20 minutos
        article.html = re.sub(
            r"<span class=\"datos-articulo\".+?</div><p class=\"enviaremailerr captcha\">",
            "", article.html)
        article.html = re.sub(r"<aside class=\"modulo temas\".+?</aside>", "",
                              article.html)
        article.html = re.sub(r"Si quieres seguir recibiendo.+?</p>", "",
                              article.html)
        article.html = re.sub(r"<p class=\"siguenos_opinion\">.+?</p>", "",
                              article.html)
        article.html = re.sub(r"<p><a.+?<em>playlists</em> de EL PAÍS</a></p>",
                              "", article.html)
        article.html = re.sub(r"<section class=\"more_info .+?</section>", "",
                              article.html)
        article.html = re.sub(r"<span class=\"EPS-000.+?eps</span>", "",
                              article.html)
        article.html = re.sub(
            r"<span class=\"f_a | color_black uppercase light.+?</span>", "",
            article.html)
        article.html = re.sub(r"<i>Puedes seguir a .+?[nN]ewsletter.?</i>", "",
                              article.html)  # pie de Materia
        article.html = re.sub(r"Puedes seguir a .+?(<i>)? *[nN]ewsletter</a>",
                              "", article.html)  # pie de Materia
        article.html = re.sub(
            r"<i>Puedes seguir a .+?(<i>)? *[nN]ewsletter</i></a>", "",
            article.html)  # pie de Materia
        article.html = re.sub(
            r"<i>Puedes escribirnos a .+?[Nn]ewsletter</i></a>", "",
            article.html)  # pie de Materia nuevo
        article.html = re.sub(r"<p><em><strong>¿Nos ayudas?.+?</p>", "",
                              article.html)  # Kiko Llaneras
        article.html = re.sub(
            r"<p class=\"nota_pie\".+?a nuestra <em>newsletter</em>\.?(</span>)*</p>",
            "", article.html)  # Planeta Futuro footer
        article.html = re.sub(
            r"<i>Puedes escribirnos a.+?<i>[nN]ewsletter</i></a>", "",
            article.html)  # Materia footer
        article.html = re.sub(r"<p class="
                              "><i>Puedes escribirnos a.+?</p>", "",
                              article.html)
        article.html = re.sub(
            r"<i>Lee este y otros reportajes.+?con EL PAÍS.</i>", "",
            article.html)  # Buenavida EL PAÍS footer
        article.html = re.sub(
            r"<h3 class=\"title-related\">.+?</div>", "",
            article.html)  # related-news block in El Confi
        article.html = re.sub(
            r"<button.+?</button>", "",
            article.html)  # share buttons (elpais icon)
        article.html = re.sub(r"<p class=\"g-pstyle.+?</p>", "", article.html)
        article.html = re.sub(r"<p class=\"nota_pie\">.+?</p>", "",
                              article.html)
        article.html = re.sub(r"<strong>Apúntate a la .+?</strong>", "",
                              article.html)
        article.html = re.sub(r"<p><strong>O súmate a .+?</strong></p>", "",
                              article.html)
        #article.html = re.sub(r"<h2.*?>¿En qué se basa todo esto\?</h2>.*</div>", "", article.html)
        article.html = re.sub(
            r"<strong>M&aacute;s en tu mejor yo</strong>: <a.*?</a>", "",
            article.html)
        article.html = re.sub(r"<p class=\"article-text\"> +<a.*?</a>", "",
                              article.html)
        article.html = re.sub(
            r"<span>Este sitio web utiliza cookies propias.+?</span>", "",
            article.html)
        article.html = re.sub(r"\[LEER MÁS:.+?\]", "", article.html)
        article.html = re.sub(r"<div id=\"post-ratings-.+?Cargando…</div>", "",
                              article.html)  # rating EFE
        article.html = re.sub(
            r"<div id=\"div_guia\" class=\"guia\" itemprop=\"alternativeHeadline\">.+?</div>",
            "", article.html)  # EFE subheading
        article.html = re.sub(
            r"<div class=\"f f__v video_player.+?</div></div></div>", "",
            article.html)
        article.html = article.html.replace("<em class=\"mce\">", "<em>")
        article.html = re.sub("([^ ])<em>", "\g<1> <em>", article.html)
        article.html = article.html.replace("<em> ", "<em>")
        article.html = re.sub("([^ ])<i>", "\g<1> <i>", article.html)
        article.html = article.html.replace("<i> ", "<i>")
        article.html = article.html.replace(" </em>", "</em>")
        #article.html = re.sub("</em>([^ \W])", "</em> \g<1>", article.html)
        article.html = re.sub("</em>([^\s\.,;:])", "</em> \g<1>", article.html)
        article.html = article.html.replace(" </i>", "</i>")
        article.html = re.sub("</i>([^\s\.,;:])", "</i> \g<1>", article.html)
        article.html = article.html.replace("<em>", "'")
        article.html = article.html.replace("</em>", "'")
        article.html = article.html.replace("<i>", "'")
        article.html = article.html.replace("</i>", "'")
        article.parse()
        """
		if article.meta_description:
			article.text = article.meta_description + "\n\n" + article.text
		"""
        return article.text, article.publish_date
    except newspaper.article.ArticleException:
        return None, None
Example #13
def scrape(url):
    """
    Scrapes an article from the 'url' and extracts metadata using the Newspaper3k package
    
    Parameters:
    --------
    url         : str, url to scrape
    
    Returns:
    --------
    doc         : dict,
        {
            'url'      : url,
            'datetime' : article publish_date,
            'title'    : article title,
            'text'     : article cleaned_text,
            'keywords' : article meta_keywords,
            'summary'  : article summary
        }
    False       : bool, if the GET request fails or the html is shorter than 500 characters
    """
    from newspaper import Article, Config
    import re

    logger.info(f"SCRAPE: trying {url}")
    config = Config()
    config.memoize_articles = False
    config.fetch_images = False
    config.language = 'en'
    config.browser_user_agent = get_ua()
    config.request_timeout = 5
    config.number_threads = 8

    response = get_html_from_url(url)

    if response['status_code'] and response['html']:
        try:
            article = Article(url=url, config=config)
            article.download_state = 2
            article.html = response['html']
            article.parse()
            article.nlp()

            words_count = len((article.text).split())

            if words_count > 200:
                logger.info(
                    f'SCRAPE: Extracted TEXT from URL: {url}\n Title: "{article.title}"'
                )
                return {
                    'url': url,
                    'datetime': article.publish_date,
                    'title': article.title,
                    'text': " ".join(re.split(r'[\n\t]+', article.text)),
                    'keywords': article.keywords,
                    'summary': article.summary
                }
            else:
                logger.info(f'''SCRAPE: Could not extract TEXT from {url}\n 
                    Article too short: {words_count} words''')
        except Exception as e:
            logger.info(
                f'SCRAPE: Could not extract TEXT from {url}\n Error: {e}')
    else:
        logger.info(f'SCRAPE: Could not extract TEXT from {url}')
    return False
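
scrape() depends on two helpers that are not included here: get_ua(), which supplies a browser user-agent string, and get_html_from_url(), which (from the way it is used and from the docstring) returns a dict with 'status_code' and 'html' keys and rejects very short pages. A rough sketch of both, purely as an assumption about their shape:

import random
import requests

# Hypothetical stand-ins; the real helpers in the source project may differ.
_UA_POOL = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15",
]

def get_ua():
    """Return a random user-agent string."""
    return random.choice(_UA_POOL)

def get_html_from_url(url, timeout=5):
    """Fetch a page and return {'status_code': ..., 'html': ...}."""
    try:
        r = requests.get(url, headers={"User-Agent": get_ua()}, timeout=timeout)
    except requests.RequestException:
        return {'status_code': None, 'html': None}
    if r.status_code != 200 or len(r.text) < 500:  # docstring: html < 500 is rejected
        return {'status_code': r.status_code, 'html': None}
    return {'status_code': r.status_code, 'html': r.text}
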
Example #14
    #cast string to list
    source_list = ast.literal_eval(e['source_list'])

    # find the position of o_url in the list (we will need it to retrieve the correct .html)
    o_idx = source_list.index(o_url)
    a = Article(o_url)

    #finds the html file
    article_alias = a_url.rstrip("/").split("/")[-1]
    article_folder = html_folder+"/"+article_alias
    o_html_filename = article_folder+"/"+str(o_idx)+".html"

    # set html manually
    with open(o_html_filename, 'rb') as fh:
        a.html = fh.read()
        # need to set download_state to 2 for this to work
        a.download_state = 2
        a.parse()
        # Now the article should be populated
        print(a.text)


gold_df.to_csv(cwd+"/datasetVeritas3.csv", index=False)

print("average number of annotations per doc:", sum(lenlen)/len(lenlen))
lenlen.sort(reverse=True)
print(lenlen[:200])
print("max num of annotations on the same source")
print(max(lenlen))
print("NEW")
Example #15
    def parse_article(self, response):
        news_id = 19684  #response.meta.get('news_id')

        # save to file
        with open(str(news_id) + '.html', 'wb') as fh:
            fh.write(response.body)
        article = Article(response.url)
        # set html manually
        with open(str(news_id) + '.html', 'rb') as fh:
            article.html = fh.read()
        os.remove(str(news_id) + '.html')
        # need to set download_state to 2 for this to work
        article.download_state = 2
        article.parse()
        article.nlp()
        date = article.publish_date
        keywords = str([x.replace("'", "''")
                        for x in article.keywords]).replace('"', '\'')
        content = article.text.replace("'", "''")
        summary = article.summary.replace("'", "''")
        title = article.title.replace("'", "''")
        if date is None:
            date = 'null'
        else:
            date = "'" + str(date) + "'"
        authors = str([x.replace("'", "''")
                       for x in article.authors]).replace('"', '\'')
        tags = str([x.replace("'", "''")
                    for x in article.meta_keywords]).replace('"', '\'')

        dbconnector.execute(
            self.conn,
            'INSERT INTO "ParsedNews-newspaper"("IDNews", "Date", "Content", "Keywords", '
            + '"Summary", "Authors", "Tags", "Title") ' + 'VALUES (' +
            str(news_id) + ', ' + str(date) + ', \'' + content + '\', ARRAY ' +
            str(keywords) + '::text[], \'' + summary + '\', ARRAY ' +
            str(authors) + '::text[], ARRAY ' + str(tags) + '::text[], \'' +
            title + '\')')

        # get main article without comments
        content = extract_content(response.text).replace("'", "''")

        # get article and comments
        content_comments = '[\'' + extract_content_and_comments(
            response.text).replace("'", "''") + '\']'

        dbconnector.execute(
            self.conn,
            'INSERT INTO "ParsedNews-dragnet"("IDNews", "Content", "Comments") '
            + 'VALUES (' + str(news_id) + ', \'' + content + '\', ARRAY ' +
            str(content_comments) + '::text[])')

        date = articleDateExtractor.extractArticlePublishedDate(
            articleLink=response.url, html=response.text)
        if date is not None:
            dbconnector.execute(
                self.conn, 'INSERT INTO "ParsedNews-ade"("IDNews", "Date") ' +
                'VALUES (' + str(news_id) + ', \'' + str(date) + '\')')

        g = Goose()
        article = g.extract(raw_html=response.text)
        date = article.publish_datetime_utc
        keywords = str([x.replace("'", "''")
                        for x in article.tags]).replace('"', '\'')
        content = article.cleaned_text.replace("'", "''")
        summary = article.meta_description.replace("'", "''")
        title = article.title.replace("'", "''")
        if date is None:
            date = 'null'
        else:
            date = "'" + str(date) + "'"
        authors = str([x.replace("'", "''")
                       for x in article.authors]).replace('"', '\'')
        tags = str([
            x.replace("'", "''") for x in article.meta_keywords.split(",")
        ]).replace('"', '\'')
        tweets = str([x.replace("'", "''")
                      for x in article.tweets]).replace('"', '\'')

        dbconnector.execute(
            self.conn, 'INSERT INTO "ParsedNews-goose"(' +
            '"IDNews", "Date", "Content", "Keywords", "Summary", ' +
            '"Authors", "Tags", "Tweets",' + '"Title") VALUES (' +
            str(news_id) + ', ' + date + ', \'' + content + '\', ARRAY ' +
            str(keywords) + '::text[], \'' + str(summary) + '\', ARRAY ' +
            str(authors) + '::text[], ARRAY ' + str(tags) +
            '::text[], ARRAY ' + str(tweets) + '::text[], \'' + str(title) +
            '\')')

        pass
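
The INSERT statements above build SQL by concatenating strings and hand-escaping single quotes. If the connection behind the dbconnector wrapper is a regular PostgreSQL (psycopg2) connection, parameterized queries avoid both the manual escaping and the injection risk; this is only a sketch under that assumption, shown for the first INSERT:

def insert_parsed_news(conn, news_id, article):
    """Hypothetical parameterized version of the 'ParsedNews-newspaper' INSERT.

    Assumes conn is a psycopg2 connection; psycopg2 maps Python lists to
    PostgreSQL arrays and None to NULL, so no manual quoting is needed.
    """
    with conn.cursor() as cur:
        cur.execute(
            'INSERT INTO "ParsedNews-newspaper" '
            '("IDNews", "Date", "Content", "Keywords", "Summary", "Authors", "Tags", "Title") '
            'VALUES (%s, %s, %s, %s, %s, %s, %s, %s)',
            (news_id, article.publish_date, article.text, article.keywords,
             article.summary, article.authors, article.meta_keywords, article.title))
    conn.commit()
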
Example #16
import bz2
import re
from pymongo import MongoClient
from newspaper import Article

client = MongoClient()

db_articles = client.news.articles
db_web_cache = client.news.web_cache

docs = db_articles.find()

for doc in docs:
    print(doc['_id'])

    if not doc['page']:
        continue

    url = doc['page']['urls'][0]
    web_cache_doc = db_web_cache.find_one({'url': url})

    if 'html_compressed' in web_cache_doc:
        article = Article(url=url)
        article.html = bz2.decompress(web_cache_doc['html_compressed'])
        article.is_downloaded = True
        article.parse()

        doc['page']['text'] = article.text
        print(len(doc['page']['text']))

        db_articles.save(doc)
Example #18
def fetch_main_content(html: str) -> Article:
    a = Article(url='')
    a.html = html
    a.download_state = 2
    a.parse()
    return a
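
For context, a caller would fetch the HTML separately and hand it to fetch_main_content(); the URL below is only a placeholder:

import requests

html = requests.get("https://www.example.org/some-article", timeout=10).text
article = fetch_main_content(html)
print(article.title)
print(article.text[:200])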