Example #1
def retrieve_article(url):
    try:
        config = Configuration()
        config.fetch_images = False

        req = urllib.request.Request(
            url,
            headers={
                'User-Agent':
                "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.0.1) Gecko/20020919"
            })
        con = urllib.request.urlopen(req, timeout=10)
        html = ''.join([x for x in map(chr, con.read()) if ord(x) < 128])

        article = Article(url='', config=config)
        article.set_html(html)
        article.parse()
        text = ''.join([i if ord(i) < 128 else ' ' for i in str(article.text)])

        if len(text) < 300:
            article = Article(url='', config=config, language="id")
            article.set_html(html)
            article.parse()
            text = ''.join(
                [i if ord(i) < 128 else ' ' for i in str(article.text)])

        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        return text
    except Exception as e:
        print(e)
        return False
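A minimal usage sketch for retrieve_article(); the URL below is a hypothetical placeholder, and the False check mirrors the function's error return:

if __name__ == '__main__':
    # Hypothetical URL; retrieve_article() returns cleaned ASCII text or False on failure.
    story_text = retrieve_article('https://example.com/some-news-story')
    if story_text is not False:
        print(story_text[:200])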
Example #2
    def test_source_build(self):
        """
        builds a source object, validates it has no errors, prints out
        all valid categories and feed urls
        """
        DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
        BRAND = 'cnn'

        config = Configuration()
        config.verbose = False
        s = Source('http://cnn.com', config=config)
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC

        # For this test case and a few more, I don't believe you can actually
        # assert two values to equal each other because some values are ever changing.

        # Instead, I'm just going to print some stuff out so it is just as easy to take
        # a glance and see whether it looks OK.

        print('\t\tWe have %d articles currently!' % s.size())
        print()
        print('\t\t%s categories are: %s' % (s.url, str(s.category_urls())))
Example #3
    def test_source_build(self):
        """
        builds a source object, validates it has no errors, prints out
        all valid categories and feed urls
        """
        DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
        BRAND = "cnn"

        config = Configuration()
        config.verbose = False
        s = Source("http://cnn.com", config=config)
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC

        # For this test case and a few more, I don't believe you can actually
        # assert two values to equal each other because some values are ever changing.

        # Instead, I'm just going to print some stuff out so it is just as easy to take
        # a glance and see whether it looks OK.

        print("\t\tWe have %d articles currently!" % s.size())
        print()
        print("\t\t%s categories are: %s" % (s.url, str(s.category_urls())))
Example #4
def extract(results):
    try:
        config = Configuration()
        config.fetch_images = False

        req = urllib.request.Request(
            results["url"],
            headers={
                'User-Agent':
                "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.0.1) Gecko/20020919"
            })
        con = urllib.request.urlopen(req, timeout=10)
        html = ''.join([x for x in map(chr, con.read()) if ord(x) < 128])

        article = Article(url='', config=config)
        article.set_html(html)
        article.parse()
        text = ''.join([i if ord(i) < 128 else ' ' for i in str(article.text)])

        if len(text) < 300:
            article = Article(url='', config=config, language="id")
            article.set_html(html)
            article.parse()
            text = ''.join(
                [i if ord(i) < 128 else ' ' for i in str(article.text)])

        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')

        print("=", end='', flush=True)
        return (results["url"], results["title"], text, article.publish_date)
    except Exception as e:
        print(e)
        return (results["url"], results["title"], None, None)
Example #5
def newspaper_fulltext2(parser, language, url):
    '''
    This is a faster version of the function that uses some internal newspaper3k functions
    so that the lxml parse tree doesn't need to be recreated.
    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    config = Configuration()
    config.language = language
    config.keep_article_html = True
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)
    doc = parser
    doc = document_cleaner.clean(doc)
    doc = extractor.calculate_best_node(doc)
    if doc is not None:
        doc = extractor.post_cleanup(doc)
        text, html = output_formatter.get_formatted(doc)
    else:
        text = ''
        html = ''

    return {
        'value': {
            'text': text,
            'html': html,
        },
        'pattern': 'newspaper3k',
    }
Example #6
def newspaper_fulltext(parser, language):
    '''
    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    config = Configuration()
    config.language = language
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)
    doc = parser
    doc = document_cleaner.clean(doc)
    top_node = extractor.calculate_best_node(doc)
    if top_node is not None:
        top_node = extractor.post_cleanup(top_node)
        text, html = output_formatter.get_formatted(top_node)
    else:
        text = None
        html = None

    return {
        'value' : {
            'text' : text,
            'html' : html,
            },
        'pattern' : 'newspaper3k',
        }
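A short sketch of how newspaper_fulltext() might be called, assuming the page HTML has already been downloaded; the parser object is built the same way the examples above do it, via Configuration().get_parser(), and the markup literal is only a placeholder:

from newspaper.configuration import Configuration

# Placeholder markup; in practice this would be a full article page.
raw_html = '<html><body><p>Some article text for the extractor to score.</p></body></html>'
doc = Configuration().get_parser().fromstring(raw_html)
result = newspaper_fulltext(doc, 'en')
print(result['value']['text'])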
Example #7
    def test_meta_refresh_no_url_redirect(self):
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('ap_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'News from The Associated Press')
Example #8
    def test_meta_refresh_redirect(self):
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('google_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'Example Domain')
Example #9
    def test_meta_refresh_no_url_redirect(self):
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('ap_meta_refresh', 'html')
        article.download(html=html)
        article.parse()
        self.assertEqual(article.title, 'News from The Associated Press')
Example #10
    def test_meta_refresh_redirect(self):
        # TODO: We actually hit example.com in this unit test ... which is bad
        # Figure out how to mock an actual redirect
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('google_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'Example Domain')
Example #11
    def __init__(self):
        self.config = Configuration(
        )  # sets meta config for article and parser
        self.parser = self.config.get_parser()  # parser
        self.extractor = ContentExtractor(
            self.config
        )  # extracts info (author, tags, text, etc.) from parsed article
        self.doc_cleaner = DocumentCleaner(
            self.config)  # cleans unwanted tags and nodes from DOM
        self.formatter = OutputFormatter(
            self.config)  # outputs formatted text from parsed xpath nodes
Example #12
    def test_meta_refresh_redirect(self):
        # TODO: We actually hit example.com in this unit test ... which is bad
        # Figure out how to mock an actual redirect
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article('', config=config)
        html = mock_resource_with('google_meta_refresh', 'html')
        article.download(html=html)
        article.parse()
        self.assertEqual(article.title, 'Example Domain')
Example #13
def clean(html_content):
    config = Configuration()
    config.fetch_images = False

    # TODO: allow URL passing
    article = Article("http://example.com", config=config)
    article.set_html(html_content)
    article.is_downloaded = True
    article.parse()

    return article.text
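A hedged usage sketch for clean(); the markup literal is only a stand-in for a real downloaded page:

# Stand-in markup; real input would be a complete article page.
sample_html = '<html><body><article><p>Body text to keep.</p></article></body></html>'
print(clean(sample_html))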
Example #14
def html_to_article(content, language):
    content = content.strip()
    if not len(content):
        return ''

    config = NewspaperConfig()
    config.language = language

    doc = config.get_parser().fromstring(content.strip())
    if doc is None:
        return ''

    # Split block-level elements with newlines
    for tag in _BLOCKLEVEL_TAGS:
        if tag in _MEANINGLESS_TAGS:
            continue
        for node in doc.xpath('//{}'.format(tag)):
            node.append(etree.Element('br'))
            node.append(etree.Element('br'))

    # Initial cleanup
    cleaner = _NewspaperCleaner(config)
    doc = cleaner.clean(doc)

    # Best node estimation
    extractor = NewspaperExtractor(config)
    top = extractor.calculate_best_node(doc)
    if top is None:
        del doc, cleaner, extractor
        etree.clear_error_log()

        return ''

    top = extractor.post_cleanup(top)

    # Cleanup dummy nodes used for estimation
    for dummy in top.xpath("//p[@newspaper='dummy']"):
        dummy.getparent().remove(dummy)

    # Custom formatting to avoid unnecessary computations
    formatter = NewspaperFormatter(config)
    formatter.top_node = top
    formatter.remove_negativescores_nodes()
    content = formatter.convert_to_html()
    content = str(content).strip()
    content = unescape(content)

    del doc, top, cleaner, extractor, formatter
    etree.clear_error_log()

    return content
Example #15
def get_articles_c_tribune(complement):
    '''
    Given a string (complement) of the form 2011/01/01,
    get articles from the Chicago Tribune Archives.

    Inputs: a string called complement containing the date
            for a given day
    Returns: 
            info dictionary for that day
            writes csv file with nltk scores for complement

    '''
    c_tribune = 'http://articles.chicagotribune.com/'
    archive_url = c_tribune + complement + '/'
    articles = {}
    pm = urllib3.PoolManager()
    html = pm.urlopen(url=archive_url, method="GET").data
    soup = bs4.BeautifulSoup(html, 'lxml')
    #print(soup)
    tag_list = soup.find_all('h3')

    if tag_list:
        for index, tag in enumerate(tag_list):
            rv = {}
            articles[index] = rv
            article = c_tribune + tag.a['href']
            #print(article)
            config = Configuration()
            config.browser_user_agent = get_user_agent()
            article_object = Article(article)
            article_object.download()

            if article_object:
                article_object.parse()
                if 'Death Notice:' in article_object.title:
                    continue
                title = article_object.title
                #date = article_object.publish_date
                text = article_object.text
                rv['article'] = title
                rv['pub_date'] = complement
                rv['nltk_score'] = get_nltk_score(text)
                rv['nltk_score_title'] = get_nltk_score(title)
                rv['source'] = 'Chicago Tribune'

            write_csv_pro(
                articles,
                'chicago_tribune_' + re.sub("/", "_", complement) + '.csv')
Example #16
    def test_meta_refresh_no_url_redirect(self):
        config = Configuration()
        config.follow_meta_refresh = True
        article = Article(
            '', config=config)
        html = mock_resource_with('ap_meta_refresh', 'html')
        article.download(input_html=html)
        article.parse()
        self.assertEqual(article.title, 'News from The Associated Press')

    @print_test
    def test_pre_download_parse(self):
        article = Article(self.article.url)
        self.assertRaises(ArticleException, article.parse)
Example #17
def get_articles_pro(complement):
    '''
    Given a string (complement) of the form 2011/01/01,
    get articles for a given day from ProPublica
    Inputs:
            a string called complement containing the date
            for a given day
            propublica tag_type = 'div'
            propublica class_type = 'excerpt-thumb'
    Returns:
            Dictionary with articles for that day
            Writes csv files with nltk scores
    '''
    propublica = 'https://www.propublica.org/archive/'
    archive_url = propublica + complement + '/'
    articles = {}
    pm = urllib3.PoolManager()
    html = pm.urlopen(url=archive_url, method="GET").data
    soup = bs4.BeautifulSoup(html, 'lxml')
    tag_list = soup.find_all('div', class_='excerpt-thumb')

    if tag_list:
        for index, tag in enumerate(tag_list):
            rv = {}
            articles[index] = rv
            article = tag.a['href']
            print(article)
            config = Configuration()
            config.browser_user_agent = get_user_agent()
            article_object = Article(article)
            article_object.download()
            if article_object:
                article_object.parse()
                title = article_object.title
                #date = article_object.publish_date
                text = article_object.text
                rv['article'] = title
                rv['pub_date'] = complement
                rv['nltk_score'] = get_nltk_score(text)
                rv['nltk_score_title'] = get_nltk_score(title)
                rv['source'] = 'ProPublica'

        write_csv_pro(articles,
                      'propublica_' + re.sub("/", "_", complement) + '.csv')

    return articles
Example #18
    def test_source_build(self):
        """builds a source object, validates it has no errors, prints out
        all valid categories and feed urls"""

        DESC = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
        BRAND = 'cnn'

        configs = Configuration()
        configs.verbose = False
        s = Source('http://cnn.com', config=configs)
        s.clean_memo_cache()
        s.build()

        assert s.brand == BRAND
        assert s.description == DESC

        print('\t\tWe have %d articles currently!' % s.size())
Example #19
    def test_download_works(self):
        config = Configuration()
        config.memoize_articles = False
        slate_paper = newspaper.build('http://slate.com', config=config)
        tc_paper = newspaper.build('http://techcrunch.com', config=config)
        espn_paper = newspaper.build('http://espn.com', config=config)

        print ('slate has %d articles tc has %d articles espn has %d articles'
               % (slate_paper.size(), tc_paper.size(), espn_paper.size()))

        papers = [slate_paper, tc_paper, espn_paper]
        news_pool.set(papers, threads_per_source=2)

        news_pool.join()

        print('Downloaded slate mthread len', len(slate_paper.articles[0].html))
        print('Downloaded espn mthread len', len(espn_paper.articles[-1].html))
        print('Downloaded tc mthread len', len(tc_paper.articles[1].html))
Example #20
    def test_download_works(self):
        config = Configuration()
        config.memoize_articles = False
        slate_paper = newspaper.build('http://slate.com', config=config)
        tc_paper = newspaper.build('http://techcrunch.com', config=config)
        espn_paper = newspaper.build('http://espn.com', config=config)

        print('Slate has %d articles TC has %d articles ESPN has %d articles' %
              (slate_paper.size(), tc_paper.size(), espn_paper.size()))

        papers = [slate_paper, tc_paper, espn_paper]
        news_pool.set(papers, threads_per_source=2)

        news_pool.join()

        print('Downloaded Slate mthread len', len(slate_paper.articles[0].html))
        print('Downloaded ESPN mthread len', len(espn_paper.articles[-1].html))
        print('Downloaded TC mthread len', len(tc_paper.articles[1].html))
Example #21
def modified_fulltext(parser, language, url):
    '''
    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser
    '''
    url_parsed = urlparse(url)

    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    config = Configuration()
    config.language = language
    config.keep_article_html = True
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)

    doc = parser
    doc = rm_ads(doc,url_parsed.hostname)
    doc = clean(document_cleaner,doc)
    #doc = document_cleaner.clean(doc)
    doc = calculate_best_node(extractor,doc)
    #doc = extractor.calculate_best_node(doc)
    if doc is not None:
        #doc = extractor.add_siblings(doc)
        doc = post_cleanup(doc)
        #doc = extractor.post_cleanup(doc)
        text, html = get_formatted(doc)
        #text, html = output_formatter.get_formatted(doc)
    else:
        text = ''
        html = ''

    return {
        'value' : {
            'text' : text,
            'html' : html,
            },
        'pattern' : 'modified',
        }
Example #22
def fulltext(html, language='en'):
    """Takes article HTML string input and outputs the fulltext
    Input string is decoded via UnicodeDammit if needed
    """

    config = Configuration()
    config.language = language

    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = WithTagOutputFormatter(config)

    doc = config.get_parser().fromstring(html)
    doc = document_cleaner.clean(doc)

    top_node = extractor.calculate_best_node(doc)

    top_node = extractor.post_cleanup(top_node)
    text, article_html = output_formatter.get_formatted(top_node)
    return text, article_html
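A minimal sketch of calling fulltext() on a fetched page; requests and the URL are assumptions made for this example, and the unpacked pair mirrors the (text, article_html) return value:

import requests  # assumed to be available in the calling environment

response = requests.get('https://example.com/article')  # hypothetical article URL
text, article_html = fulltext(response.text, language='en')
print(text)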
Example #23
    def test_download_works(self):
        """
        """
        config = Configuration()
        config.is_memoize_articles = False
        slate_paper = newspaper.build('http://slate.com', config)
        tc_paper = newspaper.build('http://techcrunch.com', config)
        espn_paper = newspaper.build('http://espn.com', config)

        print('slate has %d articles tc has %d articles espn has %d articles'
              % (slate_paper.size(), tc_paper.size(), espn_paper.size()))

        papers = [slate_paper, tc_paper, espn_paper]
        news_pool.set(papers, threads_per_source=2)

        news_pool.join()

        print('Downloaded slate mthread len', len(slate_paper.articles[0].html))
        print('Downloaded espn mthread len', len(espn_paper.articles[-1].html))
        print('Downloaded tc mthread len', len(tc_paper.articles[1].html))
Example #24
def get_info(dictionary):
    '''
    Get information for all the articles
    for the selected sections in La Jornada
    Inputs:
            Dictionary with selected sections
            as keys and list of urls representing
            articles in every section
    Returns:
            A dictionary with nltk scores for title
            and text for every article in every section
    '''
    rv = {}
    count = 0
    for key, item in dictionary.items():
        for i in item:
            config = Configuration()
            config.browser_user_agent = get_user_agent()
            article = Article(i, language='es')
            article.download()
            if article.is_downloaded == True:
                irv = {}
                rv[count] = irv
                article.parse()
                count = count + 1
                title = article.title
                tr_title = mtranslate.translate(title, "en", "auto")
                #print(title, key, count)
                date = article.publish_date.date()
                text = article.text
                tr_text = translate_article(text)
                #if key not in rv:
                irv['article'] = tr_title
                irv['pub_date'] = date
                irv['nltk_score'] = get_nltk_score(
                    tr_text)  #will be converted into sentiment score
                irv['source'] = 'Jornada'
                irv['nltk_score_title'] = get_nltk_score(tr_title)
                #rv[key].append((title, date, text))
    return rv
Example #25
def ProcessArticle(urlStr, domain, htmlStr, cursor):
    config = Configuration()
    extractor = ContentExtractor(config)
    clean_doc = config.get_parser().fromstring(htmlStr)
    title = extractor.get_title(clean_doc)
    authors = extractor.get_authors(clean_doc)
    text = fulltext(htmlStr)

    text_keyws = list(nlp.keywords(text).keys())
    title_keyws = list(nlp.keywords(title).keys())

    keyws = list(set(title_keyws + text_keyws))
    summary_sents = nlp.summarize(title=title,
                                  text=text,
                                  max_sents=config.MAX_SUMMARY_SENT)
    summary = '\n'.join(summary_sents)

    if len(text) == 0:
        OnArticleProcessError(urlStr)
    else:
        StoreToDatabase(urlStr, domain, title, authors, text, keyws, summary,
                        cursor)
Example #26
    def __init__(self,
                 html: str,
                 url_to_visit: str,
                 scraped_from: str,
                 fake: bool,
                 conf: Optional[Configuration] = None,
                 do_nlp: bool = True):
        super().__init__(url='',
                         config=conf if conf is not None else Configuration())
        super().set_html(html)
        super().parse()

        if do_nlp:
            super().nlp()

        self.fake = fake
        self.url_to_visit: str = url_to_visit
        self.scraped_from: str = scraped_from
        self.soup: BeautifulSoup = utreq.soup_from_response(html)
        self.actual_url: str = url_to_visit
        self.__text_length: Optional[int] = None
        self.scraped_date = datetime.now()
Example #27
def get_data_from_html(html):
    result = {}
    parsed_html = Parser.fromstring(html)

    config = Configuration()
    extractor = ContentExtractor(config)
    formatter = OutputFormatter(config)
    cleaner = DocumentCleaner(config)

    result['title'] = extractor.get_title(parsed_html)

    publishing_date = extractor.get_publishing_date('', parsed_html)
    if publishing_date is None:
        publishing_date = datetime.datetime.now()

    result['published_at'] = publishing_date.isoformat()

    cleaned_html = cleaner.clean(parsed_html)
    top_node = extractor.calculate_best_node(cleaned_html)
    top_node = extractor.post_cleanup(top_node)
    result['content'], _ = formatter.get_formatted(top_node)

    return result
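A usage sketch under the assumption that the article page is fetched with requests; the URL is a placeholder, and the returned dict carries the 'title', 'published_at' and 'content' keys built above:

import requests  # assumed available; the URL below is a placeholder

page_html = requests.get('https://example.com/news/some-story').text
data = get_data_from_html(page_html)
print(data['title'], data['published_at'])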
Example #28
BLACKLIST_SUFFIX = [
    '.js',
    '.css',
    '.png',
    '.jpg',
    '.jpeg',
    '.pdf',
    '.ico',
    '.gif',
    '.m4a',
    '.woff2'
]
BLACKLIST_REGEX = [
    'http[s]?://(.*)signout(.*)'
]
NEWSPAPER_CONFIG = Configuration()
NEWSPAPER_CONFIG.fetch_images = False
NEWSPAPER_CONFIG.memoize_articles = False

class BaseCrawler:
    # Crawler Identifier
    crawler_id = 'com.base'

    # Rate limit configuration
    requests_per_sec = 1

    # robots.txt url
    robots_url = None

    # URLs of pages to crawl
    # start from
Example #29
    def setUp(self):
        self.extractor = newspaper.extractors.ContentExtractor(Configuration())
        self.parser = newspaper.parsers.Parser
Example #30
__author__ = "Vishal Jasrotia"
__copyright__ = ""
__credits__ = ["Vishal Jasrotia"]
__license__ = ""
__version__ = "1.0.0"
__maintainer__ = "Vishal Jasrotia"
__email__ = "*****@*****.**"
__status__ = "Testing"

from newsly.Builder import NewsBuilder
from newspaper.configuration import Configuration

if __name__ == "__main__":
    #pass config = Configuration()
    config = Configuration()
    config.memoize_articles = True  # True in production
    config.MAX_AUTHORS = 2
    config.MIN_WORD_COUNT = 300
    # config.MAX_SUMMARY = 900  # applies to the text, not the summary; don't use it
    # TODO: Have a separate ArticleConfig and SourceConfig extend this!

    builder = NewsBuilder(config)
    builder.build()
    builder.print_source_vs_article_url()
Example #31
class ArticleExtractionPipeline(object):
    def __init__(self):
        self.config = Configuration(
        )  # sets meta config for article and parser
        self.parser = self.config.get_parser()  # parser
        self.extractor = ContentExtractor(
            self.config
        )  # extracts info (author, tags, text, etc.) from parsed article
        self.doc_cleaner = DocumentCleaner(
            self.config)  # cleans unwanted tags and nodes from DOM
        self.formatter = OutputFormatter(
            self.config)  # outputs formatted text from parsed xpath nodes

    # right now basically only works for RT
    # params: doc is parsed html from self.parser
    def find_date_from_html(self, doc):
        # https://github.com/Webhose/article-date-extractor/blob/master/articleDateExtractor/__init__.py
        candidates = self.parser.getElementsByTag(doc, tag="time")  # add more
        times = []
        for candidate in candidates:
            time_string = candidate.text
            for indicator in ["Edited", "Updated", "Published"]:
                if indicator in time_string:
                    # indicator probably followed by "at" or ":", actual time is after that
                    if "at" in time_string:
                        time_string = time_string.split("at", 1)[1]
                    elif ":" in time_string:
                        time_string = time_string.split(":", 1)[1]
                    break
            time = self.datetime_from_str(time_string)
            if time:
                times.append(time)
        if times:
            return min(times)
        else:
            return None

    def datetime_from_str(self, datetime_string):
        try:
            return date_parser.parse(datetime_string).replace(
                tzinfo=None
            )  # otherwise can't compare naive and (timezone) offset-aware times
        except (ValueError, OverflowError, AttributeError, TypeError):
            return None

    # params: doc is parsed html from self.parser
    # TODO: generalize
    def get_date(self, url, doc):
        raw_date = (
            self.extractor.get_publishing_date(url, doc)
            or  # telesur, africanews
            self.extractor.get_meta_content(doc,
                                            "meta[name='LastModifiedDate']")
            or  # aljazeera, Sun, 07 January 2018 18:36:49 GMT
            self.extractor.get_meta_content(doc, "meta[name='Last-Modified']")
            or  # times of india, Jan 9, 2018, 05:18 IST
            self.extractor.get_meta_content(
                doc, "meta[property='og:updated_time']")
        )  # diplomat, "2018-01-05 23:22:46"
        if raw_date:
            return self.datetime_from_str(raw_date)
        else:
            return self.find_date_from_html(doc)

    # params: date is datetime object
    def recent_article(self, date, max_days_elapsed=3):
        return datetime.datetime.now() - date < datetime.timedelta(
            days=max_days_elapsed)

    def process_item(self, item, spider):
        doc = self.parser.fromstring(item["content"])

        item["title"] = self.extractor.get_title(doc)
        item["description"] = self.extractor.get_meta_description(doc)
        item["keywords"] = (self.extractor.get_meta_content(
            doc, "meta[name='news_keywords']")
                            or self.extractor.get_meta_keywords(doc))
        item["date"] = self.get_date(item["url"], doc)

        # drop item if no date
        if not item["date"] or not self.recent_article(
                item["date"], max_days_elapsed=7
        ):  # or not self.recent_article(item["date"])
            raise DropItem("Missing or invalid date for: {}".format(
                item["title"]))

        # clean:
        clean_doc = self.doc_cleaner.clean(doc)
        top_node = self.extractor.post_cleanup(
            self.extractor.calculate_best_node(clean_doc))
        item["content"] = self.formatter.get_formatted(top_node)[
            0]  # [1] returns html of article

        # drop item if article too short
        if len(item["content"]) < 600:
            raise DropItem("Not enough text: {}".format(item["title"]))

        logging.info("ARTICLE TITLE: {}".format(item["title"]))
        logging.info("\t time: {}".format(item["date"]))
        return item
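For context, a Scrapy item pipeline like the one above is switched on through the project settings; the dotted module path below is a hypothetical placeholder that depends on the actual project layout:

# settings.py (sketch)
ITEM_PIPELINES = {
    'newscrawler.pipelines.ArticleExtractionPipeline': 300,  # hypothetical module path
}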
Example #32
def modified_fulltext(parser, language):
    '''
    Adapted from https://github.com/codelucas/newspaper/blob/master/newspaper/api.py#L71
    but modified to use an already existing lxml parser
    '''
    from newspaper.cleaners import DocumentCleaner
    from newspaper.configuration import Configuration
    from newspaper.extractors import ContentExtractor
    from newspaper.outputformatters import OutputFormatter

    def calculate_best_node(self, doc):
        top_node = None
        cxpath_body_nodes = lxml.etree.XPath('(//pre)|(//p)|(//td)')
        #nodes_to_check = self.nodes_to_check(doc)
        starting_boost = float(1.0)
        #cnt = 0
        #i = 0
        parent_nodes = []
        nodes_with_text = []

        #for node in nodes_to_check:
        for node in cxpath_body_nodes(doc):
            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(language=self.language). \
                get_stopword_count(text_node)
            high_link_density = self.is_highlink_density(node)
            if word_stats.get_stopword_count() > 2 and not high_link_density:
                nodes_with_text.append(node)

        nodes_number = len(nodes_with_text)
        negative_scoring = 0
        bottom_negativescore_nodes = float(nodes_number) * 0.25

        #for node in nodes_with_text:
        for i,node in enumerate(nodes_with_text):
            boost_score = float(0)
            # boost
            if self.is_boostable(node):
                #if cnt >= 0:
                if i >= 0:
                    boost_score = float((1.0 / starting_boost) * 50)
                    starting_boost += 1
            # nodes_number
            if nodes_number > 15:
                if (nodes_number - i) <= bottom_negativescore_nodes:
                    booster = float(
                        bottom_negativescore_nodes - (nodes_number - i))
                    boost_score = float(-pow(booster, float(2)))
                    negscore = abs(boost_score) + negative_scoring
                    if negscore > 40:
                        boost_score = float(5)

            text_node = self.parser.getText(node)
            word_stats = self.stopwords_class(language=self.language). \
                get_stopword_count(text_node)
            upscore = int(word_stats.get_stopword_count() + boost_score)

            parent_node = self.parser.getParent(node)
            self.update_score(parent_node, upscore)
            self.update_node_count(parent_node, 1)

            if parent_node not in parent_nodes:
                parent_nodes.append(parent_node)

            # Parent of parent node
            parent_parent_node = self.parser.getParent(parent_node)
            if parent_parent_node is not None:
                self.update_node_count(parent_parent_node, 1)
                self.update_score(parent_parent_node, upscore / 2)
                if parent_parent_node not in parent_nodes:
                    parent_nodes.append(parent_parent_node)
            #cnt += 1
            #i += 1

        top_node_score = 0
        for e in parent_nodes:
            score = self.get_score(e)

            if score > top_node_score:
                top_node = e
                top_node_score = score

            if top_node is None:
                top_node = e
        return top_node

    config = Configuration()
    config.language = language
    extractor = ContentExtractor(config)
    document_cleaner = DocumentCleaner(config)
    output_formatter = OutputFormatter(config)
    doc = parser
    #doc = document_cleaner.clean(doc)
    top_node = calculate_best_node(extractor,doc)
    if top_node is not None:
        top_node = extractor.post_cleanup(top_node)
        text, html = output_formatter.get_formatted(top_node)
    else:
        text = None
        html = None

    return {
        'value' : {
            'text' : text,
            'html' : html,
            },
        'pattern' : 'modified',
        }
Example #33
from newspaper import NewsPool
from newspaper.configuration import Configuration
from difflib import SequenceMatcher
from urllib.parse import urlparse

ZIPS = 'zips'
blocklisted = ['http://www.legacy.com/']

with open('article_template.json') as file:
    article_template = json.load(file)

alog = open('article_log1.log', 'a')
slog = open('sources_log1.log', 'a')

## TODO do not fetch images
config = Configuration()
config.fetch_images = False


def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()


def scrape_source(source):
    try:
        news_source = newspaper.build(source['url'], config=config)
        store_articles(source, news_source)
    except Exception as e:
        slog.write('\n' + datetime.now().isoformat() + '\t' + str(e))

Example #34
def newspaper_config() -> Configuration:
    conf = Configuration()
    conf.MAX_TITLE = 500
    return conf
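A brief sketch of how the returned configuration might be used; the Article import and the URL are assumptions for illustration:

from newspaper import Article  # assumed import in the surrounding module

conf = newspaper_config()
article = Article('https://example.com/story-with-a-long-title', config=conf)  # hypothetical URL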