Example 1
    def test_summary_sentences(self):
        # Check that we can break the summary
        # back into its original sentences.

        from argos.core.brain import summarizer
        title = 'Syria Misses New Deadline as It Works to Purge Arms'
        text = 'Syria missed a revised deadline on Sunday for completing the export or destruction of chemicals in its weapons arsenal, but the government of the war-ravaged country may be only days away from finishing the job, according to international experts overseeing the process. The Syrian government had agreed to complete the export or destruction of about 1,200 tons of chemical agents by April 27 after missing a February deadline, but by Sunday, it had shipped out or destroyed 92.5 percent of the arsenal, said Sigrid Kaag, the coordinator of the joint mission by the United Nations and the watchdog agency the Organization for the Prohibition of Chemical Weapons.'
        expected_sents = summarizer.summarize(title, text)

        source = Source()
        source.name = 'Super Cool Times'

        article = Article(title=title, text=text, score=100)
        article.source = source
        article.ext_url = 'http://foo.com'

        self.event = Event([article])

        expected = [{
            'sentence': sent,
            'source': 'Super Cool Times',
            'url': 'http://foo.com'
        } for sent in expected_sents]

        self.assertEqual(self.event.summary_sentences, expected)
Example 2
def article(num=1, **kwargs):
    args = [{
        'title': 'Dinosaurs',
        'text': 'Dinosaurs are cool, Clinton',
        'score': 100
    }, {
        'title': 'Robots',
        'text': 'Robots are nice, Clinton',
        'score': 100
    }, {
        'title': 'Mudcrabs',
        'text': 'Mudcrabs are everywhere, Clinton',
        'score': 100
    }]

    if not kwargs:
        a_s = [Article(**args[i]) for i in range(num)]
    else:
        a_s = [Article(**kwargs) for i in range(num)]

    save(a_s)

    if len(a_s) == 1:
        return a_s[0]
    return a_s
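
A usage sketch for this factory (hypothetical calls; Article and save come from the surrounding test helpers):

# One article built from the first default argument set:
a = article()

# A list of two articles ('Dinosaurs' and 'Robots'):
pair = article(num=2)

# Three articles sharing the same custom attributes:
clones = article(num=3, title='Mudcrabs', text='Mudcrabs are everywhere, Clinton', score=50)

Note that when keyword arguments are given, every article in the batch shares them; only the default argument sets vary per index.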
Example 3
    def test_event_does_not_cluster_not_similar(self):
        self.prepare_event()
        article = Article(title='Superstars',
                          text='superstars are awesome, Clinton',
                          created_at=datetime.utcnow())
        Event.cluster([article])
        self.assertEqual(len(self.cluster.members), 2)
Example 4
    def test_event_no_matching_cluster_creates_new_cluster(self):
        article = Article(title='Superstars',
                          text='superstars are awesome, Clinton',
                          created_at=datetime.utcnow())
        Event.cluster([article])

        self.assertEqual(Event.query.count(), 1)
Example 5
    def test_event_timespan(self):
        text = 'the worldly philosophers today cautious optimism is based to a large extent on technological breakthroughs'
        members = [
            Article(title='A',
                    text=text,
                    created_at=datetime(2014, 1, 20, 1, 1, 1, 111111)),
            Article(title='B',
                    text=text,
                    created_at=datetime(2014, 1, 22, 1, 1, 1, 111111)),
            Article(title='C',
                    text=text,
                    created_at=datetime(2014, 1, 24, 1, 1, 1, 111111))
        ]
        self.cluster = Event(members)
        results = self.cluster.timespan(datetime(2014, 1, 21, 1, 1, 1, 111111))
        self.assertEqual(len(results), 2)
        self.assertEqual({r.title for r in results}, {'B', 'C'})
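
Consistent with this assertion, timespan presumably filters members by created_at; a minimal sketch of such a method (an assumption, not necessarily the actual argos implementation):

from datetime import datetime

def timespan(self, start, end=None):
    # Keep members created after `start`, up to `end` (defaults to now).
    end = end or datetime.utcnow()
    return [m for m in self.members if start < m.created_at <= end]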
Example 6
    def test_event_entitize(self):
        members = [
            Article(title='Robots', text='dinosaurs are cool, Reagan'),
            self.prepare_articles()[0]
        ]
        self.cluster = Event(members)
        entities = {ent.name for ent in self.cluster.entities}
        self.assertEqual(entities, {'Clinton', 'Reagan'})
Example 7
    def test_collect_ignores_existing(self):
        self.mock_articles.return_value = [
            Article(title='Foo', published=datetime.utcnow(), url='foo.com')
        ]

        collector.collect()
        collector.collect()

        self.assertEqual(Article.query.count(), 1)
Example 8
    def patch_articles(self):
        self.mock_articles = self.create_patch(
            'argos.core.membrane.collector.get_articles')
        self.mock_articles.return_value = [
            Article(title='Foo',
                    created_at=datetime.utcnow(),
                    ext_url='foo.com',
                    text='dinosaurs are cool, Clinton',
                    source=self.source)
        ]
Example 9
    def test_event_does_not_cluster_if_no_shared_entities(self):
        self.prepare_event()
        members = [
            Article(title='Robots',
                    text='dinosaurs are cool, Reagan',
                    created_at=datetime.utcnow())
        ]
        self.cluster.members = members

        Event.cluster([self.article])
        self.assertEqual(len(self.cluster.members), 1)
Example 10
    def prepare_articles(self, type='standard', score=100):
        a = {
            'title': 'Dinosaurs',
            'text': 'dinosaurs are cool, Clinton',
            'score': score
        }
        b = {
            'title': 'Robots',
            'text': 'robots are nice, Clinton',
            'score': score
        }
        c = {
            'title': 'Robots',
            'text': 'papa was a rodeo, Clinton',
            'score': score
        }

        if type == 'standard':
            articles = [Article(**a), Article(**b)]
        elif type == 'duplicate':
            articles = [Article(**a), Article(**a)]
        elif type == 'different':
            articles = [Article(**a), Article(**c)]
        else:
            raise ValueError("Unknown article type: '{0}'".format(type))

        # Need to save these articles to persist concepts,
        # so that their overlaps are calculated properly when clustering!
        for article in articles:
            self.db.session.add(article)
        self.db.session.commit()

        return articles
Example 11
    def test_collect(self):
        self.mock_articles.return_value = [
            Article(title='Foo', published=datetime.utcnow(), url='foo.com')
        ]

        self.assertEqual(Article.query.count(), 0)

        collector.collect()

        self.assertEqual(Article.query.count(), 1)

        article = Article.query.first()
        self.assertEqual(article.title, 'Foo')
Example 12
    def test_conceptize(self):
        members = [
            Article(title='Robots', text='dinosaurs are cool, Reagan'),
            self.prepare_articles()[0]
        ]
        self.event = Event(members)

        concepts = {con.slug for con in self.event.concepts}
        mentions = {ali.name for ali in self.event.mentions}

        self.assertEqual(concepts, {'Clinton', 'Reagan'})
        self.assertEqual(mentions, {'Clinton', 'Reagan'})

        # Each concept's score won't be 0.5, since
        # scores are weighed down by each concept's commonness.
        for concept in self.event.concepts:
            self.assertAlmostEqual(concept.score, 0.005, places=3)
Example 13
def evaluate_clustering():
    """
    Evaluate the clustering algorithm.
    """

    logger.info('Constructing expected clusters and articles...')
    expected_clusters = {}
    articles = []
    all_files = []

    # Collect all appropriate files.
    for dir, subdir, files in os.walk('manage/evaluate/organized_articles'):
        for file in files:
            filepath = os.path.join(dir, file)
            name, ext = os.path.splitext(filepath)
            if ext == '.txt':
                all_files.append((dir, name, filepath))

    # Create articles for appropriate files.
    for dir, name, filepath in all_files:
        category = dir.split('/')[-1]
        with open(filepath, 'r') as f:
            article = Article(text=f.read(), title=name.split('/')[-1])
        expected_clusters.setdefault(category, []).append(article)
        articles.append(article)
        progress_bar(len(articles) / len(all_files) * 100)
    print('\n')

    logger.info('Will cluster {0} articles.'.format(len(articles)))
    logger.info('Expecting {0} clusters.'.format(len(
        expected_clusters.keys())))

    logger.info('Clustering...')
    p = cProfile.Profile()
    clusters = p.runcall(Event.cluster, articles, threshold=0.04, debug=True)

    logger.info('Created {0} clusters.'.format(len(clusters)))

    logger.info('Cluster composition is as follows...')
    for c in clusters:
        logger.info([m.title for m in c.members])

    logger.info('Profiling statistics from the clustering...')
    ps = pstats.Stats(p)
    ps.sort_stats('time').print_stats(10)
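
progress_bar is not defined in these excerpts; a minimal stand-in with the same call convention (a percentage from 0 to 100) might look like:

import sys

def progress_bar(percent):
    # Render a single-line bar, e.g. [#########                 ] 35.0%
    bar = ('#' * int(percent / 2)).ljust(50)  # 50-character bar
    sys.stdout.write('\r[{0}] {1:.1f}%'.format(bar, percent))
    sys.stdout.flush()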
Example 14
    def test_conceptize_doesnt_set_new_alias_for_existing_concept_with_same_name(self):
        concept = fac.concept()
        uri = concept.uri

        concepts_count = Concept.query.count()
        self.assertEqual(len(concept.aliases), 1)

        # Mock things so we extract one concept with the same URI
        # as the one we just created.
        self.create_patch('argos.core.knowledge.uri_for_name', return_value=uri)
        self.create_patch('galaxy.concepts', return_value=[concept.aliases[0].name])

        # Create the article, which calls conceptize.
        article = Article(title='A title', text='Some text', score=100)

        self.assertEqual(Concept.query.count(), concepts_count)
        self.assertEqual(len(concept.aliases), 1)

        # There should be a mention for each concept on the article.
        self.assertEqual(len(article.concepts), len(article.mentions))
Example 15
    def test_conceptize_adds_new_mention_for_existing_concept_when_mentioned_name_is_different(self):
        concept = fac.concept()
        uri = concept.uri

        # Mock things so two concepts are returned, but since they share the same uri, they point to the same concept.
        self.create_patch('argos.core.knowledge.uri_for_name', return_value=uri)
        self.create_patch('galaxy.concepts', return_value=['Concept alias one', 'Concept alias two'])

        # Create the article, which calls conceptize.
        article = Article(title='A title', text='Some text', score=100)

        # There should still only be one concept on the article.
        self.assertEqual(len(article.concepts), 1)

        # But there should be two mentions.
        self.assertEqual(len(article.mentions), 2)

        # There should only be one concept.
        self.assertEqual(Concept.query.count(), 1)
        # But three aliases (the original one the
        # concept had, plus the two new ones here).
        self.assertEqual(len(concept.aliases), 3)
Example 16
    def test_conceptize_scores_related_concepts(self):
        # Set things up so two concepts are found
        # when processing the article.
        def mock_uri_for_name(name):
            if name == 'some concept':
                return 'uri_a'
            else:
                return 'uri_b'
        mock_func = self.create_patch('argos.core.knowledge.uri_for_name')
        mock_func.side_effect = mock_uri_for_name

        # One concept appears 3 times, the other only once.
        self.create_patch('galaxy.concepts', return_value=['some concept', 'another concept', 'some concept', 'some concept'])

        # Create the article, which calls conceptize.
        article = Article(title='A title', text='Some text', score=100)

        concepts = article.concepts
        for concept in concepts:
            if concept.uri == 'uri_a':
                self.assertEqual(concept.score, 0.75)
            else:
                self.assertEqual(concept.score, 0.25)
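
The expected scores follow from plain relative frequency over the four extracted names; a worked check:

from collections import Counter

names = ['some concept', 'another concept', 'some concept', 'some concept']
counts = Counter(names)
scores = {name: count / len(names) for name, count in counts.items()}
# {'some concept': 0.75, 'another concept': 0.25}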
Example 17
    def prepare_articles(self, type='standard'):
        a = {
            'title': 'Dinosaurs',
            'text': 'dinosaurs are cool, Clinton',
            'score': 100
        }
        b = {
            'title': 'Robots',
            'text': 'robots are nice, Clinton',
            'score': 100
        }
        c = {
            'title': 'Robots',
            'text': 'papa was a rodeo, Reagan',
            'score': 100
        }
        d = {
            'title': 'Turtle',
            'text': 'teenage mutant ninja turtles',
            'score': 100
        }
        e = {
            'title': 'Parrot',
            'text': 'yo lots of parrots in here',
            'score': 100
        }

        if type == 'standard':
            articles = [Article(**a), Article(**b)]
        elif type == 'duplicate':
            articles = [Article(**a), Article(**a)]
        elif type == 'different':
            articles = [Article(**a), Article(**c)]
        elif type == 'variety':
            articles = [
                Article(**a),
                Article(**a),
                Article(**b),
                Article(**b),
                Article(**c),
                Article(**c),
                Article(**d),
                Article(**d),
                Article(**e),
                Article(**e)
            ]
        else:
            raise ValueError("Unknown article type: '{0}'".format(type))

        # Need to save these articles to persist concepts,
        # so that their overlaps are calculated properly when clustering!
        for article in articles:
            self.db.session.add(article)
        self.db.session.commit()

        return articles
Example 18
    def run(self, dumppath, use_patch):
        if use_patch:
            print('Patching out saving images to S3...')
            patcher = patch('argos.util.storage.save_from_url', autospec=True, return_value='https://i.imgur.com/Zf9mXlj.jpg')
            patcher.start()
        else:
            patcher = None

        print('Loading sources...')
        sources_map = {}
        with open(os.path.join(dumppath, 'sources.json'), 'r') as f:
            sources = json.load(f)
            for i, s in enumerate(sources):
                source = Source.query.filter(Source.name == s['name']).first()
                if not source:
                    source = Source(name=s['name'])
                    db.session.add(source)
                id = s['_id']['$oid']
                sources_map[id] = source

                progress_bar(i/(len(sources) - 1) * 100)

        db.session.commit()

        print('\nLoading feeds...')
        feeds_map = {}
        with open(os.path.join(dumppath, 'feeds.json'), 'r') as f:
            feeds = json.load(f)
            for i, fd in enumerate(feeds):
                feed = Feed.query.filter(Feed.ext_url == fd['ext_url']).first()
                if not feed:
                    feed = Feed(ext_url=fd['ext_url'])
                    db.session.add(feed)
                feed.source = sources_map[fd['source']['$oid']]

                id = fd['_id']['$oid']
                feeds_map[id] = feed

                progress_bar(i/(len(feeds) - 1) * 100)

        db.session.commit()

        print('\nLoading articles...')
        with open(os.path.join(dumppath, 'articles.json'), 'r') as f:
            articles = json.load(f)
            for i, a in enumerate(articles):

                authors = []
                for author in a['authors']:
                    authors.append(Author.find_or_create(name=author))

                existing = Article.query.filter(Article.ext_url == a['ext_url']).first()

                if not existing:
                    feed = feeds_map[a['feed']['$oid']]
                    article = Article(
                        ext_url=a['ext_url'],
                        source=feed.source,
                        feed=feed,
                        html=None,    # not saved by argos.corpora
                        text=fix_text_segment(a['text']),
                        authors=authors,
                        tags=[],
                        title=fix_text_segment(a['title']),
                        created_at=datetime.fromtimestamp(a['created_at']['$date']/1000),
                        updated_at=datetime.fromtimestamp(a['updated_at']['$date']/1000),
                        image=a['image'],
                        score=evaluator.score(a['ext_url'])
                    )
                    db.session.add(article)
                progress_bar(i/(len(articles) - 1) * 100)

        print('Loaded {0} sources, {1} feeds, and {2} articles.'.format(len(sources), len(feeds), len(articles)))
        print('Done!')

        if patcher is not None:
            patcher.stop()
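
The dump files are MongoDB extended JSON, so ids arrive as {'$oid': ...} dicts and timestamps as millisecond {'$date': ...} values, hence the division by 1000 before datetime.fromtimestamp. For example:

from datetime import datetime

# A millisecond epoch value from the export becomes a naive datetime:
created = datetime.fromtimestamp(1393251600000 / 1000)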
Example 19
File: feed.py Project: keho98/argos
def articles(source):
    """
    Parse a feed from the specified source,
    gathering the latest articles.

    The minimum length of an entry is
    400 characters. Anything under will be ignored.

    This will silently skip articles for which the full text
    can't be retrieved (i.e. if it returns 404).

    Some feeds, for whatever reason, do not include a `published`
    date in their entry data, in which case the date extracted
    from the page is used as a fallback.

    Args:
        | source (Source)    -- the source to fetch from.

    Returns:
        | list -- list of processed latest articles (as dicts).
    """
    # Fetch the feed data.
    data = feedparser.parse(source.ext_url)

    # If the `bozo` value is anything
    # but 0, there was an error parsing (or connecting) to the feed.
    if data.bozo:
        # Some errors are ok.
        if not isinstance(
                data.bozo_exception,
                feedparser.CharacterEncodingOverride) and not isinstance(
                    data.bozo_exception, feedparser.NonXMLContentType):
            raise data.bozo_exception

    # Build the entry dicts.
    articles = []
    for entry in data.entries:

        # URL for this entry.
        url = entry['links'][0]['href']

        # Complete HTML content for this entry.
        try:
            entry_data, html = extract_entry_data(url)
            full_text = entry_data.cleaned_text
        except (error.HTTPError, error.URLError) as e:
            if type(e) == error.URLError or e.code == 404:
                continue
            else:
                raise

        # Skip over entries that are too short.
        if len(full_text) < 400:
            continue

        url = entry_data.canonical_link or url
        published = parse(entry.get('published')) if entry.get(
            'published') else entry_data.publish_date
        updated = parse(
            entry.get('updated')) if entry.get('updated') else published
        title = entry.get('title', entry_data.title)

        # Download and save the top article image.
        image_url = extract_image(entry_data,
                                  filename=hash(url),
                                  save_dir='data/images/')

        articles.append(
            Article(ext_url=url,
                    source=source,
                    html=html,
                    text=full_text,
                    authors=extract_authors(entry),
                    tags=extract_tags(entry, known_tags=entry_data.tags),
                    title=title,
                    created_at=published,
                    updated_at=updated,
                    image=image_url))

    return articles
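
The bozo handling above tolerates two recoverable parser complaints (CharacterEncodingOverride and NonXMLContentType) and re-raises anything else; a standalone illustration with a hypothetical URL:

import feedparser

data = feedparser.parse('http://example.com/feed.xml')
if data.bozo:
    print('Parser complaint:', type(data.bozo_exception).__name__)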
Example 20
def seed(debug=False):
    # Patch out saving images to S3.
    patcher = patch('argos.util.storage.save_from_url', autospec=True, return_value='https://i.imgur.com/Zf9mXlj.jpg')
    patcher.start()

    with open('manage/core/data/seed.json', 'r') as f:
        entries = json.load(f)

    sample_images = [
        'https://upload.wikimedia.org/wikipedia/commons/d/d5/Michael_Rogers_-_Herbiers_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/6/6e/Brandenburger_Tor_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/ad/ChicagoAntiGaddafiHopeless.jpg/576px-ChicagoAntiGaddafiHopeless.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Evo_morales_2_year_bolivia_Joel_Alvarez.jpg/640px-Evo_morales_2_year_bolivia_Joel_Alvarez.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9f/2010_wet_season_cloud_over_colombia.jpg/640px-2010_wet_season_cloud_over_colombia.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/27/Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg/640px-Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a9/2010-10-23-Demo-Stuttgart21-Befuerworter.png/640px-2010-10-23-Demo-Stuttgart21-Befuerworter.png',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fc/51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg/640px-51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Tough_return%2C_365.35.jpg/640px-Tough_return%2C_365.35.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/8/83/Saint_Isaac%27s_Cathedral_in_SPB.jpeg/800px-Saint_Isaac%27s_Cathedral_in_SPB.jpeg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a5/Ponta_de_S%C3%A3o_Louren%C3%A7o_north_north_east.jpg/800px-Ponta_de_S%C3%A3o_Louren%C3%A7o_north_north_east.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e9/TU_Bibl_01_DSC1099w.jpg/644px-TU_Bibl_01_DSC1099w.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fe/%D0%9C%D0%B0%D0%BA%D0%B5%D0%B4%D0%BE%D0%BD%D0%B8%D1%83%D0%BC_-_%D0%9A%D1%80%D1%83%D1%88%D0%B5%D0%B2%D0%BE.jpg/800px-%D0%9C%D0%B0%D0%BA%D0%B5%D0%B4%D0%BE%D0%BD%D0%B8%D1%83%D0%BC_-_%D0%9A%D1%80%D1%83%D1%88%D0%B5%D0%B2%D0%BE.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/6/64/Puente_Mong%2C_Ciudad_Ho_Chi_Minh%2C_Vietnam%2C_2013-08-14%2C_DD_01.JPG/800px-Puente_Mong%2C_Ciudad_Ho_Chi_Minh%2C_Vietnam%2C_2013-08-14%2C_DD_01.JPG',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Autignac%2C_H%C3%A9rault_01.jpg/800px-Autignac%2C_H%C3%A9rault_01.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/6/6a/Caesio_teres_in_Fiji_by_Nick_Hobgood.jpg/800px-Caesio_teres_in_Fiji_by_Nick_Hobgood.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Ash_in_Yogyakarta_during_the_2014_eruption_of_Kelud_01.jpg/800px-Ash_in_Yogyakarta_during_the_2014_eruption_of_Kelud_01.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/b/bd/12-07-12-wikimania-wdc-by-RalfR-010.jpg/800px-12-07-12-wikimania-wdc-by-RalfR-010.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/7/71/Mortagne-sur-Gironde_Civellier_Mayflowers_2013.jpg/800px-Mortagne-sur-Gironde_Civellier_Mayflowers_2013.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/5/5a/British_Museum_Great_Court%2C_London%2C_UK_-_Diliff.jpg/611px-British_Museum_Great_Court%2C_London%2C_UK_-_Diliff.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/26/Mercedes-Benz_Museum_201312_08_blue_hour.jpg/800px-Mercedes-Benz_Museum_201312_08_blue_hour.jpg'
    ]

    print('Resetting the database...')
    db.reflect()
    db.drop_all()
    db.create_all()

    # Create sources
    print('Creating sources...')
    create_sources()
    num_sources = Source.query.count()
    print('Created {0} sources.'.format(num_sources))

    # Create articles
    print('Seeding {0} articles...'.format(len(entries)))
    articles = []
    for entry in entries:
        if debug:
            print(json.dumps(entry, sort_keys=True, indent=4))

        feed = Feed.query.filter_by(ext_url=entry['source']).first()

        a = Article(
                ext_url=entry['url'],
                feed=feed,
                source=feed.source,
                html=entry['html'],
                text=entry['text'],
                tags=entry['tags'],
                title=entry['title'],
                created_at=parse(entry['published']),
                updated_at=parse(entry['updated']),
                image=random.choice(sample_images), # fake image
                score=random.random() * 100         # fake score
        )
        articles.append(a)
        db.session.add(a)

        progress_bar(len(articles) / len(entries) * 100)

    print('Creating additional articles...')

    # Collect all appropriate files.
    all_files = []
    for dir, subdir, files in os.walk('manage/core/data/organized_articles'):
        for file in files:
            filepath = os.path.join(dir, file)
            name, ext = os.path.splitext(filepath)
            if ext == '.txt':
                all_files.append((dir, name, filepath))

    # Create articles for appropriate files.
    for dir, name, filepath in all_files:
        category = dir.split('/')[-1]
        with open(filepath, 'r') as f:
            article = Article(
                    text=f.read(),
                    title=name.split('/')[-1],
                    ext_url='http://fauxurl/',
                    source=Source.query.get(1),         # fake source
                    image=random.choice(sample_images), # fake image
                    score=random.random() * 100         # fake score
            )
        db.session.add(article)
        articles.append(article)
        progress_bar(len(articles)/len(all_files) * 100)

    db.session.commit()

    num_articles = Article.query.count()
    num_concepts = Concept.query.count()
    print('Seeded {0} articles.'.format(num_articles))
    print('Found {0} concepts.'.format(num_concepts))

    print('Clustering articles into events...')
    cluster.cluster(articles)
    num_events = Event.query.count()
    print('Created {0} event clusters.'.format(num_events))

    print('Clustering events into stories...')
    # TODO
    num_stories = Story.query.count()
    print('Created {0} story clusters.'.format(num_stories))

    patcher.stop()

    print('\n\n==============================================')
    print('From {0} sources, seeded {1} articles, found {2} concepts, created {3} events and {4} stories.'.format(num_sources, num_articles, num_concepts, num_events, num_stories))
    print('==============================================\n\n')

    ctx = current_app.test_request_context()
    ctx.push()
    register_user(email='[email protected]', password='******')
    ctx.pop()
    print('\n\n==============================================')
    print('Created a test user, email is [email protected], password is password')
    print('==============================================\n\n')

    client = Client(
        #client_id=gen_salt(40),
        #client_secret=gen_salt(50),
        client_id='test',
        client_secret='test',
        _redirect_uris='http://localhost:5000/authorized',
        _default_scopes='userinfo',
        _allowed_grant_types='authorization_code refresh_token password',
        user_id=None,
        is_confidential=True # make a confidential client.
    )
    db.session.add(client)
    db.session.commit()
    print('\n\n==============================================')
    print('Created a test client:\nclient id: {0}\nclient secret: {1}'.format(client.client_id, client.client_secret))
    print('==============================================\n\n')
Example 21
def seed(debug=False):
    this_dir = os.path.dirname(__file__)
    with open(os.path.join(this_dir, 'seed.json'), 'r') as f:
        entries = json.load(f)
    with open(os.path.join(this_dir, 'seed_sources.json'), 'r') as f:
        source_urls = json.load(f)

    sample_images = [
        'https://upload.wikimedia.org/wikipedia/commons/d/d5/Michael_Rogers_-_Herbiers_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/6/6e/Brandenburger_Tor_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/ad/ChicagoAntiGaddafiHopeless.jpg/576px-ChicagoAntiGaddafiHopeless.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Evo_morales_2_year_bolivia_Joel_Alvarez.jpg/640px-Evo_morales_2_year_bolivia_Joel_Alvarez.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9f/2010_wet_season_cloud_over_colombia.jpg/640px-2010_wet_season_cloud_over_colombia.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/27/Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg/640px-Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a9/2010-10-23-Demo-Stuttgart21-Befuerworter.png/640px-2010-10-23-Demo-Stuttgart21-Befuerworter.png',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fc/51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg/640px-51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Tough_return%2C_365.35.jpg/640px-Tough_return%2C_365.35.jpg'
    ]

    print('Resetting the database...')
    db.drop_all()
    db.create_all()

    # Create sources
    print('Creating sources...')
    for url in source_urls:
        s = Source(ext_url=url, name='The Times') # fake name
        db.session.add(s)
    db.session.commit()
    num_sources = Source.query.count()
    print('Created {0} sources.'.format(num_sources))

    # Create articles
    print('Seeding {0} articles...'.format(len(entries)))
    articles = []
    for entry in entries:
        if debug:
            print(json.dumps(entry, sort_keys=True, indent=4))

        source = Source.query.filter_by(ext_url=entry['source']).first()

        a = Article(
                ext_url=entry['url'],
                source=source,
                html=entry['html'],
                text=entry['text'],
                tags=entry['tags'],
                title=entry['title'],
                created_at=parse(entry['published']),
                updated_at=parse(entry['updated']),
                image=random.choice(sample_images) # fake image
        )
        articles.append(a)
        db.session.add(a)

        progress_bar(len(articles) / len(entries) * 100)

    db.session.commit()

    num_articles = Article.query.count()
    num_entities = Entity.query.count()
    print('Seeded {0} articles.'.format(num_articles))
    print('Found {0} entities.'.format(num_entities))

    print('Clustering articles into events...')
    Event.cluster(articles, threshold=0.02, debug=True)
    num_events = Event.query.count()
    print('Created {0} event clusters.'.format(num_events))

    print('Clustering events into stories...')
    events = Event.query.all()
    Story.cluster(events, threshold=0.02, debug=True)
    num_stories = Story.query.count()
    print('Created {0} story clusters.'.format(num_stories))

    print('\n\n==============================================')
    print('From {0} sources, seeded {1} articles, found {2} entities, created {3} events and {4} stories.'.format(num_sources, num_articles, num_entities, num_events, num_stories))
    print('==============================================\n\n')
Example 22
    def test_event_titleize(self):
        members = ([Article(title='Robots', text='dinosaurs are cool, Reagan')]
                   + self.prepare_articles(type='duplicate'))
        self.cluster = Event(members)
        self.assertEqual(self.cluster.title, 'Dinosaurs')
Example 23
def get_articles(feed, fn):
    """
    Parse the specified feed,
    gathering the latest new articles.

    If an article matches one that already exists,
    it is skipped.

    The minimum length of an entry is
    400 characters. Anything under will be ignored.

    This will silently skip articles for which the full text
    can't be retrieved (i.e. if it returns 404).

    Some feeds, for whatever reason, do not include a `published`
    date in their entry data, in which case the date extracted
    from the page is used as a fallback.

    Args:
        | feed (Feed)    -- the feed to fetch from.
        | fn (Callable)  -- function to apply to each new article.
    """
    # Fetch the feed data.
    data = feedparser.parse(feed.ext_url)

    # If the `bozo` value is anything
    # but 0, there was an error parsing (or connecting) to the feed.
    if data.bozo:
        # Some errors are ok.
        if not isinstance(
                data.bozo_exception,
                feedparser.CharacterEncodingOverride) and not isinstance(
                    data.bozo_exception, feedparser.NonXMLContentType):
            raise data.bozo_exception

    for entry in data.entries:

        # URL for this entry.
        url = entry['links'][0]['href']

        # Check for an existing Article.
        # If one exists, skip.
        if Article.query.filter_by(ext_url=url).count():
            continue

        # Complete HTML content for this entry.
        try:
            entry_data, html = extractor.extract_entry_data(url)
        except (error.HTTPError, error.URLError, ConnectionResetError,
                BadStatusLine):
            # Whether unreachable (e.g. a 404) or some other extraction
            # failure, just skip so things don't break.
            logger.exception(
                'Error extracting data for url {0}'.format(url))
            continue

        if entry_data is None:
            continue

        full_text = entry_data.cleaned_text

        # Skip over entries that are too short.
        if len(full_text) < 400:
            continue

        url = entry_data.canonical_link or url
        published = parse(entry.get('published')) if entry.get(
            'published') else entry_data.publish_date
        updated = parse(
            entry.get('updated')) if entry.get('updated') else published
        title = entry.get('title', entry_data.title)

        # Secondary check for an existing Article,
        # by checking the title and source.
        existing = Article.query.filter_by(title=title).first()
        if existing and existing.source == feed.source:
            continue

        # Download and save the top article image.
        image_url = extractor.extract_image(entry_data, filename=hash(url))

        fn(
            Article(ext_url=url,
                    source=feed.source,
                    feed=feed,
                    html=html,
                    text=fix_text_segment(full_text),
                    authors=extractor.extract_authors(entry),
                    tags=extractor.extract_tags(entry,
                                                known_tags=entry_data.tags),
                    title=fix_text_segment(title),
                    created_at=published,
                    updated_at=updated,
                    image=image_url,
                    score=evaluator.score(url)))
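
fn receives each newly built Article; a plausible call site, assuming the SQLAlchemy session and Feed model used in the other examples:

def add_and_commit(article):
    db.session.add(article)
    db.session.commit()

for feed in Feed.query.all():
    get_articles(feed, add_and_commit)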