Example #1
    def test_cluster_creates_events(self):
        cluster.conf['min_articles'] = 1
        self.assertEqual(Event.query.count(), 0)

        # Since there are no events yet, we expect at least one to be created.
        articles = self.prepare_articles()
        cluster.cluster(articles)

        self.assertTrue(Event.query.count() > 0)
Example #2
    def test_cluster_creates_stories(self):
        cluster.conf['min_articles'] = 1
        cluster.conf['min_events'] = 1
        self.assertEqual(Story.query.count(), 0)

        articles = self.prepare_articles(type='variety')
        cluster.cluster(articles)

        self.assertTrue(Story.query.count() > 0)
Example #3
def cluster_articles():
    """
    Clusters articles which have not yet been incorporated into the clustering hierarchy.
    """
    articles = Article.query.filter(Article.node_id == None).all()
    if articles:
        try:
            cluster.cluster(articles)
        except cluster.LockException:
            # Another clustering run already holds the lock; skip this pass.
            pass
Example #4
    def test_cluster_minimum_articles(self):
        # Create articles and slice such that we expect one cluster of 2 articles and one of 1 article.
        cluster.conf['min_articles'] = 2
        articles = self.prepare_articles(type='different') + self.prepare_articles(type='different')
        articles = articles[:3]

        # Require a minimum of two articles to create an event.
        cluster.cluster(articles)

        # So we should only have one event.
        self.assertEqual(Event.query.count(), 1)
        self.assertEqual(Event.query.first().members.all(), [articles[0], articles[2]])
Example #5
    def test_cluster_updates_stories(self):
        cluster.conf['min_articles'] = 1
        cluster.conf['min_events'] = 1
        cluster.conf['story_threshold'] = 80.0
        raw_articles = self.prepare_articles(type='variety')
        articles = raw_articles[:4]
        new_articles = raw_articles[4:]
        cluster.cluster(articles)

        self.assertEqual(Story.query.count(), 1)
        for story in Story.query.all():
            self.assertEqual(story.members.count(), 1)

        cluster.cluster(new_articles)

        # The number of stories should not have changed.
        self.assertEqual(Story.query.count(), 1)

        for story in Story.query.all():
            self.assertEqual(story.members.count(), 4)
Example #6
    def test_cluster_minimum_articles_deletes(self):
        cluster.conf['min_articles'] = 2
        articles = self.prepare_articles(type='different') + self.prepare_articles(type='different')
        cluster.cluster(articles)

        # We should have two events first.
        self.assertEqual(Event.query.count(), 2)

        # Get one article that duplicates an existing event.
        new_articles = self.prepare_articles(type='different')
        new_articles = new_articles[:1]

        # Cluster the new duplicate article and raise the minimum.
        # One cluster should now have three articles and the other two.
        cluster.conf['min_articles'] = 3
        cluster.cluster(new_articles)

        # One event should have been deleted.
        self.assertEqual(Event.query.count(), 1)
        self.assertEqual(Event.query.first().members.all(), [articles[0], articles[2], new_articles[0]])
Example #7
    def cluster(articles, threshold=0.7, debug=False):
        """
        Clusters a set of articles
        into existing events (or creates new ones).

        Args:
            | articles (list)       -- the Articles to cluster
            | threshold (float)     -- the similarity threshold for qualifying a cluster
            | debug (bool)          -- will log clustering info if True

        Returns:
            | clusters (list)       -- the list of updated clusters
        """
        log = logger('EVENT_CLUSTERING')
        if debug:
            log.setLevel('DEBUG')
        else:
            log.setLevel('ERROR')

        updated_clusters = []
        active_clusters = Event.query.filter_by(active=True).all()
        now = datetime.utcnow()

        for article in articles:
            # Select candidate clusters,
            # i.e. active clusters which share at least one entity with this article.
            a_ents = [entity.slug for entity in article.entities]
            candidate_clusters = []
            for c in active_clusters:
                c_ents = [entity.slug for entity in c.entities]
                if set(c_ents).intersection(a_ents):
                    candidate_clusters.append(c)

            selected_cluster = cluster(article,
                                       candidate_clusters,
                                       threshold=threshold,
                                       logger=log)

            # If no selected cluster was found, then create a new one.
            if not selected_cluster:
                log.debug(
                    'No qualifying clusters found, creating a new cluster.')
                selected_cluster = Event([article])
                db.session.add(selected_cluster)

            updated_clusters.append(selected_cluster)

        for clus in active_clusters:
            # Mark expired clusters inactive.
            if (now - clus.updated_at).days > 3:
                clus.active = False
            else:
                clus.update()

        db.session.commit()
        return updated_clusters
Example #8
    def test_cluster_updates_events(self):
        cluster.conf['min_articles'] = 1
        articles = self.prepare_articles(type='different')
        cluster.cluster(articles)

        self.assertEqual(Event.query.count(), 2)
        for event in Event.query.all():
            self.assertEqual(event.members.count(), 1)

        # These two articles should be identical to the first one.
        new_articles = self.prepare_articles(type='duplicate')
        cluster.cluster(new_articles)

        # The number of events should not have changed.
        self.assertEqual(Event.query.count(), 2)

        # One event should have 3 articles.
        for event in Event.query.all():
            if articles[1] in event.members:
                self.assertEqual(event.members.count(), 1)
            else:
                self.assertEqual(event.members.count(), 3)
Example #9
    def run(self):
        path = APP['CLUSTERING']['hierarchy_path']
        path = os.path.expanduser(path)

        if os.path.exists(path):
            print('Backing up existing hierarchy...')
            shutil.move(path, path + '.bk')

        # Delete existing events.
        events = Event.query.delete()

        articles = Article.query.filter(Article.node_id != None).all()

        # Reset node associations.
        print('Resetting article-node associations...')
        for article in articles:
            article.node_id = None
        db.session.commit()

        print('Reconstructing the hierarchy...')
        start_time = time()

        total = Article.query.count()

        p = Progress()

        # Cluster articles in batches of 100, for memory's sake.
        batch_size = 100
        articles, remaining = get_unclustered_articles(batch_size)
        p.print_progress((total - remaining) / (total - 1))
        while articles:
            cluster.cluster(articles, snip=False)
            articles, remaining = get_unclustered_articles(batch_size)
            p.print_progress((total - remaining) / (total - 1))

        elapsed_time = time() - start_time
        # `articles` is empty once the loop exits, so report the overall total.
        print('Clustered {0} articles in {1:.2f} seconds.'.format(total, elapsed_time))
        print('Reconstruction done!')
Example #10
    def test_cluster_freezes_events(self):
        cluster.conf['min_articles'] = 1
        articles = self.prepare_articles(type='different')
        cluster.cluster(articles)

        self.assertEqual(Event.query.count(), 2)

        for event in Event.query.all():
            self.assertTrue(event.active)
            if articles[1] in event.members:
                # Backdate the event we expect to get no new articles, so it looks stale.
                event.updated_at = event.updated_at - timedelta(days=4)

        # These two articles should be identical to the first one.
        new_articles = self.prepare_articles(type='duplicate')
        cluster.cluster(new_articles)

        # One event gets updated, one doesn't.
        for event in Event.query.all():
            if articles[1] in event.members:
                self.assertFalse(event.active)
            else:
                self.assertTrue(event.active)
Example #11
    def cluster(events, threshold=0.7, debug=False):
        """
        Clusters a set of events
        into existing stories (or creates new ones).

        Args:
            | events (list)         -- the Events to cluster
            | threshold (float)     -- the similarity threshold for qualifying a cluster
            | debug (bool)          -- will log clustering info if True

        Returns:
            | clusters (list)       -- the list of updated clusters
        """
        log = logger('STORY_CLUSTERING')
        if debug:
            log.setLevel('DEBUG')
        else:
            log.setLevel('ERROR')

        updated_clusters = []

        for event in events:
            # Find stories that share at least one entity with this event.
            candidate_clusters = Story.query.filter(
                Entity.name.in_([entity.name for entity in event.entities])).all()

            # Cluster this event.
            selected_cluster = cluster(event, candidate_clusters, threshold=threshold, logger=log)

            # If no selected cluster was found, then create a new one.
            if not selected_cluster:
                log.debug('No qualifying clusters found, creating a new cluster.')
                selected_cluster = Story([event])
                db.session.add(selected_cluster)

            updated_clusters.append(selected_cluster)

        db.session.commit()
        return updated_clusters
Example #12
def seed(debug=False):
    # Patch out saving images to S3.
    patcher = patch('argos.util.storage.save_from_url', autospec=True, return_value='https://i.imgur.com/Zf9mXlj.jpg')
    patcher.start()

    seeds = open('manage/core/data/seed.json', 'r')

    sample_images = [
        'https://upload.wikimedia.org/wikipedia/commons/d/d5/Michael_Rogers_-_Herbiers_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/6/6e/Brandenburger_Tor_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/ad/ChicagoAntiGaddafiHopeless.jpg/576px-ChicagoAntiGaddafiHopeless.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Evo_morales_2_year_bolivia_Joel_Alvarez.jpg/640px-Evo_morales_2_year_bolivia_Joel_Alvarez.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9f/2010_wet_season_cloud_over_colombia.jpg/640px-2010_wet_season_cloud_over_colombia.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/27/Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg/640px-Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a9/2010-10-23-Demo-Stuttgart21-Befuerworter.png/640px-2010-10-23-Demo-Stuttgart21-Befuerworter.png',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fc/51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg/640px-51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Tough_return%2C_365.35.jpg/640px-Tough_return%2C_365.35.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/8/83/Saint_Isaac%27s_Cathedral_in_SPB.jpeg/800px-Saint_Isaac%27s_Cathedral_in_SPB.jpeg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a5/Ponta_de_S%C3%A3o_Louren%C3%A7o_north_north_east.jpg/800px-Ponta_de_S%C3%A3o_Louren%C3%A7o_north_north_east.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e9/TU_Bibl_01_DSC1099w.jpg/644px-TU_Bibl_01_DSC1099w.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fe/%D0%9C%D0%B0%D0%BA%D0%B5%D0%B4%D0%BE%D0%BD%D0%B8%D1%83%D0%BC_-_%D0%9A%D1%80%D1%83%D1%88%D0%B5%D0%B2%D0%BE.jpg/800px-%D0%9C%D0%B0%D0%BA%D0%B5%D0%B4%D0%BE%D0%BD%D0%B8%D1%83%D0%BC_-_%D0%9A%D1%80%D1%83%D1%88%D0%B5%D0%B2%D0%BE.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/6/64/Puente_Mong%2C_Ciudad_Ho_Chi_Minh%2C_Vietnam%2C_2013-08-14%2C_DD_01.JPG/800px-Puente_Mong%2C_Ciudad_Ho_Chi_Minh%2C_Vietnam%2C_2013-08-14%2C_DD_01.JPG',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Autignac%2C_H%C3%A9rault_01.jpg/800px-Autignac%2C_H%C3%A9rault_01.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/6/6a/Caesio_teres_in_Fiji_by_Nick_Hobgood.jpg/800px-Caesio_teres_in_Fiji_by_Nick_Hobgood.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Ash_in_Yogyakarta_during_the_2014_eruption_of_Kelud_01.jpg/800px-Ash_in_Yogyakarta_during_the_2014_eruption_of_Kelud_01.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/b/bd/12-07-12-wikimania-wdc-by-RalfR-010.jpg/800px-12-07-12-wikimania-wdc-by-RalfR-010.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/7/71/Mortagne-sur-Gironde_Civellier_Mayflowers_2013.jpg/800px-Mortagne-sur-Gironde_Civellier_Mayflowers_2013.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/5/5a/British_Museum_Great_Court%2C_London%2C_UK_-_Diliff.jpg/611px-British_Museum_Great_Court%2C_London%2C_UK_-_Diliff.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/26/Mercedes-Benz_Museum_201312_08_blue_hour.jpg/800px-Mercedes-Benz_Museum_201312_08_blue_hour.jpg'
    ]

    print('Resetting the database...')
    db.reflect()
    db.drop_all()
    db.create_all()

    # Create sources
    print('Creating sources...')
    create_sources()
    num_sources = Source.query.count()
    print('Created {0} sources.'.format(num_sources))

    # Create articles
    entries = json.load(seeds)
    print('Seeding {0} articles...'.format(len(entries)))
    articles = []
    for entry in entries:
        if debug:
            print(json.dumps(entry, sort_keys=True, indent=4))

        feed = Feed.query.filter_by(ext_url=entry['source']).first()

        a = Article(
                ext_url=entry['url'],
                feed=feed,
                source=feed.source,
                html=entry['html'],
                text=entry['text'],
                tags=entry['tags'],
                title=entry['title'],
                created_at=parse(entry['published']),
                updated_at=parse(entry['updated']),
                image=random.choice(sample_images), # fake image
                score=random.random() * 100         # fake score
        )
        articles.append(a)
        db.session.add(a)

        progress_bar(len(articles) / len(entries) * 100)

    print('Creating additional articles...')

    # Collect all appropriate files.
    all_files = []
    for dir, subdir, files in os.walk('manage/core/data/organized_articles'):
        for file in files:
            filepath = os.path.join(dir, file)
            name, ext = os.path.splitext(filepath)
            if ext == '.txt':
                all_files.append((dir, name, filepath))

    # Create articles for appropriate files.
    for dir, name, filepath in all_files:
        category = dir.split('/')[-1]
        f = open(filepath, 'r')
        article = Article(
                text=f.read(),
                title=name.split('/')[-1],
                ext_url='http://fauxurl/',
                source=Source.query.get(1),          # fake source
                image=random.choice(sample_images), # fake image
                score=random.random() * 100         # fake score
        )
        db.session.add(article)
        articles.append(article)
        progress_bar(len(articles) / len(all_files) * 100)

    db.session.commit()

    num_articles = Article.query.count()
    num_concepts = Concept.query.count()
    print('Seeded {0} articles.'.format(num_articles))
    print('Found {0} concepts.'.format(num_concepts))

    print('Clustering articles into events...')
    cluster.cluster(articles)
    num_events = Event.query.count()
    print('Created {0} event clusters.'.format(num_events))

    print('Clustering events into stories...')
    # TODO
    num_stories = Story.query.count()
    print('Created {0} story clusters.'.format(num_stories))

    patcher.stop()

    print('\n\n==============================================')
    print('From {0} sources, seeded {1} articles, found {2} concepts, created {3} events and {4} stories.'.format(num_sources, num_articles, num_concepts, num_events, num_stories))
    print('==============================================\n\n')

    client = current_app.test_client()
    ctx = current_app.test_request_context()
    ctx.push()
    register_user(email='[email protected]', password='******')
    ctx.pop()
    print('\n\n==============================================')
    print('Created a test user, email is [email protected], password is password')
    print('==============================================\n\n')

    client = Client(
        #client_id=gen_salt(40),
        #client_secret=gen_salt(50),
        client_id='test',
        client_secret='test',
        _redirect_uris='http://localhost:5000/authorized',
        _default_scopes='userinfo',
        _allowed_grant_types='authorization_code refresh_token password',
        user_id=None,
        is_confidential=True # make a confidential client.
    )
    db.session.add(client)
    db.session.commit()
    print('\n\n==============================================')
    print('Created a test client:\nclient id: {0}\nclient secret: {1}'.format(client.client_id, client.client_secret))
    print('==============================================\n\n')