def test_filter_clusters_already_filtered(filterable_cluster):
    """A second call to ``filter_clusters`` must raise ``AlreadyFiltered``."""
    # First pass: filter the good cluster provided by the fixture.
    filter_clusters()

    # Second pass: filtering again must be refused.
    with pytest.raises(AlreadyFiltered):
        filter_clusters()
def test_filter_clusters_kept(filterable_cluster):
    """The fixture cluster loses every quote but #0 and is then kept."""
    filter_clusters()

    with session_scope() as session:
        filtered_clusters = session.query(Cluster).filter(
            Cluster.filtered.is_(True)
        )
        # Exactly one filtered cluster must exist.
        kept = filtered_clusters.one()

        # Only quote sid=0 is left in the kept cluster.
        assert kept.size == 1
        assert kept.quotes.first().sid == 0
def test_filter_clusters_emptied(filterable_cluster):
    """A cluster made bad by editing quote sid=0 is filtered out."""
    # Make the cluster bad: move the second url timestamp of quote sid=0
    # to 81 days from now.
    with session_scope() as session:
        target = session.query(Quote).filter(Quote.sid == 0).one()
        # Work on a copy and reassign the attribute rather than mutating
        # the stored list in place.
        shifted = target.url_timestamps.copy()
        shifted[1] = datetime.utcnow() + timedelta(days=81)
        target.url_timestamps = shifted

    # The modified cluster must not appear among the filtered clusters.
    filter_clusters()

    with session_scope() as session:
        filtered_clusters = session.query(Cluster).filter(
            Cluster.filtered.is_(True)
        )
        assert filtered_clusters.count() == 0
def test_filter_clusters_too_long(filterable_cluster):
    """A cluster whose span is too long after quote filtering is dropped."""
    # Make the cluster too long after quote filtering.
    with session_scope() as session:
        cluster = session.query(Cluster).first()

        # This quote is fine on its own, but sits too far from quote sid=0,
        # which makes the cluster span too long.
        distant_quote = Quote(
            sid=5,
            string='a string with enough words and no problems',
        )
        distant_url = Url(
            timestamp=datetime.utcnow() + timedelta(days=80, hours=1),
            frequency=2,
            url_type='M',
            url='some-url',
        )
        distant_quote.add_url(distant_url)
        cluster.quotes.append(distant_quote)

    # The over-long cluster must not appear among the filtered clusters.
    filter_clusters()

    with session_scope() as session:
        filtered_clusters = session.query(Cluster).filter(
            Cluster.filtered.is_(True)
        )
        assert filtered_clusters.count() == 0
def filter_memetracker(limit):
    """Filter MemeTracker data.

    Logs a start message, runs ``filter_clusters`` with ``limit`` passed
    through as its ``limit`` keyword argument, then logs a completion
    message.
    """
    logger.info('Starting filtering of memetracker data')

    # All the actual work is delegated to the cluster filter.
    filter_clusters(limit=limit)

    logger.info('Done filtering memetracker data')