def test_within_geobox(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid'])
    dataset = SmappDataset(['json', file_path])
    # the geobox below covers the US mountain time zone
    # the last object in the data file carries a coordinate inside it: [-105.29, 40.33]
    # the json also contains one coordinate outside US mountain time: [-123.007053, 44.824997]
    count = len([tweet for tweet in dataset.within_geobox(-113.95, 28.81, -100.05, 48.87)])
    self.assertEqual(1, count)

def test_base_top_entities_returns_counts(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    returndict = dataset.get_top_entities({'urls': 5, 'symbols': 3})
    if len(returndict['urls']) > 0:
        self.assertEqual(5, len(returndict['urls']))
    if len(returndict['symbols']) > 0:
        self.assertEqual(3, len(returndict['symbols']))

def test_smapp_dataset_takes_collections_datasets_and_base_input_types(self):
    file_path_bson = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    file_path_json = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid'])
    file_path_csv = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['csv']['valid'])
    collection = SmappCollection('bson', file_path_bson)
    dataset_1 = SmappDataset(['bson', file_path_bson], ['csv', file_path_csv])
    dataset_2 = SmappDataset(dataset_1, ['json', file_path_json], collection)
    self.assertTrue(len(list(dataset_2)) > 0)

def test_dump_to_sqlite_db(self):
    output_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 'data/output.db')
    if os.path.exists(output_path):
        os.remove(output_path)
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    dataset.dump_to_sqlite_db(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1'])
    self.assertTrue(os.path.getsize(output_path) > 0)
    if os.path.exists(output_path):
        os.remove(output_path)

def test_dump_to_json(self):
    output_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 'data/output.bson.json')
    if os.path.exists(output_path):
        os.remove(output_path)
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    dataset.dump_to_json(output_path)
    self.assertTrue(os.path.getsize(output_path) > 0)
    if os.path.exists(output_path):
        os.remove(output_path)

def test_sample_chains_and_dumps(self):
    output_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 'data/output.bson.json')
    if os.path.exists(output_path):
        os.remove(output_path)
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    sample_tweets = dataset.sample(10)
    sample_tweets.dump_to_json(output_path)
    self.assertTrue(os.path.getsize(output_path) > 0)
    # dump_to_json writes one tweet per line, so 10 sampled tweets means 10 lines
    with open(output_path) as f:
        self.assertEqual(10, len([line for line in f]))
    if os.path.exists(output_path):
        os.remove(output_path)

def test_dump_to_csv_parallel(self):
    base_path = os.path.dirname(os.path.realpath(__file__))
    output_path = base_path + '/data/output.csv'
    split_paths = [base_path + '/data/output_0.csv', base_path + '/data/output_1.csv']
    for path in split_paths:
        if os.path.exists(path):
            os.remove(path)
    dataset = SmappDataset(['bson', '{}/{}'.format(base_path, config['bson']['valid'])])
    dataset.dump_to_csv(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1'], num_files=2)
    for path in split_paths:
        self.assertTrue(os.path.getsize(path) > 0)
        os.remove(path)

def test_dump_to_bson_parallel(self):
    base_path = os.path.dirname(os.path.realpath(__file__))
    output_path = base_path + '/data/output.bson'
    split_paths = [base_path + '/data/output_0.bson', base_path + '/data/output_1.bson']
    for path in split_paths:
        if os.path.exists(path):
            os.remove(path)
    dataset = SmappDataset(['bson', '{}/{}'.format(base_path, config['bson']['valid'])])
    dataset.dump_to_bson(output_path, num_files=2)
    for path in split_paths:
        self.assertTrue(os.path.getsize(path) > 0)
        os.remove(path)

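# The dump tests above all repeat the same exists-then-remove cleanup around
# every output file. A helper along these lines could collapse that
# boilerplate; this is a sketch, not part of the original suite, and the name
# _remove_if_exists is hypothetical (it relies only on the os module this
# file already imports):
def _remove_if_exists(*paths):
    for path in paths:
        if os.path.exists(path):
            os.remove(path)
# e.g. _remove_if_exists(output_path) at the top and bottom of each dump test.
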
def test_sample_returns_different_tweets_than_first_10_tweets(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    sample_tweets = list(dataset.sample(10))
    dataset_two = SmappDataset(['bson', file_path])
    first_ten_tweets = list(dataset_two.limit_number_of_tweets(10))
    self.assertNotEqual(sample_tweets, first_ten_tweets)

def test_set_custom_filter_properly_filters(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset_one = SmappDataset(['bson', file_path])
    full_collection_len = len(list(dataset_one))

    def is_tweet_a_retweet(tweet):
        return bool(tweet.get('retweeted'))

    num_retweets = len(list(dataset_one.set_custom_filter(is_tweet_a_retweet)))

    dataset_two = SmappDataset(['bson', file_path])

    def is_not_a_retweet(tweet):
        return not tweet.get('retweeted')

    num_non_retweets = len(list(dataset_two.set_custom_filter(is_not_a_retweet)))
    # the two predicates partition the collection, so the counts must sum to the total
    self.assertEqual(num_retweets + num_non_retweets, full_collection_len)

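# set_custom_filter accepts any predicate over a tweet dict, so the named
# helpers in the test above could just as well be inline lambdas. A minimal
# sketch (the function name below is hypothetical, not part of this suite;
# it reuses the same fixture path construction as the surrounding tests):
def example_count_retweets_with_lambda():
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    return len(list(dataset.set_custom_filter(lambda tweet: bool(tweet.get('retweeted')))))
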
def test_get_top_hashtags(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    base_hashtags = {'hashtags': {'2a': 26, 'pjnet': 26, 'jadehelm': 111, 'falseflag': 32, 'JadeHelm': 118}}
    hashtags = dataset.get_top_hashtags(5)
    self.assertEqual(set(hashtags.keys()), set(base_hashtags.keys()))

def test_user_id_is(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid'])
    dataset = SmappDataset(['json', file_path])
    count = len([tweet for tweet in dataset.user_id_is(379851447, 149751818)])
    self.assertEqual(77, count)

def test_sample_returns_right_number_of_items(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    sample_collection = dataset.sample(10)
    self.assertEqual(10, len(list(sample_collection)))

def test_base_top_entities_returns_dict(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    returndict = dataset.get_top_entities({'hashtags': 5})
    self.assertTrue(isinstance(returndict, dict))

def test_base_top_entities_returns_mentions_and_media(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    returndict = dataset.get_top_entities({'user_mentions': 5, 'media': 3})
    self.assertTrue('user_mentions' in returndict and 'media' in returndict)

def test_user_description_contains(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid'])
    dataset = SmappDataset(['json', file_path])
    count = len([tweet for tweet in dataset.user_description_contains('JESUS')])
    self.assertEqual(15, count)

def test_get_top_terms(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    top_counts = dataset.get_top_terms(10)
    base_top_counts = {'Jade': 538, 'Duty:': 146, 'Ops': 265, 'Sevenfold': 216, 'III': 173,
                       'RT': 524, 'Black': 235, 'Helm': 415, 'Avenged': 220, '-': 193}
    self.assertEqual(set(top_counts.keys()), set(base_top_counts.keys()))

def test_limit_number_of_tweets(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    self.assertTrue(len(list(dataset.limit_number_of_tweets(100))) > 0)

def test_smapp_dataset_file_pattern_returns_two_collections(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 'data/val*.bson')
    dataset = SmappDataset(['bson', 'file_pattern', file_path])
    self.assertTrue(all(isinstance(collection, BsonCollection) for collection in dataset.collections))

def test_smapp_dataset_file_pattern_takes_a_unix_pattern(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 'data/val*.bson')
    dataset = SmappDataset(['bson', 'file_pattern', file_path])
    self.assertTrue(len(list(dataset)) > 0)

def test_place_name_contains_country(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['json']['valid'])
    dataset = SmappDataset(['json', file_path])
    count = len([tweet for tweet in dataset.place_name_contains_country('United States')])
    self.assertEqual(6, count)

def test_get_non_geo_enabled(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    count = len([tweet for tweet in dataset.get_non_geo_enabled()])
    self.assertEqual(1186, count)

def test_get_top_urls(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    urls = dataset.get_top_urls(5)
    base_urls = {'urls': {'https://t.co/ATzXpRciyr': 18, 'https://t.co/dpz7vZ1JWy': 39,
                          'https://t.co/l9OEuvRlt8': 24, 'https://t.co/nkc4hnukLX': 21,
                          'https://t.co/rsNUItS48U': 60}}
    self.assertEqual(set(urls.keys()), set(base_urls.keys()))

def test_get_top_mentions(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    top_mentions = dataset.get_top_mentions(5)
    base_top_mentions = {'user_mentions': {'233498836': 58, '27234909': 56, '10228272': 75,
                                           '1619936671': 41, '733417892': 121}}
    self.assertEqual(set(top_mentions.keys()), set(base_top_mentions.keys()))

def test_smapp_dataset_file_pattern_takes_home_path(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), 'data/val*.bson')
    # collapse the current user's home directory to '~' so SmappDataset has to
    # expand it, rather than hardcoding one developer's home path
    file_path = file_path.replace(os.path.expanduser('~'), '~')
    dataset = SmappDataset(['bson', 'file_pattern', file_path])
    self.assertTrue(len(list(dataset)) > 0)

def test_get_top_media(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    top_media = dataset.get_top_media(5)
    base_top_media = {'media': {'https://t.co/pAfigDPcNc': 27, 'https://t.co/MaOGn6wH40': 17,
                                'https://t.co/TH8TmGuYww': 24, 'https://t.co/YpqDPqA2UO': 14,
                                'https://t.co/ORaTXOM2oX': 55}}
    self.assertEqual(set(top_media.keys()), set(base_top_media.keys()))

def test_smapp_dataset_csv_iterates(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['csv']['valid'])
    dataset = SmappDataset(['csv', file_path])
    self.assertTrue(len(list(dataset)) > 0)

def test_get_top_symbols(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    top_symbols = dataset.get_top_symbols(5)
    base_top_symbols = {'symbols': {0: None, 'hould': 1, 2: None, 3: None, 1: None}}
    self.assertEqual(set(top_symbols.keys()), set(base_top_symbols.keys()))

def get_smapp_mongo_dataset(dataset_name, db_host, db_port, db_user, db_pass):
    # raw strings keep the \d escapes intact; the regexes match the base
    # database/collection names as well as their numbered counterparts
    return SmappDataset(
        ['mongo', db_host, db_port, db_user, db_pass],
        collection_regex=r'(^data$|^tweets$|^tweets_\d+$)',
        database_regex=r'(^{0}$|^{0}_\d+$)'.format(dataset_name))

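# A usage sketch for the helper above. The connection values are placeholders,
# not settings from this repo; the regexes presumably exist to pick up both
# the base database/collection names and their numbered spillover counterparts
# (e.g. 'tweets_3', 'my_dataset_2'). Commented out because it needs a live
# MongoDB instance:
#
# dataset = get_smapp_mongo_dataset('my_dataset', 'localhost', 27017, 'db_user', 'db_pass')
# top_hashtags = dataset.get_top_hashtags(10)
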
def test_exclude_retweets(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    dataset = SmappDataset(['bson', file_path])
    count = len([tweet for tweet in dataset.exclude_retweets()])
    self.assertEqual(682, count)