Ejemplo n.º 1
0
 def test_within_geobox(self):
     # The bounding box used here is meant to cover US mountain time.
     # The fixture's last object carries a coordinate inside the box,
     # [-105.29, 40.33], and another object carries one outside of it,
     # [-123.007053, 44.824997], so exactly one tweet is expected to match.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['json']['valid'])
     dataset = SmappDataset(['json', file_path])
     matches = dataset.within_geobox(-113.95, 28.81, -100.05, 48.87)
     count = sum(1 for _ in matches)
     self.assertEqual(1, count)
Ejemplo n.º 2
0
 def test_base_top_entities_returns_counts(self):
     # Ask for the top 5 urls and top 3 symbols and check that each non-empty
     # result holds exactly the requested number of entries.
     # NOTE: an empty result for an entity type is accepted without any check.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['bson']['valid'])
     dataset = SmappDataset(['bson', file_path])
     returndict = dataset.get_top_entities({'urls':5, 'symbols':3})
     # assertEqual reports the actual length on failure, unlike assertTrue(a == b).
     if len(returndict['urls']) > 0:
         self.assertEqual(5, len(returndict['urls']))
     if len(returndict['symbols']) > 0:
         self.assertEqual(3, len(returndict['symbols']))
Ejemplo n.º 3
0
 def test_smapp_dataset_takes_collections_datasets_and_base_input_types(self):
     # A SmappDataset is built from three kinds of input at once: another
     # dataset, a plain [type, path] list, and a SmappCollection. Iterating
     # the combined dataset must yield at least one item.
     here = os.path.dirname(os.path.realpath(__file__))
     bson_path = '{}/{}'.format(here, config['bson']['valid'])
     json_path = '{}/{}'.format(here, config['json']['valid'])
     csv_path = '{}/{}'.format(here, config['csv']['valid'])

     # The collection reads the same valid BSON fixture as the inner dataset.
     collection = SmappCollection('bson', bson_path)
     inner_dataset = SmappDataset(['bson', bson_path], ['csv', csv_path])
     combined = SmappDataset(inner_dataset,  ['json', json_path], collection)

     item_count = len(list(combined))
     self.assertTrue(item_count > 0)
Ejemplo n.º 4
0
    def test_dump_to_sqlite_db(self):
        # Dump three fields of the valid BSON fixture to a SQLite file and
        # check that the resulting file is non-empty.
        base_dir = os.path.dirname(os.path.realpath(__file__))
        output_path = base_dir + '/' + 'data/output.db'

        # Start clean so a stale file cannot satisfy the size check.
        if os.path.exists(output_path):
            os.remove(output_path)

        try:
            dataset = SmappDataset(['bson', base_dir + '/' + config['bson']['valid']])
            dataset.dump_to_sqlite_db(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1'])
            self.assertTrue(os.path.getsize(output_path) > 0)
        finally:
            # Remove the artifact even when the dump or the assertion fails.
            if os.path.exists(output_path):
                os.remove(output_path)
Ejemplo n.º 5
0
    def test_dump_to_json(self):
        # Dump the valid BSON fixture to a JSON file and check that the
        # resulting file is non-empty.
        base_dir = os.path.dirname(os.path.realpath(__file__))
        output_path = '{}/{}'.format(base_dir, 'data/output.bson.json')

        # Start clean so a stale file cannot satisfy the size check.
        if os.path.exists(output_path):
            os.remove(output_path)

        try:
            dataset = SmappDataset(['bson', base_dir + '/' + config['bson']['valid']])
            dataset.dump_to_json(output_path)
            self.assertTrue(os.path.getsize(output_path) > 0)
        finally:
            # Remove the artifact even when the dump or the assertion fails.
            if os.path.exists(output_path):
                os.remove(output_path)
Ejemplo n.º 6
0
    def test_sample_chains_and_dumps(self):
        # Chain sample(10) into dump_to_json and check that the output file is
        # non-empty and holds exactly 10 lines.
        base_dir = os.path.dirname(os.path.realpath(__file__))
        output_path = '{}/{}'.format(base_dir, 'data/output.bson.json')

        # Start clean so a stale file cannot satisfy the checks below.
        if os.path.exists(output_path):
            os.remove(output_path)

        try:
            collection = SmappDataset(['bson', base_dir + '/' + config['bson']['valid']])
            sample_tweets = collection.sample(10)
            sample_tweets.dump_to_json(output_path)
            self.assertTrue(os.path.getsize(output_path) > 0)
            with open(output_path) as f:
                line_count = sum(1 for _ in f)
            self.assertEqual(10, line_count)
        finally:
            # Remove the artifact even when the dump or an assertion fails.
            if os.path.exists(output_path):
                os.remove(output_path)
Ejemplo n.º 7
0
    def test_dump_to_csv_parallel(self):
        # Dump three fields of the valid BSON fixture to CSV with num_files=2
        # and check that both numbered output files are non-empty.
        base_dir = os.path.dirname(os.path.realpath(__file__))
        output_path = base_dir + '/' + 'data/output.csv'
        # The files this test expects the dump to produce for 'data/output.csv'.
        part_paths = [
            base_dir + '/' + 'data/output_0.csv',
            base_dir + '/' + 'data/output_1.csv',
        ]

        def remove_parts():
            for part_path in part_paths:
                if os.path.exists(part_path):
                    os.remove(part_path)

        # Start clean so stale files cannot satisfy the size checks.
        remove_parts()

        try:
            dataset = SmappDataset(['bson', base_dir + '/' + config['bson']['valid']])
            dataset.dump_to_csv(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1'], num_files=2)
            for part_path in part_paths:
                self.assertTrue(os.path.getsize(part_path) > 0)
        finally:
            # Remove the artifacts even when the dump or an assertion fails.
            remove_parts()
Ejemplo n.º 8
0
    def test_dump_to_bson_parallel(self):
        # Dump the valid BSON fixture to BSON with num_files=2 and check that
        # both numbered output files are non-empty.
        base_dir = os.path.dirname(os.path.realpath(__file__))
        output_path = base_dir + '/' + 'data/output.bson'
        # The files this test expects the dump to produce for 'data/output.bson'.
        part_paths = [
            base_dir + '/' + 'data/output_0.bson',
            base_dir + '/' + 'data/output_1.bson',
        ]

        def remove_parts():
            for part_path in part_paths:
                if os.path.exists(part_path):
                    os.remove(part_path)

        # Start clean so stale files cannot satisfy the size checks.
        remove_parts()

        try:
            dataset = SmappDataset(['bson', base_dir + '/' + config['bson']['valid']])
            dataset.dump_to_bson(output_path, num_files=2)
            for part_path in part_paths:
                self.assertTrue(os.path.getsize(part_path) > 0)
        finally:
            # Remove the artifacts even when the dump or an assertion fails.
            remove_parts()
Ejemplo n.º 9
0
 def test_sample_returns_dif_tweets_than_fist_10_tweets(self):
     # Take a 10-item sample and the first 10 tweets from two separate
     # datasets over the same BSON fixture; the two lists must not be equal.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['bson']['valid'])

     sampled_source = SmappDataset(['bson', file_path])
     sample_tweets = list(sampled_source.sample(10))

     head_source = SmappDataset(['bson', file_path])
     first_ten_tweets = list(head_source.limit_number_of_tweets(10))

     self.assertNotEqual(sample_tweets, first_ten_tweets)
Ejemplo n.º 10
0
    def test_set_custom_filter_properly_filters(self):
        # Two complementary custom filters (retweet / not retweet) must split
        # the dataset so that their counts add up to the unfiltered total.
        here = os.path.dirname(os.path.realpath(__file__))
        file_path = '{}/{}'.format(here, config['bson']['valid'])

        def is_tweet_a_retweet(tweet):
            # True only when the 'retweeted' key is present and truthy.
            return bool('retweeted' in tweet and tweet['retweeted'])

        def is_not_a_retweet(tweet):
            # Exact complement of is_tweet_a_retweet.
            return not ('retweeted' in tweet and tweet['retweeted'])

        dataset_one = SmappDataset(['bson', file_path])
        full_collection_len = len(list(dataset_one))
        retweets = dataset_one.set_custom_filter(is_tweet_a_retweet)
        num_retweets = len(list(retweets))

        dataset_two = SmappDataset(['bson', file_path])
        non_retweets = dataset_two.set_custom_filter(is_not_a_retweet)
        num_non_retweets = len(list(non_retweets))

        self.assertEqual(num_retweets + num_non_retweets, full_collection_len)
Ejemplo n.º 11
0
 def test_get_top_hashtags(self):
     # Compare the keys of get_top_hashtags(5) with a reference result for
     # the valid BSON fixture.
     # NOTE(review): only the top-level keys are compared, so the individual
     # hashtags and their counts are not verified -- confirm this is intended.
     base_hashtags = {'hashtags': {'2a': 26, 'pjnet': 26, 'jadehelm': 111, 'falseflag': 32, 'JadeHelm': 118}}
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['bson']['valid'])
     dataset = SmappDataset(['bson', file_path])
     hashtags = dataset.get_top_hashtags(5)
     # assertEqual shows both key sets on failure, unlike assertTrue(a == b).
     self.assertEqual(set(base_hashtags.keys()), set(hashtags.keys()))
Ejemplo n.º 12
0
 def test_user_id_is(self):
     # Filtering the valid JSON fixture by the two user ids below is expected
     # to yield 77 tweets.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['json']['valid'])
     dataset = SmappDataset(['json', file_path])
     matches = dataset.user_id_is(379851447, 149751818)
     count = sum(1 for _ in matches)
     self.assertEqual(77, count)
Ejemplo n.º 13
0
 def test_sample_returns_right_number_of_items(self):
     # sample(10) on the valid BSON fixture must yield exactly 10 items.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['bson']['valid'])
     dataset = SmappDataset(['bson', file_path])
     sampled_items = list(dataset.sample(10))
     self.assertEqual(10, len(sampled_items))
Ejemplo n.º 14
0
 def test_base_top_entities_returns_dict(self):
     # get_top_entities must hand back a dict.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['bson']['valid'])
     dataset = SmappDataset(['bson', file_path])
     returndict = dataset.get_top_entities({'hashtags':5})
     # assertIsInstance reports the actual type on failure.
     self.assertIsInstance(returndict, dict)
Ejemplo n.º 15
0
 def test_base_top_entities_returns_hashtags_and_media(self):
     # Every entity type requested from get_top_entities must appear as a
     # key in the result.
     # NOTE(review): despite the test name, the entity types requested here
     # are 'user_mentions' and 'media', not hashtags.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['bson']['valid'])
     dataset = SmappDataset(['bson', file_path])
     returndict = dataset.get_top_entities({'user_mentions':5, 'media':3})
     # Separate assertIn calls name the missing key on failure.
     self.assertIn('user_mentions', returndict)
     self.assertIn('media', returndict)
Ejemplo n.º 16
0
 def test_user_description_contains(self):
     # Filtering the valid JSON fixture on user descriptions with the term
     # 'JESUS' is expected to yield 15 tweets.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['json']['valid'])
     dataset = SmappDataset(['json', file_path])
     matches = dataset.user_description_contains('JESUS')
     count = sum(1 for _ in matches)
     self.assertEqual(15, count)
Ejemplo n.º 17
0
 def test_get_top_terms(self):
     # The keys of get_top_terms(10) on the valid BSON fixture must match the
     # reference terms below; the counts themselves are not compared.
     base_top_counts = {'Jade': 538, 'Duty:': 146, 'Ops': 265, 'Sevenfold': 216, 'III': 173, 'RT': 524, 'Black': 235, 'Helm': 415, 'Avenged': 220, '-': 193}
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['bson']['valid'])
     dataset = SmappDataset(['bson', file_path])
     top_counts = dataset.get_top_terms(10)
     # assertEqual shows both key sets on failure, unlike assertTrue(a == b).
     self.assertEqual(set(base_top_counts.keys()), set(top_counts.keys()))
Ejemplo n.º 18
0
 def test_limit_number_of_tweets(self):
     # limit_number_of_tweets(100) on the valid BSON fixture must yield at
     # least one tweet.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['bson']['valid'])
     dataset = SmappDataset(['bson', file_path])
     limited_tweets = list(dataset.limit_number_of_tweets(100))
     self.assertTrue(len(limited_tweets) > 0)
Ejemplo n.º 19
0
 def test_smapp_dataset_file_pattern_returns_two_collections(self):
     # Every collection created from the 'data/val*.bson' file pattern must
     # be exactly a BsonCollection.
     # NOTE(review): despite the test name, the number of collections is not
     # checked, and an empty collection list would also pass.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, 'data/val*.bson')
     dataset = SmappDataset(['bson','file_pattern',file_path])
     type_checks = [type(collection) == BsonCollection for collection in dataset.collections]
     self.assertTrue(all(type_checks))
Ejemplo n.º 20
0
 def test_smapp_dataset_file_pattern_takes_a_unix_pattern(self):
     # A dataset built from the 'data/val*.bson' wildcard pattern must yield
     # at least one item when iterated.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, 'data/val*.bson')
     dataset = SmappDataset(['bson', 'file_pattern', file_path])
     items = list(dataset)
     self.assertTrue(len(items) > 0)
Ejemplo n.º 21
0
 def test_place_name_contains_country(self):
     # Filtering the valid JSON fixture with place_name_contains_country on
     # 'United States' is expected to yield 6 tweets.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['json']['valid'])
     dataset = SmappDataset(['json', file_path])
     matches = dataset.place_name_contains_country('United States')
     count = sum(1 for _ in matches)
     self.assertEqual(6, count)
Ejemplo n.º 22
0
 def test_get_non_geo_enabled(self):
     # get_non_geo_enabled() on the valid BSON fixture is expected to yield
     # 1186 tweets.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['bson']['valid'])
     dataset = SmappDataset(['bson', file_path])
     matches = dataset.get_non_geo_enabled()
     count = sum(1 for _ in matches)
     self.assertEqual(1186, count)
Ejemplo n.º 23
0
 def test_get_top_urls(self):
     # Compare the keys of get_top_urls(5) with a reference result for the
     # valid BSON fixture.
     # NOTE(review): only the top-level keys are compared, so the individual
     # urls and their counts are not verified -- confirm this is intended.
     base_urls = {'urls': {'https://t.co/ATzXpRciyr': 18, 'https://t.co/dpz7vZ1JWy': 39, 'https://t.co/l9OEuvRlt8': 24, 'https://t.co/nkc4hnukLX': 21, 'https://t.co/rsNUItS48U': 60}}
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['bson']['valid'])
     dataset = SmappDataset(['bson', file_path])
     urls = dataset.get_top_urls(5)
     # assertEqual shows both key sets on failure, unlike assertTrue(a == b).
     self.assertEqual(set(base_urls.keys()), set(urls.keys()))
Ejemplo n.º 24
0
 def test_get_top_mentions(self):
     # Compare the keys of get_top_mentions(5) with a reference result for
     # the valid BSON fixture.
     # NOTE(review): only the top-level keys are compared, so the individual
     # mention ids and their counts are not verified -- confirm this is intended.
     base_top_mentions = {'user_mentions': {'233498836': 58, '27234909': 56, '10228272': 75, '1619936671': 41, '733417892': 121}}
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['bson']['valid'])
     dataset = SmappDataset(['bson', file_path])
     top_mentions = dataset.get_top_mentions(5)
     # assertEqual shows both key sets on failure, unlike assertTrue(a == b).
     self.assertEqual(set(base_top_mentions.keys()), set(top_mentions.keys()))
Ejemplo n.º 25
0
 def test_smapp_dataset_file_pattern_takes_home_path(self):
     # A file pattern written with a leading '~' must still yield items.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, 'data/val*.bson')
     # Swap the current user's home-directory prefix for '~' instead of a
     # hard-coded '/Users/yvanscher', so the '~' form is exercised on any
     # machine where the fixtures live under the home directory. When they do
     # not, the absolute path is used unchanged, as before.
     home = os.path.expanduser('~').rstrip(os.sep)
     if home and file_path.startswith(home + os.sep):
         file_path = '~' + file_path[len(home):]
     dataset = SmappDataset(['bson','file_pattern',file_path])
     self.assertTrue(len(list(dataset)) > 0)
Ejemplo n.º 26
0
 def test_get_top_media(self):
     # Compare the keys of get_top_media(5) with a reference result for the
     # valid BSON fixture.
     # NOTE(review): only the top-level keys are compared, so the individual
     # media urls and their counts are not verified -- confirm this is intended.
     base_top_media = {'media': {'https://t.co/pAfigDPcNc': 27, 'https://t.co/MaOGn6wH40': 17, 'https://t.co/TH8TmGuYww': 24, 'https://t.co/YpqDPqA2UO': 14, 'https://t.co/ORaTXOM2oX': 55}}
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['bson']['valid'])
     dataset = SmappDataset(['bson', file_path])
     top_media = dataset.get_top_media(5)
     # assertEqual shows both key sets on failure, unlike assertTrue(a == b).
     self.assertEqual(set(base_top_media.keys()), set(top_media.keys()))
Ejemplo n.º 27
0
 def test_smapp_csv_collection_iterates(self):
     # A dataset over the valid CSV fixture must yield at least one item.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['csv']['valid'])
     dataset = SmappDataset(['csv', file_path])
     rows = list(dataset)
     self.assertTrue(len(rows) > 0)
Ejemplo n.º 28
0
 def test_get_top_symbols(self):
     # Compare the keys of get_top_symbols(5) with a reference result for
     # the valid BSON fixture.
     # NOTE(review): only the top-level keys are compared, so the individual
     # symbols and their counts are not verified -- confirm this is intended.
     base_top_symbols = {'symbols': {0: None, 'hould': 1, 2: None, 3: None, 1: None}}
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['bson']['valid'])
     dataset = SmappDataset(['bson', file_path])
     top_symbols = dataset.get_top_symbols(5)
     # assertEqual shows both key sets on failure, unlike assertTrue(a == b).
     self.assertEqual(set(base_top_symbols.keys()), set(top_symbols.keys()))
Ejemplo n.º 29
0
def get_smapp_mongo_dataset(dataset_name, db_host, db_port, db_user, db_pass):
    """Build a SmappDataset over a Mongo connection for a named dataset.

    The collection regex matches collections named ``data``, ``tweets`` or
    ``tweets_<digits>``. The database regex matches databases named exactly
    ``dataset_name`` or ``dataset_name_<digits>``.

    ``dataset_name`` is inserted into the database regex as-is (it is not
    escaped), so any regex metacharacters it contains are interpreted as
    regex syntax.

    Returns the constructed SmappDataset.
    """
    # Raw strings keep `\d` from being an invalid string escape sequence;
    # the resulting pattern text is unchanged.
    collection_regex = r'(^data$|^tweets$|^tweets_\d+$)'
    database_regex = r'(^' + dataset_name + r'$|^' + dataset_name + r'_\d+$)'
    return SmappDataset(
        ['mongo', db_host, db_port, db_user, db_pass],
        collection_regex=collection_regex,
        database_regex=database_regex,
    )
Ejemplo n.º 30
0
 def test_exclude_retweets(self):
     # exclude_retweets() on the valid BSON fixture is expected to yield
     # 682 tweets.
     here = os.path.dirname(os.path.realpath(__file__))
     file_path = '{}/{}'.format(here, config['bson']['valid'])
     dataset = SmappDataset(['bson', file_path])
     remaining = dataset.exclude_retweets()
     count = sum(1 for _ in remaining)
     self.assertEqual(682, count)