def test_dump_to_sqlite_db_dumps(self): if os.path.exists( os.path.dirname(os.path.abspath(__file__)) + '/data/output.db'): os.remove( os.path.dirname(os.path.abspath(__file__)) + '/data/output.db') field_list = [ 'id_str', 'coordinates.coordinates.0', 'coordinates.coordinates.1', 'user.id_str', 'user.lang', 'lang', 'text', 'user.screen_name', 'user.location', 'user.description', 'created_at', 'user.friends_count', 'user.followers_count', 'retweet_count', 'entities.urls.0.expanded_url', 'entities.urls.1.expanded_url', 'entities.urls.2.expanded_url', 'entities.urls.3.expanded_url', 'entities.urls.4.expanded_url', 'entities.hashtags.0.text', 'entities.hashtags.1.text' ] output_path = os.path.dirname( os.path.realpath(__file__)) + '/' + 'data/output.db' collection = BsonCollection( os.path.dirname(os.path.realpath(__file__)) + '/' + config['bson']['valid']) collection.dump_to_sqlite_db(output_path, field_list) self.assertTrue(os.path.getsize(output_path) > 0) if os.path.exists( os.path.dirname(os.path.abspath(__file__)) + '/data/output.db'): os.remove( os.path.dirname(os.path.abspath(__file__)) + '/data/output.db')
def test_limit_actually_limits(self): collection = BsonCollection( os.path.dirname(os.path.realpath(__file__)) + '/' + config['bson']['valid']) collection.get_iterator() count = len( list(tweet for tweet in collection.set_limit(5).get_iterator())) self.assertEqual(5, count)
def test_strip_tweets_strips_many_tweets_totally(self): collectionone = BsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) iterator = collectionone.strip_tweets([]).get_iterator() first_tweet = next(iterator) second_tweet = next(iterator) #exhaust the iterator len(list(iterator)) self.assertTrue(first_tweet == {} and second_tweet == {})
def test_set_custom_filter_is_not_double_set(self): collection = BsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) def is_tweet_a_retweet(tweet): if 'retweeted' in tweet and tweet['retweeted']: return True else: return False collection.set_custom_filter(is_tweet_a_retweet) self.assertFalse(len(collection.custom_filters) > 1)
def test_strip_tweets_strips_many_tweets_totally(self): collectionone = BsonCollection( os.path.dirname(os.path.realpath(__file__)) + '/' + config['bson']['valid']) iterator = collectionone.strip_tweets([]).get_iterator() first_tweet = next(iterator) second_tweet = next(iterator) #exhaust the iterator len(list(iterator)) self.assertTrue(first_tweet == {} and second_tweet == {})
def test_dump_to_json_dumps(self): if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json') output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.bson.json' collection = BsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) collection.dump_to_json(output_path) self.assertTrue(os.path.getsize(output_path) > 0) if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json'): os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.bson.json')
def test_dump_to_csv_dumps(self): if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv'): os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv') output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.csv' collection = BsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) collection.dump_to_csv(output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1']) self.assertTrue(os.path.getsize(output_path) > 0) if os.path.exists(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv'): os.remove(os.path.dirname(os.path.abspath(__file__))+'/data/output.csv')
def test_strip_tweets_keeps_fields(self): tweet_parser = TweetParser() collection = BsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) self.maxDiff = None it = collection.strip_tweets(['id', 'entities.user_mentions', 'user.profile_image_url_https']).get_iterator() def tweets_have_right_keys(iterator, fields): for tweet in iterator: keys = [key for key,value in tweet_parser.flatten_dict(tweet)] for elem in fields: if elem not in keys: return False return True self.assertTrue(tweets_have_right_keys(it, [['id'], ['entities', 'user_mentions'], ['user', 'profile_image_url_https']]))
def test_filter_is_set(self): collection = BsonCollection( os.path.dirname(os.path.realpath(__file__)) + '/' + config['bson']['valid']) collection.set_filter({'a': 'b', 'c': 'd', 'e': {'f': 'g', 'h': 'i'}}) self.assertEqual(collection.filter, { 'a': 'b', 'c': 'd', 'e': { 'f': 'g', 'h': 'i' } })
def test_set_custom_filter_is_not_double_set(self): collection = BsonCollection( os.path.dirname(os.path.realpath(__file__)) + '/' + config['bson']['valid']) def is_tweet_a_retweet(tweet): if 'retweeted' in tweet and tweet['retweeted']: return True else: return False collection.set_custom_filter(is_tweet_a_retweet) self.assertFalse(len(collection.custom_filters) > 1)
def test_collection_filters_custom_filter_filters_something(self): collection = BsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) long_len = len(list(collection.get_iterator())) def is_tweet_a_retweet(tweet): if 'retweeted' in tweet and tweet['retweeted']: return True else: return False collection.set_custom_filter(is_tweet_a_retweet) shorter_len = len(list(collection.get_iterator())) #there should be fewer retweets than all tweets. self.assertTrue(long_len > shorter_len)
def test_dump_to_sqlite_db_dumps_the_right_stuff(self): if os.path.exists( os.path.dirname(os.path.abspath(__file__)) + '/data/output.db'): os.remove( os.path.dirname(os.path.abspath(__file__)) + '/data/output.db') field_list = [ 'id_str', 'coordinates.coordinates.0', 'coordinates.coordinates.1', 'user.id_str', 'user.lang', 'lang', 'text', 'user.screen_name', 'user.location', 'user.description', 'created_at', 'user.friends_count', 'user.followers_count', 'retweet_count', 'entities.urls.0.expanded_url', 'entities.urls.1.expanded_url', 'entities.urls.2.expanded_url', 'entities.urls.3.expanded_url', 'entities.urls.4.expanded_url', 'entities.hashtags.0.text', 'entities.hashtags.1.text' ] output_path = os.path.dirname( os.path.realpath(__file__)) + '/' + 'data/output.db' collection = BsonCollection( os.path.dirname(os.path.realpath(__file__)) + '/' + config['bson']['valid']) collection.dump_to_sqlite_db(output_path, field_list) con = sqlite3.connect(output_path) cur = con.cursor() row = [ elem for row in cur.execute("SELECT * FROM data LIMIT 1;") for elem in row ] con.close() self.assertTrue(len(row) > 0) self.assertEqual( set(row), set([ '661275583813431296', 'NULL', 'NULL', '379851447', 'en', 'de', 'Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat', 'Col_Connaughton', 'London UK', '#gaza #palestine #israel #BDS MAD EVIL ISRAEL MURDERS BABIES CIVILIANS to STEAL PALESTINIAN LAND RESOURCES with USA UK HELP. To stop my tweets, BLOCK or MUTE me', 'Mon Nov 02 20:15:59 +0000 2015', 2019, 3159, 0, 'https://www.youtube.com/watch?v=0nJqymxVpwc', 'NULL', 'NULL', 'NULL', 'NULL', 'jadehelm', 'newworldorder' ])) if os.path.exists( os.path.dirname(os.path.abspath(__file__)) + '/data/output.db'): os.remove( os.path.dirname(os.path.abspath(__file__)) + '/data/output.db')
def test_collection_filters_custom_filter_properly_applies_filter(self): collectionone = BsonCollection( os.path.dirname(os.path.realpath(__file__)) + '/' + config['bson']['valid']) full_collection_len = len(list(collectionone.get_iterator())) def is_tweet_a_retweet(tweet): if 'retweeted' in tweet and tweet['retweeted']: return True else: return False num_retweets = len( list( collectionone.set_custom_filter( is_tweet_a_retweet).get_iterator())) collectiontwo = BsonCollection( os.path.dirname(os.path.realpath(__file__)) + '/' + config['bson']['valid']) def is_not_a_retweet(tweet): if 'retweeted' in tweet and tweet['retweeted']: return False else: return True num_non_retweets = len( list( collectiontwo.set_custom_filter( is_not_a_retweet).get_iterator())) #the numbes of retweets and non retweets should add up to the whole collection self.assertEqual(num_retweets + num_non_retweets, full_collection_len)
def test_bson_collection_custom_filter_filters(self): collectionone = BsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) full_collection_len = len(list(collectionone.get_iterator())) def is_tweet_a_retweet(tweet): if 'retweeted' in tweet and tweet['retweeted']: return True else: return False num_retweets = len(list(collectionone.set_custom_filter(is_tweet_a_retweet).get_iterator())) collectiontwo = BsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) def is_not_a_retweet(tweet): if 'retweeted' in tweet and tweet['retweeted']: return False else: return True num_non_retweets = len(list(collectiontwo.set_custom_filter(is_not_a_retweet).get_iterator())) #the numbes of retweets and non retweets should add up to the whole collection self.assertEqual(num_retweets + num_non_retweets, full_collection_len)
def test_limit_is_set(self): collection = BsonCollection( os.path.dirname(os.path.realpath(__file__)) + '/' + config['bson']['valid']) collection.set_limit(5) self.assertEqual(5, collection.limit) collection.set_limit(0)
def test_dump_to_json_dumps(self): if os.path.exists( os.path.dirname(os.path.abspath(__file__)) + '/data/output.bson.json'): os.remove( os.path.dirname(os.path.abspath(__file__)) + '/data/output.bson.json') output_path = os.path.dirname( os.path.realpath(__file__)) + '/' + 'data/output.bson.json' collection = BsonCollection( os.path.dirname(os.path.realpath(__file__)) + '/' + config['bson']['valid']) collection.dump_to_json(output_path) self.assertTrue(os.path.getsize(output_path) > 0) if os.path.exists( os.path.dirname(os.path.abspath(__file__)) + '/data/output.bson.json'): os.remove( os.path.dirname(os.path.abspath(__file__)) + '/data/output.bson.json')
def test_collection_filters_custom_filter_filters_something(self): collection = BsonCollection( os.path.dirname(os.path.realpath(__file__)) + '/' + config['bson']['valid']) long_len = len(list(collection.get_iterator())) def is_tweet_a_retweet(tweet): if 'retweeted' in tweet and tweet['retweeted']: return True else: return False collection.set_custom_filter(is_tweet_a_retweet) shorter_len = len(list(collection.get_iterator())) #there should be fewer retweets than all tweets. self.assertTrue(long_len > shorter_len)
def test_iterator_returns_tweets(self): collection = BsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) self.assertTrue(len(list(collection.get_iterator())) > 0)
def test_limit_is_set(self): collection = BsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) collection.set_limit(5) self.assertEqual(5, collection.limit) collection.set_limit(0)
def test_limit_actually_limits(self): collection = BsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) collection.get_iterator() count = len(list(tweet for tweet in collection.set_limit(5).get_iterator())) self.assertEqual(5, count)
def test_filter_is_set(self): collection = BsonCollection(os.path.dirname(os.path.realpath(__file__)) +'/'+ config['bson']['valid']) collection.set_filter({'a':'b', 'c':'d', 'e':{'f':'g', 'h':'i'}}) self.assertEqual(collection.filter, {'a':'b', 'c':'d', 'e':{'f':'g', 'h':'i'}})