def test_user_description_contains(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['json']['valid'])
    collection = SmappCollection('json', file_path)
    count = len(
        [tweet for tweet in collection.user_description_contains('JESUS')])
    self.assertEqual(15, count)
def test_dump_to_csv(self):
    if os.path.exists(
            os.path.dirname(os.path.abspath(__file__)) + '/data/output.csv'):
        os.remove(
            os.path.dirname(os.path.abspath(__file__)) + '/data/output.csv')
    output_path = os.path.dirname(
        os.path.realpath(__file__)) + '/' + 'data/output.csv'
    collection = SmappCollection(
        'bson',
        os.path.dirname(os.path.realpath(__file__)) + '/' +
        config['bson']['valid'])
    collection.dump_to_csv(
        output_path, ['id_str', 'entities.hashtags.0', 'entities.hashtags.1'])
    self.assertTrue(os.path.getsize(output_path) > 0)
    if os.path.exists(
            os.path.dirname(os.path.abspath(__file__)) + '/data/output.csv'):
        os.remove(
            os.path.dirname(os.path.abspath(__file__)) + '/data/output.csv')
def test_user_id_is(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['json']['valid'])
    collection = SmappCollection('json', file_path)
    count = len(
        [tweet for tweet in collection.user_id_is(379851447, 149751818)])
    self.assertEqual(77, count)
def test_sample_chains_and_dumps(self):
    if os.path.exists(
            os.path.dirname(os.path.abspath(__file__)) +
            '/data/output.bson.json'):
        os.remove(
            os.path.dirname(os.path.abspath(__file__)) +
            '/data/output.bson.json')
    output_path = '{}/{}'.format(
        os.path.dirname(os.path.realpath(__file__)), 'data/output.bson.json')
    collection = SmappCollection(
        'bson',
        os.path.dirname(os.path.realpath(__file__)) + '/' +
        config['bson']['valid'])
    sample_tweets = collection.sample(10)
    sample_tweets.dump_to_json(output_path)
    self.assertTrue(os.path.getsize(output_path) > 0)
    with open(output_path) as f:
        self.assertEqual(10, len([line for line in f]))
    if os.path.exists(
            os.path.dirname(os.path.abspath(__file__)) +
            '/data/output.bson.json'):
        os.remove(
            os.path.dirname(os.path.abspath(__file__)) +
            '/data/output.bson.json')
def test_detect_tweet_language(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection = SmappCollection('bson', file_path)
    count = len(
        [tweet for tweet in collection.detect_tweet_language('en')])
    self.assertEqual(907, count)
def test_get_tweets_containing(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection = SmappCollection('bson', file_path)
    count = len(
        [tweet for tweet in collection.get_tweets_containing('jade')])
    self.assertEqual(167, count)
def test_get_date_range(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection = SmappCollection('bson', file_path)
    count = len([
        tweet for tweet in collection.get_date_range(
            datetime(2015, 11, 2), datetime(2015, 11, 3))
    ])
    self.assertEqual(26, count)
def test_place_name_contains_country(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['json']['valid'])
    collection = SmappCollection('json', file_path)
    count = len([
        tweet for tweet in collection.place_name_contains_country(
            'United States')
    ])
    self.assertEqual(6, count)
def test_find_date_range(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection = SmappCollection('bson', file_path)
    range_obj = collection.find_date_range()
    self.assertEqual(datetime(2015, 11, 2, 19, 56, 33), range_obj['date_min'])
    self.assertEqual(datetime(2015, 11, 6, 21, 35, 54), range_obj['date_max'])
def test_base_top_entities_returns_counts(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection = SmappCollection('bson', file_path)
    returndict = collection.get_top_entities({'urls': 5, 'symbols': 3})
    if len(returndict['urls']) > 0:
        self.assertTrue(len(returndict['urls']) == 5)
    if len(returndict['symbols']) > 0:
        self.assertTrue(len(returndict['symbols']) == 3)
def test_get_tweet_texts(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection = SmappCollection('bson', file_path)
    texts = [
        text
        for text in collection.limit_number_of_tweets(1).get_tweet_texts()
    ]
    self.assertEqual(str, type(texts[0]))
def test_base_top_entities_returns_hashtags_and_media(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection = SmappCollection('bson', file_path)
    returndict = collection.get_top_entities({
        'user_mentions': 5,
        'media': 3
    })
    self.assertTrue('user_mentions' in returndict and 'media' in returndict)
def test_within_geobox(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['json']['valid'])
    collection = SmappCollection('json', file_path)
    # the geobox below covers US mountain time; a coordinate inside the box
    # ([-105.29, 40.33]) was added to the last object in the data file, and a
    # coordinate outside the box ([-123.007053, 44.824997]) was also added to
    # the json, so exactly one tweet should match
    count = len([
        tweet for tweet in collection.within_geobox(
            -113.95, 28.81, -100.05, 48.87)
    ])
    self.assertEqual(1, count)
def test_get_top_urls(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection = SmappCollection('bson', file_path)
    urls = collection.get_top_urls(5)
    base_urls = {
        'urls': {
            'https://t.co/ATzXpRciyr': 18,
            'https://t.co/dpz7vZ1JWy': 39,
            'https://t.co/l9OEuvRlt8': 24,
            'https://t.co/nkc4hnukLX': 21,
            'https://t.co/rsNUItS48U': 60
        }
    }
    self.assertTrue(set(urls.keys()) == set(base_urls.keys()))
def test_get_top_hashtags(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection = SmappCollection('bson', file_path)
    base_hashtags = {
        'hashtags': {
            '2a': 26,
            'pjnet': 26,
            'jadehelm': 111,
            'falseflag': 32,
            'JadeHelm': 118
        }
    }
    hashtags = collection.get_top_hashtags(5)
    self.assertTrue(set(hashtags.keys()) == set(base_hashtags.keys()))
def test_get_top_media(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection = SmappCollection('bson', file_path)
    top_media = collection.get_top_media(5)
    base_top_media = {
        'media': {
            'https://t.co/pAfigDPcNc': 27,
            'https://t.co/MaOGn6wH40': 17,
            'https://t.co/TH8TmGuYww': 24,
            'https://t.co/YpqDPqA2UO': 14,
            'https://t.co/ORaTXOM2oX': 55
        }
    }
    self.assertTrue(set(top_media.keys()) == set(base_top_media.keys()))
def test_get_top_symbols(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection = SmappCollection('bson', file_path)
    top_symbols = collection.get_top_symbols(5)
    base_top_symbols = {
        'symbols': {
            0: None,
            'hould': 1,
            2: None,
            3: None,
            1: None
        }
    }
    self.assertTrue(
        set(top_symbols.keys()) == set(base_top_symbols.keys()))
def test_get_top_mentions(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection = SmappCollection('bson', file_path)
    top_mentions = collection.get_top_mentions(5)
    base_top_mentions = {
        'user_mentions': {
            '233498836': 58,
            '27234909': 56,
            '10228272': 75,
            '1619936671': 41,
            '733417892': 121
        }
    }
    self.assertTrue(
        set(top_mentions.keys()) == set(base_top_mentions.keys()))
def dump_tweets(filename, retweets, fields=None):
    collection = SmappCollection('json', filename)
    collection = collection.user_language_is('en')
    if fields is None:
        fields = ['id', 'text', 'timestamp_ms', 'user.id_str']
    if retweets:
        collection = collection.get_retweets()
        collection.dump_to_csv('/scratch/en919/retw_' + args[1] + '.csv',
                               fields)
    else:
        collection = collection.exclude_retweets()
        collection.dump_to_csv('/scratch/en919/no_retw_' + args[1] + '.csv',
                               fields)
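# NOTE: dump_tweets relies on a module-level `args` sequence (not defined in this
# snippet) to build the output CSV name, plus the hard-coded /scratch/en919/ path.
# A minimal, hypothetical driver under those assumptions -- not the original wiring:
if __name__ == '__main__':
    import sys
    args = sys.argv  # hypothetical: args[1] supplies the output filename suffix
    dump_tweets('tweets.json', retweets=False)  # 'tweets.json' is a placeholder input path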
def test_smapp_dataset_takes_collections_datasets_and_base_input_types(self):
    file_path_bson = '{}/{}'.format(
        os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    file_path_bson_2 = '{}/{}'.format(
        os.path.dirname(os.path.realpath(__file__)), config['bson']['valid'])
    file_path_json = '{}/{}'.format(
        os.path.dirname(os.path.realpath(__file__)), config['json']['valid'])
    file_path_csv = '{}/{}'.format(
        os.path.dirname(os.path.realpath(__file__)), config['csv']['valid'])
    collection = SmappCollection('bson', file_path_bson_2)
    dataset_1 = SmappDataset(['bson', file_path_bson], ['csv', file_path_csv])
    dataset_2 = SmappDataset(dataset_1, ['json', file_path_json], collection)
    self.assertTrue(len(list(dataset_2)) > 0)
def json2csv(f, f_out, cols=cols):
    '''
    Reads a json file into a SmappCollection.
    Dumps to csv.
    Removes incompletely dumped csv's from past jobs.
    gzips csv.
    '''
    collection = SmappCollection('json', f).dump_to_csv(f_out, cols)
    gzip(f_out)
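# NOTE: `gzip(...)` above is assumed to be a local helper (it shadows the stdlib
# module name) that is not shown in this snippet. A minimal sketch of what such a
# helper might do, using only the standard library -- the original may differ:
import gzip as gzip_module
import os
import shutil

def gzip_file(path):
    """Hypothetical helper: compress `path` to `path + '.gz'` and remove the original."""
    with open(path, 'rb') as src, gzip_module.open(path + '.gz', 'wb') as dst:
        shutil.copyfileobj(src, dst)
    os.remove(path)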
def test_get_top_terms(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection = SmappCollection('bson', file_path)
    top_counts = collection.get_top_terms(10)
    base_top_counts = {
        'Jade': 538,
        'Duty:': 146,
        'Ops': 265,
        'Sevenfold': 216,
        'III': 173,
        'RT': 524,
        'Black': 235,
        'Helm': 415,
        'Avenged': 220,
        '-': 193
    }
    self.assertTrue(set(top_counts.keys()) == set(base_top_counts.keys()))
def test_set_custom_filter_properly_filters(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection_one = SmappCollection('bson', file_path)
    full_collection_len = len(list(collection_one))

    def is_tweet_a_retweet(tweet):
        if 'retweeted' in tweet and tweet['retweeted']:
            return True
        else:
            return False

    num_retweets = len(
        list(collection_one.set_custom_filter(is_tweet_a_retweet)))
    collection_two = SmappCollection('bson', file_path)

    def is_not_a_retweet(tweet):
        if 'retweeted' in tweet and tweet['retweeted']:
            return False
        else:
            return True

    num_non_retweets = len(
        list(collection_two.set_custom_filter(is_not_a_retweet)))
    self.assertEqual(num_retweets + num_non_retweets, full_collection_len)
def split_json(f, chunksize=120000, remove=True):
    '''
    Splits json into chunksize, and removes old file
    '''
    collection = SmappCollection('json', f)
    for i, group in enumerate(grouper(collection, chunksize)):
        f_out = f.replace('.json', '___pt{:03d}.json'.format(i))
        if os.path.isfile(f_out):
            os.remove(f_out)
        with open(f_out, 'w') as outputfile:
            json.dump(list(group), outputfile, ensure_ascii=False)
    if remove:
        os.remove(f)
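# NOTE: `grouper` is not defined in this snippet. A minimal sketch of a chunking
# helper with the behavior split_json appears to assume (yield the collection in
# chunks of `chunksize` without padding the final chunk) -- the original may differ:
from itertools import islice

def grouper(iterable, n):
    """Hypothetical helper: yield successive lists of up to n items from iterable."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            break
        yield chunk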
def test_sample_returns_dif_tweets_than_first_10_tweets(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection_one = SmappCollection('bson', file_path)
    sample_tweets = list(collection_one.sample(10))
    collection_two = SmappCollection('bson', file_path)
    first_ten_tweets = list(collection_two.limit_number_of_tweets(10))
    self.assertNotEqual(sample_tweets, first_ten_tweets)
def test_dump_to_bson(self):
    if os.path.exists(
            os.path.dirname(os.path.abspath(__file__)) + '/data/output.bson'):
        os.remove(
            os.path.dirname(os.path.abspath(__file__)) + '/data/output.bson')
    output_path = os.path.dirname(
        os.path.realpath(__file__)) + '/' + 'data/output.bson'
    collection = SmappCollection(
        'bson',
        os.path.dirname(os.path.realpath(__file__)) + '/' +
        config['bson']['valid'])
    collection.dump_to_bson(output_path)
    self.assertTrue(os.path.getsize(output_path) > 0)
    if os.path.exists(
            os.path.dirname(os.path.abspath(__file__)) + '/data/output.bson'):
        os.remove(
            os.path.dirname(os.path.abspath(__file__)) + '/data/output.bson')
def json2csv(f, cols=cols, keep=True):
    '''
    Reads a json file into a SmappCollection.
    Dumps to csv.
    Removes incompletely dumped csv's from past jobs.
    gzips csv.
    '''
    f_out = bootstrap(f)
    if not os.path.isfile(f_out):
        collection = SmappCollection('json', f).dump_to_csv(f_out, cols)
    if not keep:
        os.remove(f)
    split_csv(f_out)
def test_tweet_field_grouped_by_timeslice_years(self):
    output_path = '{}/chart_tests/Bar-{}-bar.png'.format(
        os.path.dirname(os.path.realpath(__file__)), datetime.now())
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['json']['valid'])
    collection = SmappCollection('json', file_path)

    def custom_filter(tweet):
        return True

    plots.bar_graph_tweet_field_grouped_by_period(
        collection, '', [], custom_filter, 'years', datetime(2015, 9, 1),
        datetime(2015, 11, 30), output_path, 'date', 'tweet counts',
        'filtered tweets by hour')
def check_dump_integrity(hostname, port, dbname, username, password, authdb,
                         dump_dir, check_type=COUNT_COLLECTIONS):
    # connect to the db
    mongo = pymongo.MongoClient(hostname, int(port))
    if username and password:
        mongo[authdb].authenticate(username, password)
    db = mongo[dbname]

    # Get a list of relevant collections from the database
    db_collections = db.collection_names()
    db_relevant_collections = [
        match.group(1) for coll in db_collections
        for match in [DB_RELEVANT_COLLECTIONS_REGEX.search(coll)] if match
    ]
    db_relevant_collections.sort()
    db_relevant_collections.sort(key=len)

    # Get a list of relevant collections from the dump
    dump_exists = True
    dump_path = dump_dir + dbname if dump_dir[-1:] == '/' else dump_dir + '/' + dbname
    dump_collections = []
    try:
        dump_collections = [file for file in os.listdir(dump_path)]
    except OSError as e:
        if e.errno == 2:
            dump_exists = False
        else:
            logging.error(e)
            print(e)
    dump_relevant_collections = [
        match.group(1) for coll in dump_collections
        for match in [DUMP_RELEVANT_COLLECTIONS_REGEX.search(coll)] if match
    ]
    dump_relevant_collections.sort()
    dump_relevant_collections.sort(key=len)

    # Find out format of dump (.json or .bson)
    dump_format = 'bson'
    if dump_relevant_collections[0].split('.', 1)[1] == 'json':
        dump_format = 'json'

    # CHECK NUMBER OF COLLECTIONS
    if check_type == COUNT_COLLECTIONS:
        num_collections_in_db = len(db_relevant_collections)
        num_collections_dumped = len(dump_relevant_collections)

        # Report integrity of number of collections in dump
        log_output = 'DUMP FOR {} '.format(dbname)
        if not dump_exists:
            log_output += 'DOES NOT EXIST. '
        elif num_collections_dumped < num_collections_in_db:
            log_output += 'IS MISSING COLLECTIONS. '
        elif num_collections_dumped > num_collections_in_db:
            log_output += 'HAS TOO MANY COLLECTIONS. '
        else:
            log_output += 'IS OK ON COLLECTIONS. '
        log_output += 'Number of collections in database: {}, Number of collections dumped: {}'.format(
            num_collections_in_db, num_collections_dumped)
        logging.info('\n' + log_output)
        print('\n' + log_output)

        # Report list of any collections missing from dump
        if dump_exists and (num_collections_dumped < num_collections_in_db):
            dump_relevant_collections_split = [
                dump_coll.split('.' + dump_format, 1)[0]
                for dump_coll in dump_relevant_collections
            ]
            missing_collections = [
                coll for coll in db_relevant_collections
                if coll not in dump_relevant_collections_split
            ]
            missing_collections.sort()
            missing_collections.sort(key=len)
            logging.info('\n' + 'Missing Collections: {}'.format(missing_collections))
            print('\n' + 'Missing Collections: {}'.format(missing_collections))

    # CHECK NUMBER OF DOCUMENTS
    elif check_type == COUNT_DOCUMENTS:
        logging.info('\n' + 'Counting number of documents in {} database'.format(dbname))
        print('\n' + 'Counting number of documents in {} database'.format(dbname))
        total_documents_in_db = 0
        db_collection_doc_counts = {}
        # Sum total documents in db
        for coll in db_relevant_collections:
            num_docs_in_coll = db[coll].count()
            total_documents_in_db += num_docs_in_coll
            # Save document count for db collection
            db_collection_doc_counts[coll] = num_docs_in_coll
            logging.info("Database {} {} document count: {}".format(dbname, coll, num_docs_in_coll))
            print("Database {} {} document count: {}".format(dbname, coll, num_docs_in_coll))

        logging.info('\n' + 'Counting number of documents in {} dump'.format(dbname))
        print('\n' + 'Counting number of documents in {} dump'.format(dbname))
        total_documents_dumped = 0
        dump_collection_doc_counts = {}
        # Sum up total number of documents in dump
        for coll in dump_relevant_collections:
            collection = SmappCollection(dump_format, dump_path + '/' + coll)
            num_docs_in_coll = collection.count_tweets()
            total_documents_dumped += num_docs_in_coll
            # Save document count for dump collection
            dump_collection_doc_counts[coll.split('.' + dump_format, 1)[0]] = num_docs_in_coll
            # Calculate number and percentage of missing documents for collection
            num_docs_in_db_coll = db_collection_doc_counts[coll.split('.' + dump_format, 1)[0]]
            num_docs_missing = num_docs_in_db_coll - num_docs_in_coll
            percentage_docs_missing = 0
            if num_docs_in_db_coll != 0:
                percentage_docs_missing = (num_docs_missing / num_docs_in_db_coll) * 100
            logging.info("Dump {} {} document count: {}".format(dbname, coll, num_docs_in_coll))
            print("Dump {0} {1} document count: {2} (Missing {3}, {4:.2f}%)".format(
                dbname, coll, num_docs_in_coll, num_docs_missing, percentage_docs_missing))

        # Calculate number and percentage of missing documents overall
        total_docs_missing = total_documents_in_db - total_documents_dumped
        percentage_total_docs_missing = 0
        if total_documents_in_db != 0:
            percentage_total_docs_missing = (total_docs_missing / total_documents_in_db) * 100

        # Report integrity of number of documents in dump
        log_output = 'DUMP FOR {} '.format(dbname)
        if not dump_exists:
            log_output += 'DOES NOT EXIST. '
        elif total_documents_dumped < total_documents_in_db:
            log_output += 'IS MISSING DOCUMENTS. '
        elif total_documents_dumped > total_documents_in_db:
            log_output += 'HAS TOO MANY DOCUMENTS. '
        else:
            log_output += 'IS OK ON DOCUMENTS. '
        log_output += 'Total documents in database: {0}, Total documents dumped: {1} (Missing {2}, {3:.2f}%)'.format(
            total_documents_in_db, total_documents_dumped, total_docs_missing,
            percentage_total_docs_missing)
        logging.info('\n' + log_output)
        print('\n' + log_output)

        # Report list of any collections from dump missing documents
        if dump_exists and (total_documents_dumped < total_documents_in_db):
            collections_missing_docs = [
                coll for coll, count in db_collection_doc_counts.items()
                if (coll not in dump_collection_doc_counts
                    or dump_collection_doc_counts[coll] != count)
            ]
            collections_missing_docs.sort()
            collections_missing_docs.sort(key=len)
            logging.info('\n' + 'Collections Missing Documents: {}'.format(collections_missing_docs))
            print('\n' + 'Collections Missing Documents: {}'.format(collections_missing_docs))
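# A minimal, hypothetical invocation of check_dump_integrity; the host, credentials,
# database name, and dump directory are placeholders, not values from the original.
# COUNT_COLLECTIONS and the regex constants are assumed to be defined at module level.
if __name__ == '__main__':
    check_dump_integrity('localhost', 27017, 'example_db', None, None, 'admin',
                         '/path/to/dumps', check_type=COUNT_COLLECTIONS)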
def date_filter(output, input_file, dateone, datetwo):
    # configure logging
    logger = logging.getLogger(__name__)
    logger.info('Iterating through your file : %s', output)
    _, file_extension = splitext(input_file)
    file_extension = file_extension[1:]
    # if dateone input exists make a datetime object with it
    if dateone:
        startdate = datetime.datetime.strptime(dateone, '%Y-%m-%d %H:%M:%S')
    # if datetwo input exists make a datetime object with it
    if datetwo:
        enddate = datetime.datetime.strptime(datetwo, '%Y-%m-%d %H:%M:%S')
    # user gave two dates and wants a range
    if dateone and datetwo:
        logger.info(
            'creating smapp collection and query for dates {} and {}'.format(
                startdate, enddate))
        collection = SmappCollection(file_extension, input_file)
        collection.get_date_range(startdate, enddate).dump_to_bson(output)
    # user gave date one and wants objects since then
    elif dateone:
        enddate = datetime.datetime.now()
        logger.info(
            'creating smapp collection and query for dates {} and {}'.format(
                startdate, enddate))
        collection = SmappCollection(file_extension, input_file)
        collection.get_date_range(startdate, enddate).dump_to_bson(output)
    # user gave date two and wants objects up to that point
    elif datetwo:
        startdate = datetime.datetime.min
        logger.info(
            'creating smapp collection and query for dates {} and {}'.format(
                startdate, enddate))
        collection = SmappCollection(file_extension, input_file)
        collection.get_date_range(startdate, enddate).dump_to_bson(output)
    else:
        logger.info('Couldn\'t find a date, exiting at %s!',
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M'))
    logger.info('Finished merging input file : %s', output)
    logger.info('Finished merging all input files to path : %s', output)
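# A minimal, hypothetical call to date_filter; the file names and date strings are
# placeholders, not values from the original tool:
date_filter('filtered_tweets.bson', 'tweets.json',
            '2015-11-02 00:00:00', '2015-11-03 00:00:00')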
def test_count_tweets(self):
    file_path = '{}/{}'.format(os.path.dirname(os.path.realpath(__file__)),
                               config['bson']['valid'])
    collection = SmappCollection('bson', file_path)
    count = collection.count_tweets()
    self.assertEqual(1187, count)