def make_sqlite_db_json(output, input_file, fields):
    """Build a SQLite database at *output* from the tweets in *input_file*.

    Creates (if needed) a single table ``data`` whose columns are *fields*
    with dots replaced by double underscores, then bulk-inserts one row per
    tweet, committing in batches of 10000.

    Args:
        output: path of the SQLite database file to create/append to.
        input_file: path of the JSON tweet file read via JsonCollection.
        fields: iterable of dotted tweet field paths to extract per tweet.
    """
    logger = logging.getLogger(__name__)
    logger.info('Creating your output file : %s', output)
    # dots are not legal in SQLite column names; map a.b -> a__b
    column_str = ','.join(fields).replace('.', '__')
    question_marks = ','.join('?' for _ in fields)
    insert_sql = "INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks)
    con = sqlite3.connect(output)
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))
    json_col = JsonCollection(input_file)
    tp = TweetParser()
    insert_list = []
    for tweet in json_col.get_iterator():
        ret = tp.parse_columns_from_tweet(tweet, fields)
        insert_list.append(tuple(replace_none(col_val[1]) for col_val in ret))
        # commit in batches to bound memory; original flushed on
        # (count % 10000) == 0, which also fired spuriously at count == 0
        if len(insert_list) >= 10000:
            cur.executemany(insert_sql, insert_list)
            con.commit()
            insert_list = []
    # flush the remaining partial batch. The original guarded this with
    # `if count < 10000:`, silently dropping the tail rows of any input
    # larger than 10000 tweets, and raised NameError on an empty input
    # (`count` never bound).
    if insert_list:
        cur.executemany(insert_sql, insert_list)
        con.commit()
    con.close()
    logger.info('Finished processing input: {}, output is: {}'.format(input_file, output))
def write_files(collection):
    """Scan the tweet files in *collection* and split matches into two outputs.

    Tweets whose text contains one of the voting hashtags go to a
    ``hashtags_<name>.json`` file; tweets containing one of the voting
    statements go to a ``statements_<name>.json`` file. A tweet can match
    both and be written to both files. Per-file match counts are logged.

    Args:
        collection: list of tweet-file paths; the output file name component
            is taken from the 5th path segment of the first entry
            (assumes paths like /scratch/olympus/projects/<name>/... —
            TODO confirm against callers).
    """
    hashtags = ["#ivoted", "#myvote2016", "#myvote"]
    statements = ["i voted", "i will vote", "my vote", "vote for"]
    # hoisted: the original recomputed this split twice inside the open() calls
    name = collection[0].split('/')[4]
    # `with` closes both files on exit; the original also rebound the
    # parameter name `collection` inside the loop iterating over it and
    # called .close() redundantly at the end.
    with open("/scratch/olympus/projects/hashtag_filtering/hashtags_{}.json".format(name), 'w') as hashtag_file, \
         open("/scratch/olympus/projects/hashtag_filtering/statements_{}.json".format(name), 'w') as statement_file:
        for each_file in collection:
            hashtags_counter = 0
            statements_counter = 0
            json_col = JsonCollection(each_file, throw_error=False, verbose=1)
            for tweet in json_col.get_iterator():
                if tweet and tweet["text"]:
                    if any(hashtag in tweet["text"] for hashtag in hashtags):
                        hashtag_file.write("{}\n".format(json.dumps(tweet, default=date_handler)))
                        hashtags_counter += 1
                    if any(statement in tweet["text"] for statement in statements):
                        statement_file.write("{}\n".format(json.dumps(tweet, default=date_handler)))
                        statements_counter += 1
                else:
                    logging.info("Something was wrong with a tweet")
            logging.info("Extracted {} tweets to the statement file".format(statements_counter))
            logging.info("Extracted {} tweets to the hashtags file".format(hashtags_counter))
def test_strip_tweets_keeps_fields(self):
    """Stripping to a field list must keep every requested field in each tweet."""
    parser = TweetParser()
    path = os.path.dirname(os.path.realpath(__file__)) + '/' + config['json']['valid']
    collection = JsonCollection(path)
    self.maxDiff = None
    stripped = collection.strip_tweets(
        ['id', 'entities.user_mentions', 'user.profile_image_url_https']).get_iterator()

    def every_tweet_has_fields(iterator, fields):
        # a tweet missing any requested field fails the whole check
        for tweet in iterator:
            flat_keys = [key for key, _ in parser.flatten_dict(tweet)]
            if any(field not in flat_keys for field in fields):
                return False
        return True

    expected_fields = [['id'],
                       ['entities', 'user_mentions'],
                       ['user', 'profile_image_url_https']]
    self.assertTrue(every_tweet_has_fields(stripped, expected_fields))
def test_strip_tweets_keeps_fields(self):
    """Stripping to a field list must keep every requested field in each tweet.

    Uses throw_error=False (the original passed the int 0 where a boolean
    flag is meant) so malformed lines are skipped instead of raising.
    """
    tweet_parser = TweetParser()
    collection = JsonCollection(
        os.path.dirname(os.path.realpath(__file__)) + '/' + config['json']['valid'],
        throw_error=False)
    self.maxDiff = None
    it = collection.strip_tweets(
        ['id', 'entities.user_mentions', 'user.profile_image_url_https']).get_iterator()

    def tweets_have_right_keys(iterator, fields):
        # every tweet's flattened key paths must include every requested field
        for tweet in iterator:
            keys = [key for key, value in tweet_parser.flatten_dict(tweet)]
            for elem in fields:
                if elem not in keys:
                    return False
        return True

    self.assertTrue(tweets_have_right_keys(
        it,
        [['id'], ['entities', 'user_mentions'], ['user', 'profile_image_url_https']]))
def test_clean_tweets_on_clean_data(self):
    """clean_tweets on already-clean input must yield only valid JSON lines.

    Checks that every line of both the output file and the error file parses
    as JSON. Fixes the original's bare `except:` clauses (which would also
    swallow KeyboardInterrupt/SystemExit) by narrowing to ValueError, the
    base of json.JSONDecodeError; also drops the unused JsonCollection local.
    """
    # NOTE(review): unittest already calls setUp/tearDown around each test;
    # the explicit calls are kept to preserve the original's behavior.
    self.setUp()
    base = os.path.dirname(os.path.abspath(__file__))
    clean_tweets(
        os.path.dirname(os.path.realpath(__file__)) + '/' + config['json']['valid'],
        base + '/../test/output.json',
        base + '/../test/output_err.json')
    # every line of the cleaned output must be valid JSON
    with open(base + '/../test/output.json', 'r') as f:
        for line in f:
            try:
                json.loads(line)
            except ValueError:
                self.assertTrue(False)
    # on clean input the error file must not contain unparseable lines either
    excepted = False
    with open(base + '/../test/output_err.json', 'r') as f:
        for line in f:
            try:
                json.loads(line)
            except ValueError:
                excepted = True
    self.assertFalse(excepted)
    self.tearDown()
def test_clean_multiple_files(self):
    """clean_tweets_multiple over dirty files must produce valid output JSON.

    Every line of the output file must parse as JSON, and (since the input
    glob includes dirty files) the error file must contain at least one
    unparseable line. Fixes the original's bare `except:` clauses (which
    would also swallow KeyboardInterrupt/SystemExit) by narrowing to
    ValueError, the base of json.JSONDecodeError; drops the unused
    JsonCollection local.
    """
    # NOTE(review): unittest already calls setUp/tearDown around each test;
    # the explicit calls are kept to preserve the original's behavior.
    self.setUp()
    base = os.path.dirname(os.path.abspath(__file__))
    clean_tweets_multiple(
        os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/dirty*',
        base + '/../test/output.json',
        base + '/../test/output_err.json')
    # every line of the cleaned output must be valid JSON
    with open(base + '/../test/output.json', 'r') as f:
        for line in f:
            try:
                json.loads(line)
            except ValueError:
                self.assertTrue(False)
    # dirty input: the error file must contain at least one bad line
    excepted = False
    with open(base + '/../test/output_err.json', 'r') as f:
        for line in f:
            try:
                json.loads(line)
            except ValueError:
                excepted = True
    self.assertTrue(excepted)
    self.tearDown()
def test_json_collection_custom_filter_filters(self):
    """A predicate filter and its complement must partition the collection."""
    path = os.path.dirname(os.path.realpath(__file__)) + '/' + config['json']['valid']

    first_collection = JsonCollection(path)
    total = len(list(first_collection.get_iterator()))

    def is_tweet_a_retweet(tweet):
        return bool('retweeted' in tweet and tweet['retweeted'])

    retweets = len(list(
        first_collection.set_custom_filter(is_tweet_a_retweet).get_iterator()))

    second_collection = JsonCollection(path)

    def is_not_a_retweet(tweet):
        return not ('retweeted' in tweet and tweet['retweeted'])

    non_retweets = len(list(
        second_collection.set_custom_filter(is_not_a_retweet).get_iterator()))

    # the numbers of retweets and non-retweets should add up to the whole collection
    self.assertEqual(retweets + non_retweets, total)
def test_dump_to_csv_orders_and_encodes_properly(self):
    """dump_to_csv must emit the requested columns in order, properly encoded.

    Fixes the original's vacuous checks: the `everything_in_order` flag was
    reset to True on every column and never asserted when False, and the
    hashtag conditions (`if not all(...) and ...`) could never be True, so
    the hashtag columns were effectively untested. Each column is now
    asserted directly. Also removes the redundant filehandle.close() inside
    the `with` block.
    """
    out_csv = os.path.dirname(os.path.abspath(__file__)) + '/data/output.csv'
    if os.path.exists(out_csv):
        os.remove(out_csv)
    output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.csv'
    collection = JsonCollection(
        os.path.dirname(os.path.realpath(__file__)) + '/' + config['json']['valid-single'])
    collection.dump_to_csv(output_path, ['id_str', 'entities.hashtags.0',
                                         'entities.hashtags.1', 'source',
                                         'user.id', 'timestamp.$date', 'text'])
    with open(out_csv, 'rb') as filehandle:
        for count, line in enumerate(unicodecsv.reader(filehandle)):
            if count == 0:
                continue  # skip the header row
            self.assertEqual(line[0], '661275583813431296')
            # hashtag columns are serialized dicts; check text and indices
            first_tag = json_util.loads(line[1])
            self.assertEqual(first_tag['text'], 'jadehelm')
            self.assertEqual(first_tag['indices'], [74, 83])
            second_tag = json_util.loads(line[2])
            self.assertEqual(second_tag['text'], 'newworldorder')
            self.assertEqual(second_tag['indices'], [84, 98])
            self.assertEqual(line[3], '<a href="https://twitter.com/Col_Connaughton" rel="nofollow">Colin\'s Autotweeterpro5.3</a>')
            self.assertEqual(line[4], '379851447')
            self.assertEqual(line[5], '1446495359000')
            self.assertEqual(line[6], 'Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat')
    if os.path.exists(out_csv):
        os.remove(out_csv)
def test_json_collection_custom_filter_filters(self):
    """A predicate filter and its complement must partition the collection.

    Uses throw_error=False (the original passed the int 0 where a boolean
    flag is meant) so malformed lines are skipped instead of raising.
    """
    path = os.path.dirname(os.path.realpath(__file__)) + '/' + config['json']['valid']
    collectionone = JsonCollection(path, throw_error=False)
    full_collection_len = len(list(collectionone.get_iterator()))

    def is_tweet_a_retweet(tweet):
        if 'retweeted' in tweet and tweet['retweeted']:
            return True
        else:
            return False

    num_retweets = len(list(
        collectionone.set_custom_filter(is_tweet_a_retweet).get_iterator()))

    collectiontwo = JsonCollection(path, throw_error=False)

    def is_not_a_retweet(tweet):
        if 'retweeted' in tweet and tweet['retweeted']:
            return False
        else:
            return True

    num_non_retweets = len(list(
        collectiontwo.set_custom_filter(is_not_a_retweet).get_iterator()))

    # the numbers of retweets and non-retweets should add up to the whole collection
    self.assertEqual(num_retweets + num_non_retweets, full_collection_len)
def test_iterator_returns_tweets(self):
    """Iterating a valid collection must yield at least one tweet."""
    path = os.path.dirname(os.path.realpath(__file__)) + '/' + config['json']['valid']
    tweets = list(JsonCollection(path).get_iterator())
    self.assertTrue(len(tweets) > 0)
def test_dump_to_csv_orders_and_encodes_properly(self):
    """dump_to_csv must emit the requested columns in order, properly encoded.

    Fixes the original's vacuous checks: `everything_in_order` was set to
    False for wrong hashtag values but never asserted, so those columns were
    effectively untested (the final `if everything_in_order:
    self.assertTrue(True)` could never fail). Each column is now asserted
    directly against its expected value.
    """
    out_csv = os.path.dirname(os.path.abspath(__file__)) + '/data/output.csv'
    if os.path.exists(out_csv):
        os.remove(out_csv)
    output_path = os.path.dirname(os.path.realpath(__file__)) + '/' + 'data/output.csv'
    collection = JsonCollection(
        os.path.dirname(os.path.realpath(__file__)) + '/' + config['json']['valid-single'])
    collection.dump_to_csv(output_path, [
        'id_str', 'entities.hashtags.0.text', 'entities.hashtags.1.text',
        'source', 'user.id', 'timestamp.$date', 'text'
    ])
    expected = [
        '661275583813431296',
        'jadehelm',
        'newworldorder',
        '<a href="https://twitter.com/Col_Connaughton" rel="nofollow">Colin\'s Autotweeterpro5.3</a>',
        '379851447',
        '2015-11-02 20:15:59+00:00',
        'Susan Lindauer, Rtd US Army LTC Potter: Jade Helm https://t.co/VA4bQRudLt #jadehelm #newworldorder #usa #tyranny #threat',
    ]
    with open(out_csv, 'r') as filehandle:
        for count, line in enumerate(csv.reader(filehandle)):
            if count == 0:
                continue  # skip the header row
            self.assertEqual(line[:7], expected)
    if os.path.exists(out_csv):
        os.remove(out_csv)
def test_iterator_returns_tweets(self):
    """Iterating a valid collection must yield at least one tweet.

    Uses throw_error=False (the original passed the int 0 where a boolean
    flag is meant) so malformed lines are skipped instead of raising.
    """
    collection = JsonCollection(
        os.path.dirname(os.path.realpath(__file__)) + '/' + config['json']['valid'],
        throw_error=False)
    self.assertTrue(len(list(collection.get_iterator())) > 0)