def make_sqlite_db_json(output, input_file, fields):
    """Load selected tweet fields from a JSON collection into a SQLite table.

    Creates (if missing) a table named ``data`` in the SQLite database at
    ``output`` with one column per entry of ``fields`` (dots in field names
    are replaced by ``__``), then inserts one row per tweet read from
    ``input_file``.

    Args:
        output: Path of the SQLite database file to create or append to.
        input_file: Path handed to ``JsonCollection`` as the tweet source.
        fields: Iterable of field-name strings to extract from each tweet;
            it is iterated more than once, so it must be re-iterable.
    """
    logger = logging.getLogger(__name__)
    logger.info('Creating your output file : %s', output)

    # Rows are buffered and written in batches to limit memory use while
    # still amortising the cost of each commit.
    batch_size = 10000

    # NOTE: column names are interpolated into the SQL text because SQLite
    # cannot bind identifiers as parameters; `fields` must be trusted input.
    column_str = ','.join(fields).replace('.', '__')
    question_marks = ','.join('?' for _ in fields)
    insert_sql = "INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks)

    con = sqlite3.connect(output)
    try:
        cur = con.cursor()
        cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))

        json_col = JsonCollection(input_file)
        tp = TweetParser()
        insert_list = []
        for tweet in json_col.get_iterator():
            ret = tp.parse_columns_from_tweet(tweet, fields)
            # Each parsed entry is indexable; position 1 holds the value.
            insert_list.append(tuple(replace_none(col_val[1]) for col_val in ret))
            if len(insert_list) >= batch_size:
                cur.executemany(insert_sql, insert_list)
                con.commit()
                insert_list = []

        # Always flush what is left: the final partial batch must be written
        # regardless of how many tweets were processed in total, and an
        # empty input must simply insert nothing.
        if insert_list:
            cur.executemany(insert_sql, insert_list)
            con.commit()
    finally:
        # Release the database handle even if parsing or inserting failed.
        con.close()

    logger.info('Finished processing input: %s, output is: %s', input_file, output)
def write_files(collection):
    """Extract voting-related tweets from JSON files into two output files.

    Every tweet whose text contains one of the voting hashtags is written to
    the hashtags file, and every tweet whose text contains one of the voting
    statements is written to the statements file (a tweet may land in both).
    Each output line is one JSON-serialised tweet. Per input file, the number
    of tweets written to each output is logged.

    Args:
        collection: Non-empty sequence of input file paths. The fifth
            '/'-separated component of the first path is used to name the
            two output files.
    """
    hashtags = ["#ivoted", "#myvote2016", "#myvote"]
    statements = ["i voted", "i will vote", "my vote", "vote for"]

    # Both output files share a suffix derived from the first input path.
    collection_name = collection[0].split('/')[4]
    hashtag_path = "/scratch/olympus/projects/hashtag_filtering/hashtags_{}.json".format(collection_name)
    statement_path = "/scratch/olympus/projects/hashtag_filtering/statements_{}.json".format(collection_name)

    # The `with` blocks close both files on exit, including on error, so no
    # explicit close() calls are needed.
    with open(hashtag_path, 'w') as hashtag_file:
        with open(statement_path, 'w') as statement_file:
            for each_file in collection:
                hashtags_counter = 0
                statements_counter = 0
                # A distinct local name keeps the `collection` argument intact.
                json_collection = JsonCollection(each_file, throw_error=False, verbose=1)
                for tweet in json_collection.get_iterator():
                    # Check for the key before indexing so records that carry
                    # no "text" field are logged instead of raising KeyError.
                    if tweet and "text" in tweet and tweet["text"]:
                        text = tweet["text"]
                        # NOTE(review): matching is case-sensitive, so e.g.
                        # "I voted" does not match "i voted" — confirm intent.
                        if any(hashtag in text for hashtag in hashtags):
                            hashtag_file.write("{}\n".format(json.dumps(tweet, default=date_handler)))
                            hashtags_counter += 1
                        if any(statement in text for statement in statements):
                            statement_file.write("{}\n".format(json.dumps(tweet, default=date_handler)))
                            statements_counter += 1
                    else:
                        logging.info("Something was wrong with a tweet")
                logging.info("Extracted {} tweets to the statement file".format(statements_counter))
                logging.info("Extracted {} tweets to the hashtags file".format(hashtags_counter))
def test_json_collection_custom_filter_filters(self):
    # A custom filter and its exact negation must split the collection into
    # two parts whose sizes sum to the unfiltered size.
    fixture_path = os.path.dirname(os.path.realpath(__file__)) + '/' + config['json']['valid']

    def keep_retweets(tweet):
        return bool('retweeted' in tweet and tweet['retweeted'])

    def keep_non_retweets(tweet):
        return not ('retweeted' in tweet and tweet['retweeted'])

    # The first collection is counted unfiltered, then re-read with the
    # retweet filter applied.
    first = JsonCollection(fixture_path)
    total = len(list(first.get_iterator()))
    retweet_total = len(list(first.set_custom_filter(keep_retweets).get_iterator()))

    # A fresh collection is used for the complementary filter.
    second = JsonCollection(fixture_path)
    non_retweet_total = len(list(second.set_custom_filter(keep_non_retweets).get_iterator()))

    self.assertEqual(retweet_total + non_retweet_total, total)
def test_json_collection_custom_filter_filters(self):
    # With throw_error=0: the retweet filter and its negation together must
    # account for every tweet in the unfiltered collection.
    here = os.path.dirname(os.path.realpath(__file__))
    valid_json = here + '/' + config['json']['valid']

    def only_retweets(tweet):
        flagged = 'retweeted' in tweet and tweet['retweeted']
        return True if flagged else False

    def only_originals(tweet):
        flagged = 'retweeted' in tweet and tweet['retweeted']
        return False if flagged else True

    # Count everything first, then reuse the same collection with the filter.
    collection_a = JsonCollection(valid_json, throw_error=0)
    everything = list(collection_a.get_iterator())
    retweets = list(collection_a.set_custom_filter(only_retweets).get_iterator())

    # The complementary filter runs against a separately built collection.
    collection_b = JsonCollection(valid_json, throw_error=0)
    originals = list(collection_b.set_custom_filter(only_originals).get_iterator())

    self.assertEqual(len(retweets) + len(originals), len(everything))
def test_iterator_returns_tweets(self):
    # Iterating the valid JSON fixture must yield at least one tweet.
    fixture_path = os.path.dirname(os.path.realpath(__file__)) + '/' + config['json']['valid']
    tweets = list(JsonCollection(fixture_path).get_iterator())
    self.assertTrue(len(tweets) > 0)
def test_iterator_returns_tweets(self):
    # With throw_error=0, the valid JSON fixture must still yield tweets.
    test_dir = os.path.dirname(os.path.realpath(__file__))
    source = JsonCollection(test_dir + '/' + config['json']['valid'], throw_error=0)
    tweet_count = len(list(source.get_iterator()))
    self.assertTrue(tweet_count > 0)