def make_sqlite_db_json(output, input_file, fields):
    logger = logging.getLogger(__name__)
    logger.info('Creating your output file : %s', output)
    # dotted field paths become double-underscore column names
    column_str = ','.join(fields).replace('.', '__')
    question_marks = ','.join('?' for _ in fields)
    con = sqlite3.connect(output)
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))
    json_col = JsonCollection(input_file)
    insert_list = []
    tp = TweetParser()
    # batch insert every 10k tweets to keep memory use bounded
    for count, tweet in enumerate(json_col.get_iterator()):
        ret = tp.parse_columns_from_tweet(tweet, fields)
        row = [replace_none(col_val[1]) for col_val in ret]
        insert_list.append(tuple(row))
        if (count % 10000) == 0:
            cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
            con.commit()
            insert_list = []
    # flush whatever is left over after the last full batch
    if insert_list:
        cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
        con.commit()
    con.close()
    logger.info('Finished processing input: {}, output is: {}'.format(input_file, output))
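# Illustrative usage sketch for make_sqlite_db_json. The file names are
# hypothetical, sqlite3/logging are assumed to be imported at module level, and
# 'id_str' / 'user.screen_name' stand in for real tweet field paths.
make_sqlite_db_json('tweets.db', 'tweets.json', ['id_str', 'user.screen_name'])
con = sqlite3.connect('tweets.db')
# dotted field paths are stored under double-underscore column names
for row in con.execute('SELECT id_str, user__screen_name FROM data LIMIT 5'):
    print(row)
con.close()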
def get_iterator(self):
    tweet_parser = TweetParser()
    if self.compression == 'bz2':
        self.mode = binary_mode(self.mode)
        csv_handle = bz2.open(self.filepath, self.mode, encoding=self.encoding)
    elif self.compression == 'gzip':
        self.mode = binary_mode(self.mode)
        csv_handle = gzip.open(self.filepath, self.mode, encoding=self.encoding)
    else:
        csv_handle = open(self.filepath, self.mode, encoding=self.encoding)
    for count, tweet in enumerate(csv.DictReader(csv_handle)):
        # stop once the configured limit is reached (a limit of 0 means no limit)
        if self.limit < count + 1 and self.limit != 0:
            csv_handle.close()
            return
        elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
        and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
            if self.should_strip:
                yield tweet_parser.strip_tweet(self.keep_fields, tweet)
            else:
                yield dict(tweet)
    csv_handle.close()
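# Illustrative sketch: iterating a CSV of tweets row by row. The constructor
# signature (a single file path) mirrors the CSV test below; the file name is
# hypothetical and set_limit is assumed to be the shared limit setter that
# populates self.limit.
collection = CsvCollection('tweets.csv')
for tweet in collection.set_limit(3).get_iterator():
    # each row comes back as a plain dict keyed by the CSV header
    print(tweet['id_str'], tweet['text'])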
def dump_to_sqlite_db(self, output_db, input_fields, top_level=False):
    def replace_none(s):
        if s is None:
            return 'NULL'
        return s

    tweet_parser = TweetParser()
    column_str = ','.join(input_fields).replace('.', '__')
    question_marks = ','.join('?' for _ in input_fields)
    con = sqlite3.connect(output_db)
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))

    insert_list = []
    # batch insert every 10k tweets to keep memory use bounded
    for count, tweet in enumerate(self.get_iterator()):
        if top_level:
            ret = list(zip(input_fields, [tweet.get(field) for field in input_fields]))
        else:
            ret = tweet_parser.parse_columns_from_tweet(tweet, input_fields)
        row = [replace_none(col_val[1]) for col_val in ret]
        insert_list.append(tuple(row))
        if (count % 10000) == 0:
            cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
            con.commit()
            insert_list = []
    # flush whatever is left over after the last full batch
    if insert_list:
        cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
        con.commit()
    con.close()
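# Illustrative sketch of dump_to_sqlite_db (hypothetical file names). With
# top_level=False, dotted paths are resolved inside the nested tweet object;
# with top_level=True, fields are looked up directly on the top-level dict,
# which suits flat rows such as those coming from a CsvCollection.
json_collection = JsonCollection('tweets.json')
json_collection.dump_to_sqlite_db('nested.db', ['id_str', 'user.screen_name'])

csv_collection = CsvCollection('tweets.csv')
csv_collection.dump_to_sqlite_db('flat.db', ['id_str', 'text'], top_level=True)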
def test_strip_tweets_keeps_fields(self):
    tweet_parser = TweetParser()
    collection = BsonCollection(os.path.dirname(os.path.realpath(__file__)) + '/' + config['bson']['valid'])
    self.maxDiff = None
    it = collection.strip_tweets(['id', 'entities.user_mentions', 'user.profile_image_url_https']).get_iterator()

    def tweets_have_right_keys(iterator, fields):
        for tweet in iterator:
            keys = [key for key, value in tweet_parser.flatten_dict(tweet)]
            for elem in fields:
                if elem not in keys:
                    return False
        return True

    self.assertTrue(tweets_have_right_keys(it, [['id'], ['entities', 'user_mentions'], ['user', 'profile_image_url_https']]))
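# A small illustration of why the expected fields in this test are lists of
# path segments: judging from the assertions, TweetParser.flatten_dict appears
# to yield (key_path, value) pairs where key_path is the list of nested keys.
# The tweet below is a made-up fragment, not real data.
tp = TweetParser()
sample = {'id': 1, 'user': {'profile_image_url_https': 'https://example.com/a.png'}}
for key_path, value in tp.flatten_dict(sample):
    print(key_path, value)   # e.g. ['id'] 1  and  ['user', 'profile_image_url_https'] ...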
def get_iterator(self):
    tweet_parser = TweetParser()
    bson_handle = open(self.filepath, 'rb')
    for count, tweet in enumerate(bson.decode_file_iter(bson_handle)):
        # stop once the configured limit is reached (a limit of 0 means no limit)
        if self.limit < count + 1 and self.limit != 0:
            bson_handle.close()
            return
        elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
        and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
            if self.should_strip:
                yield tweet_parser.strip_tweet(self.keep_fields, tweet)
            else:
                yield tweet
    bson_handle.close()
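# Illustrative sketch: the BSON iterator above honours self.custom_filters, so
# a caller can restrict the stream with a predicate. The setter name
# set_custom_filter is an assumption here (only the self.custom_filters
# attribute is visible in this code), and the .bson path is hypothetical.
collection = BsonCollection('tweets.bson')
collection.set_custom_filter(lambda tweet: tweet.get('lang') == 'en')
for tweet in collection.get_iterator():
    print(tweet['id'])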
def get_iterator(self):
    tweet_parser = TweetParser()
    mongo_cursor = self.mongo_collection.find(
        filter=self.filter,
        no_cursor_timeout=False,
        limit=self.limit
    )
    for tweet in mongo_cursor:
        if tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
            if self.should_strip:
                yield tweet_parser.strip_tweet(self.keep_fields, tweet)
            else:
                yield tweet
    mongo_cursor.close()
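# Illustrative sketch: the Mongo-backed iterator pushes the filter and limit
# down to the server via find(), while custom filters run client side. The
# connection parameters are hypothetical placeholders; the constructor and
# set_limit usage mirror the Mongo test below.
collection = MongoCollection('localhost', 27017, 'user', 'password', 'tweet_db', 'tweets')
for tweet in collection.set_limit(10).get_iterator():
    print(tweet['id'])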
def test_strip_tweets_keeps_fields(self):
    tweet_parser = TweetParser()
    collection = CsvCollection(os.path.dirname(os.path.realpath(__file__)) + '/' + config['csv']['valid'])
    self.maxDiff = None
    it = collection.strip_tweets(['source', 'text', 'id_str']).get_iterator()

    def tweets_have_right_keys(iterator, fields):
        for tweet in iterator:
            keys = [key for key, value in tweet_parser.flatten_dict(tweet)]
            for elem in fields:
                if elem not in keys:
                    return False
        return True

    self.assertTrue(tweets_have_right_keys(it, [['source'], ['text'], ['id_str']]))
def dump_to_csv(self, output_csv, input_fields, write_header=True, top_level=False,
                mode='a', encoding='utf-8', compression=None):
    if compression == 'bz2':
        mode = binary_mode(mode)
        filehandle = bz2.open(output_csv, mode)
    elif compression == 'gzip':
        mode = binary_mode(mode)
        filehandle = gzip.open(output_csv, mode)
    else:
        filehandle = open(output_csv, mode)
    writer = csv.writer(filehandle)
    if write_header:
        writer.writerow(input_fields)
    tweet_parser = TweetParser()
    for tweet in self.get_iterator():
        if top_level:
            ret = list(zip(input_fields, [tweet.get(field) for field in input_fields]))
        else:
            ret = tweet_parser.parse_columns_from_tweet(tweet, input_fields)
        ret_values = [col_val[1] for col_val in ret]
        writer.writerow(ret_values)
    filehandle.close()
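# Illustrative sketch of dump_to_csv (hypothetical file names). Each tweet is
# written as one CSV row with the requested dotted field paths as columns;
# passing compression='bz2' or 'gzip' selects bz2.open or gzip.open for the
# output handle instead of the built-in open.
collection = JsonCollection('tweets.json')
collection.dump_to_csv('tweets.csv', ['id_str', 'user.screen_name', 'text'], mode='w')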
def get_iterator(self):
    tweet_parser = TweetParser()
    if self.compression == 'bz2':
        self.mode = binary_mode(self.mode)
        json_handle = bz2.open(self.filepath, self.mode, encoding=self.encoding)
    elif self.compression == 'gzip':
        self.mode = binary_mode(self.mode)
        json_handle = gzip.open(self.filepath, self.mode, encoding=self.encoding)
    else:
        json_handle = open(self.filepath, self.mode, encoding=self.encoding)
    bad_lines = 0
    for count, tweet in enumerate(json_handle):
        if not self.throw_error:
            try:
                tweet = json_util.loads(tweet)
            except:
                # count the corrupt line and skip it rather than passing the
                # raw string on to the filters below
                bad_lines += 1
                continue
        else:
            tweet = json_util.loads(tweet)
        # stop once the configured limit is reached (a limit of 0 means no limit)
        if self.limit != 0 and self.limit <= count:
            json_handle.close()
            return
        elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
        and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
            if self.should_strip:
                yield tweet_parser.strip_tweet(self.keep_fields, tweet)
            else:
                yield tweet
    if self.verbose:
        print("{} rows are ok.".format(count - bad_lines))
        print("{} rows are corrupt.".format(bad_lines))
    json_handle.close()
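# Illustrative sketch: reading a line-delimited JSON file of tweets
# (hypothetical path). The iterator above honours throw_error (raise on a
# corrupt line versus count and skip it) and verbose (print how many lines
# parsed cleanly); how those attributes are set on the collection is assumed
# rather than shown in this excerpt.
collection = JsonCollection('tweets.json')
for tweet in collection.get_iterator():
    print(tweet['id_str'])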
def test_strip_tweets_keeps_fields(self):
    tweet_parser = TweetParser()
    collection = MongoCollection(
        config['mongo']['host'],
        config['mongo']['port'],
        config['mongo']['user'],
        config['mongo']['password'],
        config['mongo']['database'],
        config['mongo']['collection'])
    self.maxDiff = None
    it = collection.set_limit(10).strip_tweets(
        ['id', 'entities.user_mentions', 'user.profile_image_url_https']).get_iterator()

    def tweets_have_right_keys(iterator, fields):
        for tweet in iterator:
            keys = [key for key, value in tweet_parser.flatten_dict(tweet)]
            for elem in fields:
                if elem not in keys:
                    return False
        return True

    self.assertTrue(tweets_have_right_keys(it, [['id'], ['entities', 'user_mentions'], ['user', 'profile_image_url_https']]))