def make_sqlite_db_json(output, input_file, fields):
    logger = logging.getLogger(__name__)
    logger.info('Creating your output file : %s', output)

    def replace_none(s):
        # sqlite needs an explicit value for missing fields
        if s is None:
            return 'NULL'
        return s

    # dotted field paths become double-underscore column names
    column_str = ','.join(fields).replace('.', '__')
    question_marks = ','.join('?' for _ in fields)
    con = sqlite3.connect(output)
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))

    json_col = JsonCollection(input_file)
    tp = TweetParser()
    insert_list = []
    # batch the inserts every 10k tweets so memory use stays bounded
    for count, tweet in enumerate(json_col.get_iterator()):
        ret = tp.parse_columns_from_tweet(tweet, fields)
        row = [replace_none(col_val[1]) for col_val in ret]
        insert_list.append(tuple(row))
        if (count % 10000) == 0:
            cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
            con.commit()
            insert_list = []
    # flush the final partial batch
    if insert_list:
        cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
        con.commit()
    con.close()
    logger.info('Finished processing input: {}, output is: {}'.format(input_file, output))
def dump_to_sqlite_db(self, output_db, input_fields, top_level=False):
    def replace_none(s):
        # sqlite needs an explicit value for missing fields
        if s is None:
            return 'NULL'
        return s

    tweet_parser = TweetParser()
    # dotted field paths become double-underscore column names
    column_str = ','.join(input_fields).replace('.', '__')
    question_marks = ','.join('?' for _ in input_fields)
    con = sqlite3.connect(output_db)
    cur = con.cursor()
    cur.execute("CREATE TABLE IF NOT EXISTS data ({});".format(column_str))

    insert_list = []
    # batch insert every 10k tweets so memory use stays bounded
    for count, tweet in enumerate(self.get_iterator()):
        if top_level:
            ret = list(zip(input_fields, [tweet.get(field) for field in input_fields]))
        else:
            ret = tweet_parser.parse_columns_from_tweet(tweet, input_fields)
        row = [replace_none(col_val[1]) for col_val in ret]
        insert_list.append(tuple(row))
        if (count % 10000) == 0:
            cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
            con.commit()
            insert_list = []
    # flush the final partial batch
    if insert_list:
        cur.executemany("INSERT INTO data ({}) VALUES ({});".format(column_str, question_marks), insert_list)
        con.commit()
    con.close()
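# Hedged usage sketch (not from this codebase): shows how a collection
# might be dumped to SQLite via dump_to_sqlite_db above. The paths
# 'tweets.json' and 'tweets.db' and the field names are illustrative
# assumptions; JsonCollection and the dotted field paths follow the
# conventions already used in make_sqlite_db_json.
def example_dump_to_sqlite_db():
    collection = JsonCollection('tweets.json')
    # a dotted path such as 'user.screen_name' becomes the SQLite
    # column 'user__screen_name'
    collection.dump_to_sqlite_db('tweets.db', ['id_str', 'user.screen_name'])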
def get_iterator(self):
    tweet_parser = TweetParser()
    if self.compression == 'bz2':
        self.mode = binary_mode(self.mode)
        csv_handle = bz2.open(self.filepath, self.mode, encoding=self.encoding)
    elif self.compression == 'gzip':
        self.mode = binary_mode(self.mode)
        csv_handle = gzip.open(self.filepath, self.mode, encoding=self.encoding)
    else:
        csv_handle = open(self.filepath, self.mode, encoding=self.encoding)

    for count, tweet in enumerate(csv.DictReader(csv_handle)):
        # a limit of 0 means no limit
        if self.limit < count + 1 and self.limit != 0:
            csv_handle.close()
            return
        elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
        and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
            if self.should_strip:
                yield tweet_parser.strip_tweet(self.keep_fields, tweet)
            else:
                yield dict(tweet)
    csv_handle.close()
def dump_to_csv(self, output_csv, input_fields):
    count = 0
    tweet_parser = TweetParser()
    filehandle = open(output_csv, 'wb')
    writer = unicodecsv.writer(filehandle)

    # split dotted field paths; a path ending in a digit indexes into a list
    expanded_fields = []
    expanded_fields_list_keys = []
    for field_path in input_fields:
        fields = field_path.split('.')
        if fields[-1].isdigit():
            expanded_fields_list_keys.append((fields[:-1], fields[-1]))
            if fields[:-1] not in expanded_fields:
                expanded_fields.append(fields[:-1])
        else:
            expanded_fields.append(fields)

    for tweet in self.get_iterator():
        # use json.loads and not json_util to get a regular dict
        tweet = json.loads(json_util.dumps(tweet))
        row_to_write = []
        # flatten each tweet, and put the resulting tuples in a list
        flat_tweet_list = list(tweet_parser.flatten_dict(tweet))
        # write a header if it's the first tweet
        if count == 0:
            writer.writerow(input_fields)
        count += 1
        # if a flattened key path is one the user asked for,
        # add its value to the row to write
        for expanded_field in expanded_fields:
            for tweet_tuple in flat_tweet_list:
                if tweet_tuple[0] == expanded_field:
                    if isinstance(tweet_tuple[1], list):
                        # for each possible array index
                        for list_key in expanded_fields_list_keys:
                            if list_key[0] == tweet_tuple[0] and int(list_key[1]) < len(tweet_tuple[1]):
                                row_to_write.append(json_util.dumps(tweet_tuple[1][int(list_key[1])]))
                            else:
                                row_to_write.append('None')
                    else:
                        # convert each string value to unicode
                        if isinstance(tweet_tuple[1], str):
                            row_to_write.append(tweet_tuple[1].encode('utf-8').decode('utf-8'))
                        else:
                            row_to_write.append(tweet_tuple[1])
        writer.writerow(row_to_write)
    filehandle.close()
def get_iterator(self):
    tweet_parser = TweetParser()
    mongo_cursor = self.mongo_collection.find(
        filter=self.filter,
        no_cursor_timeout=False,
        limit=self.limit
    )
    for tweet in mongo_cursor:
        if tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
            if self.should_strip:
                yield tweet_parser.strip_tweet(self.keep_fields, tweet)
            else:
                yield tweet
    mongo_cursor.close()
def get_iterator(self):
    tweet_parser = TweetParser()
    bson_handle = open(self.filepath, 'rb')
    for count, tweet in enumerate(bson.decode_file_iter(bson_handle)):
        # a limit of 0 means no limit
        if self.limit < count + 1 and self.limit != 0:
            bson_handle.close()
            return
        elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
        and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
            if self.should_strip:
                yield tweet_parser.strip_tweet(self.keep_fields, tweet)
            else:
                yield tweet
    bson_handle.close()
def get_iterator(self):
    tweet_parser = TweetParser()
    json_handle = open(self.filepath, 'r')
    for count, tweet in enumerate(json_handle):
        tweet = json_util.loads(tweet)
        # a limit of 0 means no limit
        if self.limit != 0 and self.limit <= count:
            json_handle.close()
            return
        elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
        and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
            if self.should_strip:
                yield tweet_parser.strip_tweet(self.keep_fields, tweet)
            else:
                yield tweet
    json_handle.close()
def get_iterator(self):
    tweet_parser = TweetParser()
    csv_handle = open(self.filepath, 'rb')
    for count, tweet in enumerate(unicodecsv.DictReader(csv_handle)):
        # a limit of 0 means no limit
        if self.limit < count + 1 and self.limit != 0:
            csv_handle.close()
            return
        elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
        and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
            if self.should_strip:
                yield tweet_parser.strip_tweet(self.keep_fields, tweet)
            else:
                yield tweet
    csv_handle.close()
def test_strip_tweets_keeps_fields(self):
    tweet_parser = TweetParser()
    collection = BsonCollection(os.path.dirname(os.path.realpath(__file__)) + '/' + config['bson']['valid'])
    self.maxDiff = None
    it = collection.strip_tweets(['id', 'entities.user_mentions', 'user.profile_image_url_https']).get_iterator()

    def tweets_have_right_keys(iterator, fields):
        for tweet in iterator:
            keys = [key for key, value in tweet_parser.flatten_dict(tweet)]
            for elem in fields:
                if elem not in keys:
                    return False
        return True

    self.assertTrue(tweets_have_right_keys(it, [['id'], ['entities', 'user_mentions'], ['user', 'profile_image_url_https']]))
def dump_to_csv(self, output_csv, input_fields, write_header=True, top_level=False,
                mode='a', encoding='utf-8', compression=None):
    if compression == 'bz2':
        mode = binary_mode(mode)
        filehandle = bz2.open(output_csv, mode)
    elif compression == 'gzip':
        mode = binary_mode(mode)
        filehandle = gzip.open(output_csv, mode)
    else:
        filehandle = open(output_csv, mode, encoding=encoding)

    writer = csv.writer(filehandle)
    if write_header:
        writer.writerow(input_fields)

    tweet_parser = TweetParser()
    for tweet in self.get_iterator():
        if top_level:
            ret = list(zip(input_fields, [tweet.get(field) for field in input_fields]))
        else:
            ret = tweet_parser.parse_columns_from_tweet(tweet, input_fields)
        ret_values = [col_val[1] for col_val in ret]
        writer.writerow(ret_values)
    filehandle.close()
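# Hedged usage sketch for dump_to_csv above (the file path and field
# names are illustrative assumptions; JsonCollection is the collection
# class seen elsewhere in this code). Uses the uncompressed branch, so
# the encoding argument applies to the output file.
def example_dump_to_csv():
    collection = JsonCollection('tweets.json')
    # 'w' truncates any existing file; write_header adds the field
    # names as the first row
    collection.dump_to_csv('tweets.csv', ['id_str', 'user.screen_name'],
                           write_header=True, mode='w')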
def get_iterator(self):
    tweet_parser = TweetParser()
    if self.compression == 'bz2':
        self.mode = binary_mode(self.mode)
        json_handle = bz2.open(self.filepath, self.mode, encoding=self.encoding)
    elif self.compression == 'gzip':
        self.mode = binary_mode(self.mode)
        json_handle = gzip.open(self.filepath, self.mode, encoding=self.encoding)
    else:
        json_handle = open(self.filepath, self.mode, encoding=self.encoding)

    bad_lines = 0
    for count, tweet in enumerate(json_handle):
        if not self.throw_error:
            try:
                tweet = json_util.loads(tweet)
            except ValueError:
                # skip corrupt lines instead of passing the raw string downstream
                bad_lines += 1
                continue
        else:
            tweet = json_util.loads(tweet)
        # a limit of 0 means no limit
        if self.limit != 0 and self.limit <= count:
            json_handle.close()
            return
        elif tweet_parser.tweet_passes_filter(self.filter, tweet) \
        and tweet_parser.tweet_passes_custom_filter_list(self.custom_filters, tweet):
            if self.should_strip:
                yield tweet_parser.strip_tweet(self.keep_fields, tweet)
            else:
                yield tweet
    if self.verbose:
        # count is the zero-based index of the last line read
        print("{} rows are ok.".format(count + 1 - bad_lines))
        print("{} rows are corrupt.".format(bad_lines))
    json_handle.close()
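# Hedged usage sketch for get_iterator above: the set_limit chaining is
# borrowed from the Mongo test below, and the file path and printed
# field are illustrative assumptions.
def example_get_iterator():
    collection = JsonCollection('tweets.json')
    # iterate over at most 10 tweets that pass the configured filters
    for tweet in collection.set_limit(10).get_iterator():
        print(tweet.get('text'))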
def test_strip_tweets_keeps_fields(self):
    tweet_parser = TweetParser()
    collection = CsvCollection(os.path.dirname(os.path.realpath(__file__)) + '/' + config['csv']['valid'])
    self.maxDiff = None
    it = collection.strip_tweets(['source', 'text', 'id_str']).get_iterator()

    def tweets_have_right_keys(iterator, fields):
        for tweet in iterator:
            keys = [key for key, value in tweet_parser.flatten_dict(tweet)]
            for elem in fields:
                if elem not in keys:
                    return False
        return True

    self.assertTrue(tweets_have_right_keys(it, [['source'], ['text'], ['id_str']]))
def test_strip_tweets_keeps_fields(self):
    tweet_parser = TweetParser()
    collection = MongoCollection(config['mongo']['host'],
                                 config['mongo']['port'],
                                 config['mongo']['user'],
                                 config['mongo']['password'],
                                 config['mongo']['database'],
                                 config['mongo']['collection'])
    self.maxDiff = None
    it = collection.set_limit(10).strip_tweets(
        ['id', 'entities.user_mentions', 'user.profile_image_url_https']).get_iterator()

    def tweets_have_right_keys(iterator, fields):
        for tweet in iterator:
            keys = [key for key, value in tweet_parser.flatten_dict(tweet)]
            for elem in fields:
                if elem not in keys:
                    return False
        return True

    self.assertTrue(tweets_have_right_keys(it, [['id'], ['entities', 'user_mentions'], ['user', 'profile_image_url_https']]))