import codecs
import csv
import hashlib
import itertools
import logging
import math
import os
import random

import ingredients_utils
import myutils


def sample(json_file, data_dir, number_samples=5, images_dir='images'):
    """Get a sample: the first <number_samples> recipes of the dataset."""
    images_path = data_dir + images_dir + '/'
    new_images_path = data_dir + 'sample-images/'

    # Load the full dataset
    full_data = myutils.load_json(data_dir + json_file)

    # Take the first <number_samples> recipes for the sample
    sample = {}
    count = 0
    for recipe in full_data:
        if count == number_samples:
            break
        sample[recipe] = full_data[recipe]
        count += 1

    print 'Sample: {} recipes'.format(count)
    myutils.save_json(data_dir + 'sample-{}.json'.format(count), sample)

    print 'Copying image files...'
    copy_images(images_path, new_images_path, sample)
    return sample, new_images_path
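
# copy_images is a project-local helper that is not shown in this section.
# Below is a minimal, hypothetical sketch of the behaviour the calls above
# assume: every recipe entry stores its image filename under 'file_image'
# (as remove_duplicates does below), and images are copied from one flat
# directory to another. The real helper may differ.
import shutil


def copy_images_sketch(images_path, new_images_path, recipes):
    """Copy the image of every recipe in <recipes> to <new_images_path>."""
    if not os.path.exists(new_images_path):
        os.makedirs(new_images_path)
    for recipe in recipes:
        file_image = recipes[recipe]['file_image']
        shutil.copy(images_path + file_image, new_images_path + file_image)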
def clean_dataset(json_file, data_dir, output_file):
    """Removes HTML tags and HTML entities from the ingredient list of all
    recipes in <json_file>.

    json_file: file.json with the data.
    data_dir: directory in which json_file is stored.
    output_file: file in which the pre-processed data will be saved.
    """
    data = myutils.load_json(data_dir + json_file)
    print 'Loaded {} recipes'.format(len(data))

    for recipe in data:
        raw_ingredients = data[recipe]['ingredients']
        final_ingredients = []
        for ingredient in raw_ingredients:
            final_ingredients.append(ingredients_utils.clean_html(ingredient))
        data[recipe]['ingredients'] = final_ingredients
        print raw_ingredients
        print 'cleaned:\n', final_ingredients
        print '\n'

    # Save pre-processed data
    myutils.save_json(data_dir + output_file, data)
    print 'Pre-processed data saved in: {}'.format(data_dir + output_file)
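
# ingredients_utils.clean_html is defined elsewhere in the project. This is
# a minimal sketch of the cleaning it is assumed to do (strip tags, decode
# HTML entities, normalise whitespace); the real helper may differ.
import re
import HTMLParser  # Python 2 stdlib; use html.parser on Python 3

_TAG_RE = re.compile(r'<[^>]+>')


def clean_html_sketch(text):
    """Strip HTML tags and decode HTML entities from an ingredient string."""
    text = _TAG_RE.sub(' ', text)  # drop tags such as <b> or <a href="...">
    text = HTMLParser.HTMLParser().unescape(text)  # decode &amp;, &nbsp;, ...
    return ' '.join(text.split())  # collapse runs of whitespace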
def timestamp2tweet_mapping(fname, outname):
    """Generate a mapping from a timestamp (str) to a list of tweet ids.

    :param fname: input csv file with 'tweet_time' and 'tweetid' columns
    :param outname: output file for the json mapping
    """
    ti2tw = dict()
    c = 0
    with codecs.open(fname, 'r', 'utf-8') as csvfile:
        # DictReader consumes the header row itself and uses it as the
        # fieldnames for every following data row.
        reader = csv.DictReader(csvfile, delimiter=',', quotechar='"')
        for row in reader:
            ts = row['tweet_time']
            tid = row['tweetid']
            ti2tw.setdefault(ts, []).append(tid)
            c += 1
            if c % 1000 == 0:
                logging.info('Processed {}'.format(c))
    myutils.save_json(outname, ti2tw)
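
# Hypothetical usage of timestamp2tweet_mapping; the file names and the
# exact timestamp format are illustrative assumptions, not part of the
# project:
#
#     timestamp2tweet_mapping('tweets.csv', 'timestamp2tweets.json')
#
# The resulting json maps each timestamp string to the tweet ids observed
# at that time, e.g. {"2019-01-01 12:00:00": ["1001", "1002"], ...}.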
def split_data(json_file, data_dir, images_dir='images', train=0.9,
               validation_split=0.1, revert=False):
    """Split the dataset into train, validation and test sets.

    train: float from 0 to 1 specifying the fraction of the data used for
        training (before the validation split); test gets the remainder
        (1.0 - train).
    validation_split: float from 0 to 1 specifying the fraction of the
        training data held out for validation.
        Example: with train=0.9 and validation_split=0.1, test is 0.1 of
        the total data and validation is 0.1 of the training data.
    revert: if True, merge the 'train' and 'test' folders of images_path
        back together.
    """
    random.seed(100)  # Fixed seed so the split is reproducible
    ids, data = recipes_ids(data_dir + json_file)
    images_path = data_dir + images_dir + '/'
    train_path = data_dir + 'train/'
    test_path = data_dir + 'test/'
    val_path = data_dir + 'val/'

    if revert:
        print 'TODO Reverting...'
    else:
        if myutils.directory_exists(train_path) or myutils.directory_exists(test_path):
            print 'Train and/or test folder already there. Returning...'
            # Load the existing split instead of recomputing it
            data_train = myutils.load_json(data_dir + 'train.json')
            data_test = myutils.load_json(data_dir + 'test.json')
            data_val = myutils.load_json(data_dir + 'validation.json')
            return train_path, val_path, test_path, data_train, data_val, data_test

        data_train = {}
        data_test = {}
        data_val = {}
        size_dataset = len(data)
        samples_train = int(math.ceil(train * size_dataset))
        samples_val = int(math.ceil(validation_split * samples_train))
        samples_train = samples_train - samples_val
        samples_test = size_dataset - samples_train - samples_val
        print 'Total dataset={}, train={}, val={}, test={}'.format(
            size_dataset, samples_train, samples_val, samples_test)

        # Shuffle the ids to get a random order of recipes
        random.shuffle(ids)

        # First samples go to training, then validation, and the rest to test
        for index in range(0, samples_train):
            id_recipe = ids[index]
            data_train[id_recipe] = data[id_recipe]
            data.pop(id_recipe)  # Remove the recipe from the remaining pool

        for index in range(samples_train, samples_train + samples_val):
            id_recipe = ids[index]
            data_val[id_recipe] = data[id_recipe]
            data.pop(id_recipe)  # Remove the recipe from the remaining pool

        data_test = data  # Whatever is left over becomes the test set

        print 'Split data: {} for training (request={}), {} for validation (request={}),' \
              ' and {} for test (request={})'.format(len(data_train), samples_train,
                                                     len(data_val), samples_val,
                                                     len(data_test), samples_test)

        myutils.save_json(data_dir + 'train.json', data_train)
        myutils.save_json(data_dir + 'test.json', data_test)
        myutils.save_json(data_dir + 'validation.json', data_val)

        print 'Copying image files...'
        copy_images(images_path, train_path, data_train)
        copy_images(images_path, test_path, data_test)
        copy_images(images_path, val_path, data_val)

        return train_path, val_path, test_path, data_train, data_val, data_test
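
# Hypothetical usage of split_data with the default fractions; the file and
# directory names are illustrative. For 1000 recipes, train=0.9 and
# validation_split=0.1 give ceil(0.9 * 1000) = 900 candidate training
# recipes, of which val = ceil(0.1 * 900) = 90, so train = 900 - 90 = 810
# and test = 1000 - 810 - 90 = 100.
#
#     (train_path, val_path, test_path,
#      data_train, data_val, data_test) = split_data('recipes.json',
#                                                    '../data/recipes-ctc/')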
def remove_duplicates(file_name, path_data='../data/', folder='recipes-ctc'):
    """Remove recipes that have (image) duplicates in the dataset."""
    path_images = path_data + folder + '/images/'
    path_json_file = path_data + folder + '/' + file_name
    path_output_json_file = path_data + folder + '/pre-processed-' + file_name

    data = myutils.load_json(path_json_file)
    recipes_to_be_removed = []
    ignore_list = []

    if len(data) == 0:
        print 'Error: empty file.'
    else:
        print 'Total of {} recipes'.format(len(data))

        # Compute all hashes first.
        recipes_hash = {}
        for recipe in data:
            image_path = path_images + data[recipe]['file_image']
            with open(image_path, 'rb') as image_file:
                hash_image = hashlib.md5(image_file.read()).hexdigest()
            recipes_hash[recipe] = {
                'hash': hash_image,
                'size': os.path.getsize(image_path),
            }
        print 'All hashes were computed. :D'

        # Check for duplicates
        count = 0
        for dict_index in data:
            if dict_index in recipes_to_be_removed or dict_index in ignore_list:
                continue
            print '{} Checking: {}, URL: {}'.format(
                count, data[dict_index]['file_image'], data[dict_index]['url'])
            list_entries = []
            list_urls = []

            # Compare with all recipes that come after dict_index; earlier
            # pairs were already compared. Relies on the dict iterating in
            # the same order both times (true for an unmodified dict).
            index_achieved = False
            for dict_index_search in data:
                if not index_achieved:
                    if dict_index_search == dict_index:
                        index_achieved = True
                    continue
                if (dict_index_search in recipes_to_be_removed
                        or dict_index_search in ignore_list):
                    continue
                # Files with different sizes cannot be identical; this is a
                # cheap filter before the hash comparison.
                if recipes_hash[dict_index]['size'] != recipes_hash[dict_index_search]['size']:
                    continue
                if recipes_hash[dict_index]['hash'] == recipes_hash[dict_index_search]['hash']:
                    print '--- Found duplicate: {}'.format(
                        path_images + data[dict_index_search]['file_image'])
                    list_entries.append(dict_index_search)
                    list_urls.append(data[dict_index_search]['url'])
            count += 1
            if len(list_urls) == 0:
                continue

            # The user decides which recipe(s) to delete
            for url in list_urls:
                print url
            user_input = raw_input(
                'Which recipe do I remove? (None, ID or list of IDs separated by ,): ')
            print 'user input = ', user_input.split(',')
            if user_input.lower() == 'none':
                print 'No recipe will be removed'
                ignore_list.append(dict_index)
                for id_recipe in list_entries:
                    ignore_list.append(id_recipe)
            else:
                ids_to_be_removed = user_input.split(',')
                for id_recipe in ids_to_be_removed:
                    id_recipe = id_recipe.strip()
                    recipes_to_be_removed.append(id_recipe)
                    print 'Included id={} to be removed'.format(id_recipe)

        # Remove the selected recipes
        remove_recipe(recipes_to_be_removed, data, path_data + folder + '/')

        # Save the new json file without duplicates
        print 'Saving data...'
        myutils.save_json(path_output_json_file, data)
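
# remove_recipe is a project-local helper not shown in this section. This is
# a minimal sketch of the behaviour remove_duplicates assumes: the selected
# entries are dropped from the data dict in place and their image files are
# deleted. The real implementation may differ.
def remove_recipe_sketch(recipe_ids, data, path_folder):
    """Remove <recipe_ids> from <data> in place and delete their images."""
    for recipe_id in recipe_ids:
        recipe = data.pop(recipe_id, None)
        if recipe is None:
            continue
        image_path = path_folder + 'images/' + recipe['file_image']
        if os.path.exists(image_path):
            os.remove(image_path)
        print 'Removed recipe {}'.format(recipe_id)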