def read_cached_ingredients_words(
        file='../data/full-recipes-dataset/pre-processed-full-recipes-dataset-v2.json',
        file_words='../data/words-teste.txt'):
    """Returns a list with all words from all ingredients from all recipes
    of the dataset."""
    print 'Reading ingredients of all recipes'
    config = myutils.load_json(file)
    ingr_word_list = []
    if not os.path.exists(file_words):
        with open(file_words, 'w') as f:
            for recipe in config.keys():
                for ingredient in config[recipe]["ingredients"]:
                    # ingredient = ingredients_utils.clean_html(ingredient)
                    ingredient = ingredients_utils.clean(ingredient)
                    ingredient = ingredients_utils.clean_recipes_terms(ingredient)
                    word_list = ingredient.split()
                    for w in word_list:
                        if len(w) == 1:  # Removing words with just one letter.
                            continue
                        f.write(w + '\n')
        print 'Saving words...'
    with open(file_words, 'r') as f:
        ingr_word_list = [line.rstrip('\n') for line in f]
    return ingr_word_list
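# A minimal usage sketch (assumes the default paths above exist on disk and that
# os, myutils and ingredients_utils are imported at module level):
#
#   words = read_cached_ingredients_words()
#   print 'Collected {} ingredient words'.format(len(words))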
def sample(json_file, data_dir, number_samples=5, images_dir='images'):
    """Get a sample: the first <number_samples> recipes of the dataset."""
    images_path = data_dir + images_dir + '/'
    new_images_path = data_dir + 'sample-images/'

    # Loading the full dataset
    full_data = myutils.load_json(data_dir + json_file)

    # Get the first recipes for the sample
    sample = {}
    count = 0
    for recipe in full_data:
        if count == number_samples:
            break
        sample[recipe] = full_data[recipe]
        count += 1

    print 'Sample: {} recipes'.format(count)
    myutils.save_json(data_dir + 'sample-{}.json'.format(str(count)), sample)

    print 'Copying image files...'
    copy_images(images_path, new_images_path, sample)
    return sample, new_images_path
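# A minimal usage sketch (assumes the json file and the 'images' folder exist
# under data_dir; 'recipes.json' is an illustrative file name):
#
#   sample_data, sample_images_path = sample('recipes.json', '../data/', number_samples=10)
#   print '{} recipes copied to {}'.format(len(sample_data), sample_images_path)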
def clean_dataset(json_file, data_dir, output_file):
    """Removes html tags and html entities from the ingredients list of all
    recipes in <json_file>.

    json_file: file.json with the data
    data_dir: directory in which the json_file is stored.
    output_file: file in which the pre-processed data will be saved.
    """
    data = myutils.load_json(data_dir + json_file)
    print 'Loaded {} recipes'.format(len(data))

    for recipe in data:
        raw_ingredients = data[recipe]['ingredients']
        final_ingredients = []
        for ingredient in raw_ingredients:
            final_ingredients.append(ingredients_utils.clean_html(ingredient))
        data[recipe]['ingredients'] = final_ingredients
        print raw_ingredients
        print 'new:\n', final_ingredients
        print '\n'

    # Save pre-processed data
    myutils.save_json(data_dir + output_file, data)
    print 'Pre-processed data saved in: {}'.format(data_dir + output_file)
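# A minimal usage sketch (file names are illustrative; the cleaned copy is
# written next to the original inside data_dir):
#
#   clean_dataset('full-recipes-dataset.json', '../data/',
#                 'pre-processed-full-recipes-dataset.json')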
def recipes_ids(json_file):
    """Get the ids of all recipes in a list.

    Format of the json file: each recipe is a dictionary."""
    data = myutils.load_json(json_file)
    ids = []
    for id_recipe in data:
        ids.append(id_recipe)
    return ids, data
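# A minimal usage sketch (the path is illustrative):
#
#   ids, data = recipes_ids('../data/train.json')
#   print '{} recipe ids loaded'.format(len(ids))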
def fig_ingredients_per_recipe(json_file, file_ingredients, values=None,
                               image_file='ingredients_per_recipe.png'):
    """Returns the number of recipes (images) per number of ingredients.

    Example: 1000 recipes have exactly 2 ingredients...
    """
    list_of_all_ingredients = load_all_ingredients(file_ingredients)
    x_values = np.arange(1, len(list_of_all_ingredients) + 1)

    if values is None:
        print 'Loading data...'
        data = load_json(json_file)
        # int32 instead of uint8 so the counters do not overflow when more than
        # 255 recipes share the same number of ingredients.
        recipes_per_ingredients = np.zeros(len(list_of_all_ingredients), dtype=np.int32)
        for id_recipe in data:
            ingredients = data[id_recipe]['ingredients']  # Get ingredients_input
            current_sum = np.sum(
                ingredients_vector(ingredients, list_of_all_ingredients))
            recipes_per_ingredients[current_sum] += 1
        print 'Number of recipes per number of ingredients:\n', recipes_per_ingredients
    else:
        recipes_per_ingredients = values

    plt.bar(x_values, recipes_per_ingredients, align='center', width=0.5)
    plt.title('Number of recipes (images) per number of ingredients')
    plt.ylabel('number of recipes')
    plt.xlabel('number of ingredients')
    plt.grid(True)
    # plt.xticks(x_values)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(18.5, 12.0)
    plt.savefig(image_file)
    return recipes_per_ingredients
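# A minimal usage sketch (file names are illustrative; the figure is saved to
# the given image_file):
#
#   counts = fig_ingredients_per_recipe('../data/train.json',
#                                       '../data/ingredients.txt',
#                                       image_file='ingredients_per_recipe.png')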
def timestamp2tweet_mapping(fname, outname):
    """Maps each tweet_time value to the list of tweet ids seen at that
    timestamp and saves the mapping as json."""
    ti2tw = {}
    c = 0
    with codecs.open(fname, 'r', 'utf-8') as csvfile:
        # Read the header line to get the field names
        header_reader = csv.reader(itertools.islice(csvfile, 0, 1),
                                   delimiter=',', quotechar='"')
        for elm in header_reader:
            header = elm
        # Read the remaining rows using the header as field names
        reader = csv.DictReader(itertools.islice(csvfile, 1, None),
                                delimiter=',', quotechar='"', fieldnames=header)
        for row in reader:
            ts = row['tweet_time']
            tid = row['tweetid']
            ti2tw.setdefault(ts, []).append(tid)
            c += 1
            if c % 1000 == 0:
                logging.info('Processed {}'.format(c))
    save_json(outname, ti2tw)


if __name__ == "__main__":
    fname = sys.argv[1]
    outname = sys.argv[2]
    setup_logging()
    # fname = '/home/mareike/PycharmProjects/sheffield/data/ira_tweets_csv_hashed.csv.1000'
    # outname = '/home/mareike/PycharmProjects/sheffield/data/time2tweet.json'
    timestamp2tweet_mapping(fname, outname)
    d = load_json(outname)
    print(len(d.keys()))
def split_data(json_file, data_dir, images_dir='images', train=0.9,
               validation_split=0.1, revert=False):
    """Split the dataset into train, validation and test sets.

    train: float from 0 to 1 specifying the fraction of the data used for
        training (the test set gets the remaining 1.0 - train).
    validation_split: float from 0 to 1 specifying the fraction of the
        training data held out for validation.
        Example: with train=0.9 and validation_split=0.1, the test set is 0.1
        of the total data and the validation set is 0.1 of the training data.
    revert: if True, merge the 'train' and 'test' folders of images_path back.
    """
    random.seed(100)  # Fixed seed for a reproducible split
    ids, data = recipes_ids(data_dir + json_file)

    images_path = data_dir + images_dir + '/'
    train_path = data_dir + 'train/'
    test_path = data_dir + 'test/'
    val_path = data_dir + 'val/'

    if revert:
        print 'TODO Reverting...'
    else:
        if myutils.directory_exists(train_path) or myutils.directory_exists(test_path):
            print 'Train and/or test folder already there. Returning...'
            # Loading test and train data
            data_train = myutils.load_json(data_dir + 'train.json')
            data_test = myutils.load_json(data_dir + 'test.json')
            data_val = myutils.load_json(data_dir + 'validation.json')
            return train_path, val_path, test_path, data_train, data_val, data_test

        data_train = {}
        data_test = {}
        data_val = {}
        size_dataset = len(data)
        samples_train = int(math.ceil(train * size_dataset))
        samples_val = int(math.ceil(validation_split * samples_train))
        samples_train = samples_train - samples_val
        samples_test = size_dataset - samples_train - samples_val
        print 'Total dataset={}, train={}, val={}, test={}'.format(
            size_dataset, samples_train, samples_val, samples_test)

        # Shuffle data to get a random order of recipes
        random.shuffle(ids)

        # Get the first samples for training, then validation, and the rest for test
        for index in range(0, samples_train):
            id_recipe = ids[index]
            data_train[id_recipe] = data[id_recipe]
            data.pop(id_recipe)  # Removes the recipe

        # Validation
        for index in range(samples_train, (samples_train + samples_val)):
            id_recipe = ids[index]
            data_val[id_recipe] = data[id_recipe]
            data.pop(id_recipe)  # Removes the recipe

        data_test = data
        print 'Split data: {} for training (request={}), {} for validation (request={}),' \
              ' and {} for test (request={})'.format(len(data_train), samples_train,
                                                     len(data_val), samples_val,
                                                     len(data_test), samples_test)

        myutils.save_json(data_dir + 'train.json', data_train)
        myutils.save_json(data_dir + 'test.json', data_test)
        myutils.save_json(data_dir + 'validation.json', data_val)

        # print 'Copying image files...'
        copy_images(images_path, train_path, data_train)
        copy_images(images_path, test_path, data_test)
        copy_images(images_path, val_path, data_val)

    return train_path, val_path, test_path, data_train, data_val, data_test
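# A minimal usage sketch (assumes 'recipes.json' and an 'images' folder inside
# data_dir; the function writes train.json, test.json and validation.json and
# copies the images into train/, val/ and test/):
#
#   train_path, val_path, test_path, d_train, d_val, d_test = split_data(
#       'recipes.json', '../data/', train=0.9, validation_split=0.1)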
def remove_duplicates(file_name, path_data='../data/', folder='recipes-ctc'):
    """Remove recipes that have (image) duplicates in the dataset."""
    path_images = path_data + folder + '/images/'
    path_json_file = path_data + folder + '/' + file_name
    path_output_json_file = path_data + folder + '/pre-processed-' + file_name

    data = myutils.load_json(path_json_file)
    recipes_to_be_removed = []
    ignore_list = []

    if len(data) == 0:
        print 'Error: empty file.'
    else:
        print 'Total of {} recipes'.format(len(data))

        # Compute all hashes first.
        recipes_hash = {}
        for recipe in data:
            # Read the image in binary mode so md5 is computed over the raw bytes.
            with open(path_images + data[recipe]['file_image'], 'rb') as image_file:
                hash_image = hashlib.md5(image_file.read()).hexdigest()
            size_image = os.path.getsize(path_images + data[recipe]['file_image'])
            recipes_hash[recipe] = {}
            recipes_hash[recipe]['hash'] = hash_image
            recipes_hash[recipe]['size'] = size_image
        print 'All hashes were computed. :D'

        # Verifies if there are duplicates
        count = 0
        for dict_index in data:
            if dict_index in recipes_to_be_removed or dict_index in ignore_list:
                continue
            print '{} Checking: {}, URL: {}'.format(
                count, data[dict_index]['file_image'], data[dict_index]['url'])
            list_entries = []
            list_urls = []
            # Compares with all other recipes
            for dict_index_search in data:
                # Ignores the same index and recipes already marked or ignored
                if (dict_index_search == dict_index) or (dict_index_search in recipes_to_be_removed) \
                        or (dict_index_search in ignore_list):
                    continue
                # Ignore files with different sizes. Maybe we can delete this check,
                # since there is already the hash comparison
                if recipes_hash[dict_index]['size'] != recipes_hash[dict_index_search]['size']:
                    continue
                if recipes_hash[dict_index]['hash'] == recipes_hash[dict_index_search]['hash']:
                    print '--- Found duplicate: {}'.format(
                        path_images + data[dict_index_search]['file_image'])
                    list_entries.append(dict_index_search)
                    list_urls.append(data[dict_index_search]['url'])
            count += 1

            if len(list_urls) == 0:
                continue

            # The user determines which recipe to delete
            for url in list_urls:
                print url
            user_input = raw_input(
                "Which recipe do I remove? (None, ID or list of IDs separated by ,): ")
            print 'user input = ', user_input.split(',')
            if user_input.lower() == 'none':
                print 'No recipe will be removed'
                ignore_list.append(dict_index)
                for id_recipe in list_entries:
                    ignore_list.append(id_recipe)
            else:
                ids_to_be_removed = user_input.split(',')
                for id_recipe in ids_to_be_removed:
                    id_recipe = id_recipe.lstrip().rstrip()
                    recipes_to_be_removed.append(id_recipe)
                    print 'Included id={} to be removed'.format(id_recipe)

        # Remove recipes
        remove_recipe(recipes_to_be_removed, data, (path_data + folder + '/'))

        # Save the new json file without duplicates
        print 'Saving data...'
        myutils.save_json(path_output_json_file, data)
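# A minimal usage sketch (interactive: the function asks on stdin which of the
# duplicated recipes to drop; 'recipes-ctc.json' is an illustrative file name
# and the folder layout mirrors the defaults above):
#
#   remove_duplicates('recipes-ctc.json', path_data='../data/', folder='recipes-ctc')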
def dist_samples_per_ingredient(data, file_ingredients, json_file=None,
                                values=None, generate_figure=True,
                                horizontal=True, percentage=True,
                                image_file='images-per-ingredient.png'):
    """Returns the inverse class frequencies (distribution) of the ingredients
    in the data set.

    Let number_ingredients be the number of entries in file_ingredients.
    The distribution obeys the ordering in the file <file_ingredients>.

    return: dictionary with number_of_samples (or percentage) per ingredient,
        shape (number_ingredients, 1)
    """
    print 'Distribution of samples per ingredient...'
    list_of_all_ingredients = load_all_ingredients(file_ingredients)
    # x = np.arange(1, (len(list_of_all_ingredients))*2, 2)
    x_values = np.arange(1, len(list_of_all_ingredients) + 1)

    # Set the type of my list
    my_dtype = [('samples', np.float32), ('ingredient', 'S17')]
    result = np.zeros(len(list_of_all_ingredients), dtype=my_dtype)
    result['ingredient'] = list_of_all_ingredients  # Add list of ingredients

    if values is None:
        if len(data) == 0:
            print 'Loading data...'
            data = load_json(json_file)
        samples_per_ingredient = np.zeros(len(list_of_all_ingredients), dtype=np.float32)
        for id_recipe in data:
            ingredients = data[id_recipe]['ingredients']  # Get ingredients_input
            current = samples_per_ingredient
            new_sample = ingredients_vector(ingredients, list_of_all_ingredients)
            samples_per_ingredient = current + new_sample
        # print 'Samples per ingredient:\n', samples_per_ingredient
    else:
        samples_per_ingredient = values

    if percentage:
        samples_per_ingredient = samples_per_ingredient / len(data)

    result['samples'] = samples_per_ingredient  # Add number of samples per ingredient
    print 'Shape samples_per_ing={}, result.shape={}'.format(
        samples_per_ingredient.shape, result.shape)
    print 'Samples per ingredient:\n', result  # samples_per_ingredient

    if generate_figure:
        # Sort the list by the number of images
        result_sorted = np.sort(result, order='samples')
        # print result_sorted['samples']
        if horizontal:  # horizontal bar
            plt.barh(x_values - 0.4, result_sorted['samples'], align='center', height=0.7)
            plt.title('Images per ingredient')
            plt.ylabel('ingredient')
            plt.xlabel('number of images')
            plt.yticks(x_values - 0.4, result_sorted['ingredient'])
            plt.grid(True)
            fig = matplotlib.pyplot.gcf()
            fig.set_size_inches(13.0, 16.5)
            plt.savefig(image_file)
        else:  # vertical bar
            plt.bar(x_values - 0.4, result_sorted['samples'], align='center', width=0.5)
            plt.title('Images per ingredient')
            plt.ylabel('number of images')
            plt.xlabel('ingredient')
            plt.xticks(x_values - 0.4, result_sorted['ingredient'], rotation=90)
            fig = matplotlib.pyplot.gcf()
            fig.set_size_inches(18.5, 12.0)
            plt.savefig(image_file)

    # Define the dictionary used for the class-weight of Keras:
    # mapping class indices (integers) to a weight (float), shape (classes, weights)
    # high_weight = 100
    # class_weight = {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: high_weight}
    # result_for_keras['weight'] is used by our custom loss
    my_dtype_keras = [('indices', np.int32), ('weight', np.float32)]
    result_for_keras = np.zeros(len(list_of_all_ingredients), dtype=my_dtype_keras)
    result_for_keras['indices'] = np.arange(1, len(list_of_all_ingredients) + 1)
    # Inverse class frequencies
    result_for_keras['weight'] = 1.0 - samples_per_ingredient  # Ordering of file_ingredients

    # Dict for Keras
    inverse_dist_ingredients = {}
    for index in range(0, len(list_of_all_ingredients)):
        inverse_dist_ingredients[index] = 1.0 - samples_per_ingredient[index]

    # print result_for_keras['weight'].shape
    # print result_for_keras['weight']
    return inverse_dist_ingredients, result_for_keras['weight']
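# A minimal usage sketch (hedged: 'train.json', 'ingredients.txt' and `model`
# are illustrative; Keras' fit() accepts a {class_index: weight} dict, which is
# the first value returned here):
#
#   class_weights, weight_vector = dist_samples_per_ingredient(
#       {}, '../data/ingredients.txt', json_file='../data/train.json',
#       generate_figure=False)
#   # model.fit(x_train, y_train, epochs=10, class_weight=class_weights)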