Example #1
def read_cached_ingredients_words(
        file='../data/full-recipes-dataset/pre-processed-full-recipes-dataset-v2.json',
        file_words='../data/words-teste.txt'):
    """ Returns a list with all words from all ingredients from all recipes of the dataset."""
    print 'Reading ingredients of all recipes'
    config = myutils.load_json(file)

    ingr_word_list = []
    if not os.path.exists(file_words):
        with open(file_words, 'w') as f:

            for recipe in config.keys():
                for ingredient in config[recipe]["ingredients"]:
                    # ingredient = ingredients_utils.clean_html(ingredient)
                    ingredient = ingredients_utils.clean(ingredient)
                    ingredient = ingredients_utils.clean_recipes_terms(
                        ingredient)

                    word_list = ingredient.split()
                    for w in word_list:
                        if len(w) == 1:  # Removing words with just one letter.
                            continue
                        f.write(w + '\n')

    print 'Loading cached words...'
    with open(file_words, 'r') as f:
        ingr_word_list = [line.rstrip('\n') for line in f]

    return ingr_word_list
Example #2
def sample(json_file, data_dir, number_samples=5, images_dir='images'):
    """Get sample: first <number_samples> of recipes dataset"""
    images_path = data_dir + images_dir + '/'
    new_images_path = data_dir + 'sample-images/'

    # Load the full dataset
    full_data = myutils.load_json(data_dir + json_file)

    # Take the first <number_samples> recipes (note: dict iteration order is
    # arbitrary in Python 2, so the selection is not deterministic)
    sample = {}
    count = 0
    for recipe in full_data:
        if count == number_samples:
            break

        sample[recipe] = full_data[recipe]
        count += 1

    print 'Sample: {} recipes'.format(count)
    myutils.save_json(data_dir + 'sample-{}.json'.format(count), sample)

    print 'Copying image files...'
    copy_images(images_path, new_images_path, sample)

    return sample, new_images_path
Example #3
def clean_dataset(json_file, data_dir, output_file):
    """Removes html tags and html entities from the ingredients list of all recipes in <json_file>
       json_file: file.json with the data
       data_dir: directory in which the json_file is stored.
       output_file: output_file in which the pre-processed data will be saved.
    """
    data = myutils.load_json(data_dir + json_file)
    print 'Loaded {} recipes'.format(len(data))

    for recipe in data:
        raw_ingredients = data[recipe]['ingredients']

        final_ingredients = []
        for ingredient in raw_ingredients:
            final_ingredients.append(ingredients_utils.clean_html(ingredient))

        data[recipe]['ingredients'] = final_ingredients

        print raw_ingredients
        print 'cleaned:\n', final_ingredients
        print '\n'

    # Save pre-processed data
    myutils.save_json(data_dir + output_file, data)
    print 'Pre-processed data saved in: {}'.format(data_dir + output_file)
Example #4
def recipes_ids(json_file):
    """Get the ids of all recipes in a list.
       Format of Json file: each recipe is a dictionary."""
    data = myutils.load_json(json_file)

    ids = []
    for id_recipe in data:
        ids.append(id_recipe)

    return ids, data
Example #5
def fig_ingredients_per_recipe(json_file,
                               file_ingredients,
                               values=None,
                               image_file='ingredients_per_recipe.png'):
    """Returns number of recipes(images) per number of ingredients.
       Example: we have 1000 recipes that have 2 ingredients...
    """
    list_of_all_ingredients = load_all_ingredients(file_ingredients)
    x_values = np.arange(1, len(list_of_all_ingredients) + 1)

    if values is None:
        print 'Loading data...'
        data = load_json(json_file)

        recipes_per_ingredients = np.zeros(len(list_of_all_ingredients),
                                           dtype=np.int64)  # int64: uint8 would overflow at 255 recipes

        for id_recipe in data:
            ingredients = data[id_recipe][
                'ingredients']  # Get ingredients_input
            current_sum = np.sum(
                ingredients_vector(ingredients, list_of_all_ingredients))
            if current_sum == 0:  # Skip recipes with no known ingredients
                continue
            # Index k-1 counts recipes with exactly k ingredients, matching
            # x_values = 1..len(list_of_all_ingredients) in the bar plot.
            recipes_per_ingredients[current_sum - 1] += 1

        print 'Number of recipes per number of ingredients:\n', recipes_per_ingredients
    else:
        recipes_per_ingredients = values

    plt.bar(x_values, recipes_per_ingredients, align='center', width=0.5)
    plt.title('Number of recipes (images) per number of ingredients')
    plt.ylabel('number of recipes')
    plt.xlabel('number of ingredients')
    plt.grid(True)
    # plt.xticks(x_values)

    fig = plt.gcf()
    fig.set_size_inches(18.5, 12.0)
    plt.savefig(image_file)

    return recipes_per_ingredients
Example #6
def timestamp2tweet_mapping(fname, outname):
    """Maps each tweet timestamp (tweet_time) to the list of tweet ids posted at that time."""
    ti2tw = {}
    c = 0
    with codecs.open(fname, 'r', 'utf-8') as csvfile:
        # DictReader consumes the header row itself and uses it as fieldnames.
        reader = csv.DictReader(csvfile, delimiter=',', quotechar='"')
        for row in reader:

            ts = row['tweet_time']
            tid = row['tweetid']
            ti2tw.setdefault(ts, []).append(tid)
            c += 1
            if c % 1000 == 0:
                logging.info('Processed {}'.format(c))
    save_json(outname, ti2tw)


if __name__=="__main__":

    fname = sys.argv[1]
    outname = sys.argv[2]
    setup_logging()
    #fname = '/home/mareike/PycharmProjects/sheffield/data/ira_tweets_csv_hashed.csv.1000'
    #outname = '/home/mareike/PycharmProjects/sheffield/data/time2tweet.json'

    timestamp2tweet_mapping(fname, outname)

    d = load_json(outname)
    print(len(d.keys()))
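
The saved JSON maps each tweet_time string to the list of tweet ids posted at that time; an illustrative entry (made-up values, not real data):

{"2016-11-08 14:00:00": ["tweetid_1", "tweetid_2"], "2016-11-08 14:01:00": ["tweetid_3"]}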
Example #7
def split_data(json_file, data_dir, images_dir='images',
               train=0.9, validation_split=0.1, revert=False):
    """ Split the dataset into train, validation and test
       train: float value from 0 to 1 (test will be 1.0 - train - validation = test) specifying the amount of data for training
       and test
       validation_split: float value from 0 to 1 specifying the amount of data from training for validation.
                         Example: train=0.9, validation_split=0.1
                                  test will be 0.1 of the total data and validation will be 0.1 of the train data.
       revert: if True merge the folders 'train' and 'test' of images_path.
    """
    random.seed(100)  # Fixed seed for a reproducible split

    ids, data = recipes_ids(data_dir + json_file)

    images_path = data_dir + images_dir + '/'
    train_path = data_dir + 'train/'
    test_path = data_dir + 'test/'
    val_path = data_dir + 'val/'

    if revert:
        print 'TODO Reverting...'
    else:
        if myutils.directory_exists(train_path) or myutils.directory_exists(test_path):
            print 'Train and/or test folder already exists. Returning cached split...'

            # Loading test and train data
            data_train = myutils.load_json(data_dir + 'train.json')
            data_test = myutils.load_json(data_dir + 'test.json')
            data_val = myutils.load_json(data_dir + 'validation.json')

            return train_path, val_path, test_path, data_train, data_val, data_test

        data_train = {}
        data_test = {}
        data_val = {}

        size_dataset = len(data)
        samples_train = int(math.ceil(train * size_dataset))
        samples_val = int(math.ceil(validation_split * samples_train))
        samples_train = samples_train - samples_val
        samples_test = size_dataset - samples_train - samples_val

        print 'Total dataset={}, train={}, val={}, test={}'.format(size_dataset, samples_train, samples_val, samples_test)

        # Shuffle data to get random order of recipes
        random.shuffle(ids)

        # Get first samples for training, then validation, and the rest for test
        for index in range(0, samples_train):
            id_recipe = ids[index]

            data_train[id_recipe] = data[id_recipe]
            data.pop(id_recipe)  # Removes the recipe

        # validation
        for index in range(samples_train, (samples_train + samples_val)):
            id_recipe = ids[index]

            data_val[id_recipe] = data[id_recipe]
            data.pop(id_recipe)  # Removes the recipe

        data_test = data

        print 'Split data: {} for training (request={}), {} for validation (request={}),' \
              ' and {} for test (request={})'.format(len(data_train), samples_train,
                                                     len(data_val), samples_val,
                                                     len(data_test), samples_test)

        myutils.save_json(data_dir + 'train.json', data_train)
        myutils.save_json(data_dir + 'test.json', data_test)
        myutils.save_json(data_dir + 'validation.json', data_val)

        # print 'Copying image files...'
        copy_images(images_path, train_path, data_train)
        copy_images(images_path, test_path, data_test)
        copy_images(images_path, val_path, data_val)

        return train_path, val_path, test_path, data_train, data_val, data_test
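
For reference, a hedged usage sketch of split_data (the directory and file names below are placeholders, not from the original project). With 1000 recipes and the defaults, samples_train = ceil(0.9 * 1000) = 900 and samples_val = ceil(0.1 * 900) = 90, so the final split is 810 train, 90 validation and 100 test:

# Hypothetical call; 'recipes.json' and '../data/recipes-ctc/' are placeholders.
train_path, val_path, test_path, d_train, d_val, d_test = split_data(
    'recipes.json', '../data/recipes-ctc/',
    train=0.9, validation_split=0.1)
# With 1000 recipes: len(d_train) == 810, len(d_val) == 90, len(d_test) == 100.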
Example #8
def remove_duplicates(file_name, path_data='../data/', folder='recipes-ctc'):
    """ Remove recipes that have (image) duplicates in the dataset."""

    path_images = path_data + folder + '/images/'
    path_json_file = path_data + folder + '/' + file_name
    path_output_json_file = path_data + folder + '/pre-processed-' + file_name

    data = myutils.load_json(path_json_file)
    recipes_to_be_removed = []
    ignore_list = []

    if len(data) == 0:
        print 'Error: empty file.'
    else:
        print 'Total of {} recipes'.format(len(data))

        # Compute all hashes first.
        recipes_hash = {}
        for recipe in data:
            image_path = path_images + data[recipe]['file_image']
            with open(image_path, 'rb') as image_file:  # Binary mode for hashing
                hash_image = hashlib.md5(image_file.read()).hexdigest()
            size_image = os.path.getsize(image_path)

            recipes_hash[recipe] = {}
            recipes_hash[recipe]['hash'] = hash_image
            recipes_hash[recipe]['size'] = size_image

        print 'All hashes were computed. :D'

        # Verifies if there are duplicates
        count = 0
        for dict_index in data:
            if dict_index in recipes_to_be_removed or dict_index in ignore_list:
                continue
            print '{} Checking: {}, URL: {}'.format(
                count, data[dict_index]['file_image'], data[dict_index]['url'])

            list_entries = []
            list_urls = []

            # Compares only with the recipes after dict_index
            # (pairs before it were already compared in earlier iterations)
            index_achieved = False
            for dict_index_search in data:
                # Skips entries until dict_index is reached
                if not index_achieved:
                    if dict_index_search == dict_index:
                        index_achieved = True
                    continue

                # Ignores same index
                if (dict_index_search == dict_index) or (dict_index_search in recipes_to_be_removed)\
                        or (dict_index_search in ignore_list):
                    continue

                # Skip files with different sizes: a cheap pre-filter
                # before the full hash comparison.
                if recipes_hash[dict_index]['size'] != recipes_hash[
                        dict_index_search]['size']:
                    continue

                if recipes_hash[dict_index]['hash'] == recipes_hash[
                        dict_index_search]['hash']:
                    print '--- Found duplicate: {}'.format(
                        path_images + data[dict_index_search]['file_image'])

                    list_entries.append(dict_index_search)
                    list_urls.append(data[dict_index_search]['url'])

            count += 1
            if len(list_urls) == 0:
                continue

            # The user decides which recipes to delete
            for url in list_urls:
                print url

            user_input = raw_input(
                "Which recipe do I remove? (None, ID or list of IDs separated by ,): "
            )
            print 'user input = ', user_input.split(',')

            if user_input.lower() == 'none':
                print 'No recipe will be removed'
                ignore_list.append(dict_index)
                for id_recipe in list_entries:
                    ignore_list.append(id_recipe)
            else:
                ids_to_be_removed = user_input.split(',')

                for id_recipe in ids_to_be_removed:
                    id_recipe = id_recipe.strip()
                    recipes_to_be_removed.append(id_recipe)
                    print 'Included id={} to be removed'.format(id_recipe)

        # Remove recipes
        remove_recipe(recipes_to_be_removed, data, (path_data + folder + '/'))

        # Save the new json file without duplicates
        print 'Saving data...'
        myutils.save_json(path_output_json_file, data)
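
Since remove_duplicates precomputes every MD5 hash in recipes_hash, the pairwise scan can also be expressed as a single grouping pass; a minimal sketch under that assumption (group_duplicates is a hypothetical helper, not part of the original module):

from collections import defaultdict

def group_duplicates(recipes_hash):
    """Groups recipe ids that share the same image hash in one O(n) pass."""
    by_hash = defaultdict(list)
    for recipe_id, info in recipes_hash.items():
        by_hash[info['hash']].append(recipe_id)
    # Keep only hashes shared by more than one recipe.
    return {h: ids for h, ids in by_hash.items() if len(ids) > 1}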
Example #9
def dist_samples_per_ingredient(data,
                                file_ingredients,
                                json_file=None,
                                values=None,
                                generate_figure=True,
                                horizontal=True,
                                percentage=True,
                                image_file='images-per-ingredient.png'):
    """Returns the inverse class frequencies (distribution) of the ingredients in the data set.

       Let number_ingredients be the number of entries in file_ingredients.
       The distribution obeys the ordering in the file <file_ingredients>.
       return dictionary with number_of_samples (or percentage) per_ingredient (number_ingredients, 1)
    """
    print 'Distribution of samples per ingredient...'

    list_of_all_ingredients = load_all_ingredients(file_ingredients)
    # x = np.arange(1, (len(list_of_all_ingredients))*2, 2)
    x_values = np.arange(1, len(list_of_all_ingredients) + 1)

    # Set the type of my list
    my_dtype = [('samples', np.float32), ('ingredient', 'S17')]
    result = np.zeros(len(list_of_all_ingredients), dtype=my_dtype)

    result['ingredient'] = list_of_all_ingredients  # Add list of ingredients

    if values is None:
        if len(data) == 0:
            print 'Loading data...'
            data = load_json(json_file)

        samples_per_ingredient = np.zeros(len(list_of_all_ingredients),
                                          dtype=np.float32)

        for id_recipe in data:
            ingredients = data[id_recipe][
                'ingredients']  # Get ingredients_input
            samples_per_ingredient += ingredients_vector(
                ingredients, list_of_all_ingredients)
        # print 'Samples per ingredient:\n', samples_per_ingredient
    else:
        samples_per_ingredient = values

    if percentage:
        samples_per_ingredient = samples_per_ingredient / len(data)

    result['samples'] = samples_per_ingredient  # Add number of samples per ingredient

    print 'Shape samples_per_ing={}, result.shape={}'.format(
        samples_per_ingredient.shape, result.shape)
    print 'Samples per ingredient:\n', result

    if generate_figure:
        # Sort the list by the number of images
        result_sorted = np.sort(result, order='samples')
        # print result_sorted['samples']

        if horizontal:
            # horizontal bar
            plt.barh(x_values - 0.4,
                     result_sorted['samples'],
                     align='center',
                     height=0.7)
            plt.title('Images per ingredient')
            plt.ylabel('ingredient')
            plt.xlabel('number of images')
            plt.yticks(x_values - 0.4, result_sorted['ingredient'])
            plt.grid(True)

            fig = plt.gcf()
            fig.set_size_inches(13.0, 16.5)
            plt.savefig(image_file)
        else:
            # vertical bar
            plt.bar(x_values - 0.4,
                    result_sorted['samples'],
                    align='center',
                    width=0.5)
            plt.title('Images per ingredient')
            plt.ylabel('number of images')
            plt.xlabel('ingredient')
            plt.xticks(x_values - 0.4,
                       result_sorted['ingredient'],
                       rotation=90)

            fig = plt.gcf()
            fig.set_size_inches(18.5, 12.0)
            plt.savefig(image_file)

    # Define the dictionary used for the class-weight of keras. Mapping class indices (integers) to a weight (float)
    # shape (classes, weights)
    # high_weight = 100
    # class_weight = {0:1,1:1,2:1,3:1,4:1,5:1,6:1,7:1,8:1,9:high_weight}
    # result_for_keras['weight'] for our custom loss
    my_dtype_keras = [('indices', np.int32), ('weight', np.float32)]
    result_for_keras = np.zeros(len(list_of_all_ingredients),
                                dtype=my_dtype_keras)
    result_for_keras['indices'] = np.arange(1,
                                            len(list_of_all_ingredients) + 1)
    # Inverse class frequencies
    result_for_keras['weight'] = 1.0 - samples_per_ingredient  # Ordering of file_ingredients

    # Dict for keras: class index -> weight (inverse frequency)
    inverse_dist_ingredients = {}
    for index in range(len(list_of_all_ingredients)):
        inverse_dist_ingredients[index] = 1.0 - samples_per_ingredient[index]

    # print result_for_keras['weight'].shape
    # print result_for_keras['weight']
    return inverse_dist_ingredients, result_for_keras['weight']
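
The returned dictionary follows the class_weight convention of Keras' model.fit (a mapping from class index to a float weight). A hedged usage sketch; model, x_train, y_train and the two file names are assumptions, not part of the original code:

# Hypothetical usage; model, x_train, y_train and the file names are placeholders.
class_weights, weight_array = dist_samples_per_ingredient(
    data, 'ingredients.txt', json_file='recipes.json')
model.fit(x_train, y_train, epochs=10, class_weight=class_weights)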