Code example #1
File: data.py  Project: kkedich/e-cooking
def sample(json_file, data_dir, number_samples=5, images_dir='images'):
    """Get sample: first <number_samples> of recipes dataset"""
    images_path = data_dir + images_dir + '/'
    new_images_path = data_dir + 'sample-images/'

    # Load the full dataset
    full_data = myutils.load_json(data_dir + json_file)

    # Get first recipes for the sample
    sample = {}
    count = 0
    for recipe in full_data:
        if count == number_samples:
            break

        sample[recipe] = full_data[recipe]
        count += 1

    print 'Sample: {} recipes'.format(count)
    myutils.save_json(data_dir + 'sample-{}.json'.format(str(count)), sample)

    print 'Copying image files...'
    copy_images(images_path, new_images_path, sample)

    return sample, new_images_path
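
A minimal usage sketch for sample(), assuming the directory layout used elsewhere in this project (the recipes.json file name is hypothetical):

# Hypothetical paths; data_dir must end with '/' because paths are concatenated
sample_data, sample_images_path = sample('recipes.json', '../data/recipes-ctc/',
                                         number_samples=10)
print 'Saved {} recipes, images copied to {}'.format(len(sample_data), sample_images_path)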
Code example #2
File: pre_processing.py  Project: kkedich/e-cooking
def clean_dataset(json_file, data_dir, output_file):
    """Removes html tags and html entities from the ingredients list of all recipes in <json_file>
       json_file: file.json with the data
       data_dir: directory in which the json_file is stored.
       output_file: output_file in which the pre-processed data will be saved.
    """
    data = myutils.load_json(data_dir + json_file)
    print 'Loaded {} recipes'.format(len(data))

    for recipe in data:
        raw_ingredients = data[recipe]['ingredients']

        final_ingredients = []
        for ingredient in raw_ingredients:
            final_ingredients.append(ingredients_utils.clean_html(ingredient))

        data[recipe]['ingredients'] = final_ingredients

        print raw_ingredients
        print 'new:\n', final_ingredients
        print '\n'

    # Save pre-processed data
    myutils.save_json(data_dir + output_file, data)
    print 'Pre-processed data saved in: {}'.format(data_dir + output_file)
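
A minimal usage sketch for clean_dataset(); the input and output file names here are hypothetical:

# Strips HTML from the ingredients of every recipe and writes a new json file
clean_dataset('recipes.json', '../data/recipes-ctc/', 'recipes-clean.json')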
Code example #3
def timestamp2tweet_mapping(fname, outname):
    """
    generate a mapping from a timestamp (str) to a list of tids
    :param fname:
    :return:
    """
    ti2tw = dict()
    c = 0

    with codecs.open(fname, 'r', 'utf-8') as csvfile:
        # DictReader consumes the first row as the field names by itself,
        # so no separate header pass is needed
        reader = csv.DictReader(csvfile, delimiter=',', quotechar='"')
        for row in reader:

            ts = row['tweet_time']
            tid = row['tweetid']
            ti2tw.setdefault(ts, []).append(tid)
            c += 1
            if c % 1000 == 0:
                logging.info('Processed {}'.format(c))
    save_json(outname, ti2tw)
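
A minimal usage sketch for timestamp2tweet_mapping(), assuming a CSV with 'tweetid' and 'tweet_time' columns (the file names are hypothetical):

import logging

logging.basicConfig(level=logging.INFO)  # Make the progress messages visible
timestamp2tweet_mapping('tweets.csv', 'timestamp2tweet.json')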
Code example #4
File: data.py  Project: kkedich/e-cooking
def split_data(json_file, data_dir, images_dir='images',
               train=0.9, validation_split=0.1, revert=False):
    """ Split the dataset into train, validation and test
       train: float value from 0 to 1 (test will be 1.0 - train - validation = test) specifying the amount of data for training
       and test
       validation_split: float value from 0 to 1 specifying the amount of data from training for validation.
                         Example: train=0.9, validation_split=0.1
                                  test will be 0.1 of the total data and validation will be 0.1 of the train data.
       revert: if True merge the folders 'train' and 'test' of images_path.
    """
    random.seed(100)  # Fixed seed for a reproducible split

    ids, data = recipes_ids(data_dir + json_file)

    images_path = data_dir + images_dir + '/'
    train_path = data_dir + 'train/'
    test_path = data_dir + 'test/'
    val_path = data_dir + 'val/'

    if revert:
        print 'TODO Reverting...'
    else:
        if myutils.directory_exists(train_path) or myutils.directory_exists(test_path):
            print 'Train and/or test folder already exists. Returning the existing split...'

            # Load the existing train, validation, and test data
            data_train = myutils.load_json(data_dir + 'train.json')
            data_test = myutils.load_json(data_dir + 'test.json')
            data_val = myutils.load_json(data_dir + 'validation.json')

            return train_path, val_path, test_path, data_train, data_val, data_test

        data_train = {}
        data_test = {}
        data_val = {}

        size_dataset = len(data)
        samples_train = int(math.ceil(train * size_dataset))
        samples_val = int(math.ceil(validation_split * samples_train))
        samples_train = samples_train - samples_val
        samples_test = size_dataset - samples_train - samples_val

        print 'Total dataset={}, train={}, val={}, test={}'.format(size_dataset, samples_train, samples_val, samples_test)

        # Shuffle data to get random order of recipes
        random.shuffle(ids)

        # Get first samples for training, then validation, and the rest for test
        for index in range(0, samples_train):
            id_recipe = ids[index]

            data_train[id_recipe] = data[id_recipe]
            data.pop(id_recipe)  # Removes the recipe

        # validation
        for index in range(samples_train, (samples_train + samples_val)):
            id_recipe = ids[index]

            data_val[id_recipe] = data[id_recipe]
            data.pop(id_recipe)  # Removes the recipe

        data_test = data

        print 'Split data: {} for training (request={}), {} for validation (request={}),' \
              ' and {} for test (request={})'.format(len(data_train), samples_train,
                                                     len(data_val), samples_val,
                                                     len(data_test), samples_test)

        myutils.save_json(data_dir + 'train.json', data_train)
        myutils.save_json(data_dir + 'test.json', data_test)
        myutils.save_json(data_dir + 'validation.json', data_val)

        # print 'Copying image files...'
        copy_images(images_path, train_path, data_train)
        copy_images(images_path, test_path, data_test)
        copy_images(images_path, val_path, data_val)

        return train_path, val_path, test_path, data_train, data_val, data_test
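
A minimal usage sketch for split_data() with the default 90/10 split (paths are hypothetical):

# Produces train/val/test image folders and train.json/validation.json/test.json
train_path, val_path, test_path, d_train, d_val, d_test = split_data(
    'recipes.json', '../data/recipes-ctc/', train=0.9, validation_split=0.1)
print 'Train={}, val={}, test={}'.format(len(d_train), len(d_val), len(d_test))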
Code example #5
File: pre_processing.py  Project: kkedich/e-cooking
def remove_duplicates(file_name, path_data='../data/', folder='recipes-ctc'):
    """ Remove recipes that have (image) duplicates in the dataset."""

    path_images = path_data + folder + '/images/'
    path_json_file = path_data + folder + '/' + file_name
    path_output_json_file = path_data + folder + '/pre-processed-' + file_name

    data = myutils.load_json(path_json_file)
    recipes_to_be_removed = []
    ignore_list = []

    if len(data) == 0:
        print 'Error: empty file.'
    else:
        print 'Total of {} recipes'.format(len(data))

        # Compute all hashes first.
        recipes_hash = {}
        for recipe in data:
            # Read the image as bytes for hashing
            current_file = open(path_images +
                                data[recipe]['file_image'], 'rb').read()

            hash_image = hashlib.md5(current_file).hexdigest()
            size_image = os.path.getsize(path_images +
                                         data[recipe]['file_image'])

            recipes_hash[recipe] = {}
            recipes_hash[recipe]['hash'] = hash_image
            recipes_hash[recipe]['size'] = size_image

        print 'All hashes were computed. :D'

        # Verifies if there are duplicates
        count = 0
        for dict_index in data:
            if dict_index in recipes_to_be_removed or dict_index in ignore_list:
                continue
            print '{} Checking: {}, URL: {}'.format(
                count, data[dict_index]['file_image'], data[dict_index]['url'])

            list_entries = []
            list_urls = []

            # Compares only with the recipes that come after dict_index in the
            # iteration order, so each pair is checked at most once
            index_achieved = False
            for dict_index_search in data:
                # Skips entries until dict_index is reached
                if not index_achieved:
                    if dict_index_search == dict_index:
                        index_achieved = True
                    continue

                # Ignores recipes already scheduled for removal or ignored earlier
                if (dict_index_search in recipes_to_be_removed)\
                        or (dict_index_search in ignore_list):
                    continue

                # Files with different sizes cannot be identical; this is a
                # cheap pre-check before the hash comparison
                if recipes_hash[dict_index]['size'] != recipes_hash[
                        dict_index_search]['size']:
                    continue

                if recipes_hash[dict_index]['hash'] == recipes_hash[
                        dict_index_search]['hash']:
                    print '--- Found duplicate: {}'.format(
                        path_images + data[dict_index_search]['file_image'])

                    list_entries.append(dict_index_search)
                    list_urls.append(data[dict_index_search]['url'])

            count += 1
            if len(list_urls) == 0:
                continue

            # The user decides which recipe(s) to delete
            for url in list_urls:
                print url

            user_input = raw_input(
                "Which recipe do I remove? (None, ID or list of IDs separated by ,): "
            )
            print 'user input = ', user_input.split(',')

            if user_input.lower() == 'none':
                print 'No recipe will be removed'
                ignore_list.append(dict_index)
                for id_recipe in list_entries:
                    ignore_list.append(id_recipe)
            else:
                ids_to_be_removed = user_input.split(',')

                for id_recipe in ids_to_be_removed:
                    id_recipe = id_recipe.strip()
                    recipes_to_be_removed.append(id_recipe)
                    print 'Included id={} to be removed'.format(id_recipe)

        # Remove recipes
        remove_recipe(recipes_to_be_removed, data, (path_data + folder + '/'))

        # Save the new json file without duplicates
        print 'Saving data...'
        myutils.save_json(path_output_json_file, data)
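
A minimal usage sketch for remove_duplicates(), using its default paths (the json file name is hypothetical). Note that the function is interactive: it prompts on stdin for every group of duplicate images it finds:

remove_duplicates('recipes.json', path_data='../data/', folder='recipes-ctc')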