def loadMSVD():
    """Build the MSVD video-description dataset, store it and reload it as a check.

    Registers the train/val/test caption files as 'text' outputs (id
    'descriptions') and the matching frame lists as 'video' inputs (id
    'videos'), sets the mean used for preprocessing, saves the dataset under
    'Datasets' and finally reloads it and requests 10 training samples.

    All paths are hard-coded relative to '/media/HDD_2TB/DATASETS/MSVD/'.

    :return: None
    """
    logging.info('Loading MSVD dataset')

    # Build basic dataset structure
    #    we assign it a name and the path where the videos are stored

    base_path = '/media/HDD_2TB/DATASETS/MSVD/'
    name = 'MSVD_VideoDescription'
    ds = Dataset(name, base_path)
    max_text_len = 35

    # Let's load the train, val and test splits of the descriptions (outputs)
    #    the files include a description per line. In this dataset a variable number
    #    of descriptions per video are provided.
    #    Only the 'train' split builds the vocabulary.

    ds.setOutput(base_path + 'train_descriptions.txt', 'train',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', build_vocabulary=True, max_text_len=max_text_len)
    ds.setOutput(base_path + 'val_descriptions.txt', 'val',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', max_text_len=max_text_len)
    ds.setOutput(base_path + 'test_descriptions.txt', 'test',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', max_text_len=max_text_len)

    # Let's load the associated videos (inputs)
    #    we must take into account that in this dataset we have a different number of sentences per video,
    #    for this reason we introduce the parameter 'repeat_set'=num_captions, where num_captions is a list
    #    containing the number of captions in each video.

    num_captions_train = np.load(base_path + 'train_descriptions_counts.npy')
    num_captions_val = np.load(base_path + 'val_descriptions_counts.npy')
    num_captions_test = np.load(base_path + 'test_descriptions_counts.npy')

    ds.setInput([base_path + 'train_imgs_list.txt', base_path + 'train_imgs_counts.txt'],
                'train', type='video', id='videos',
                repeat_set=num_captions_train)
    ds.setInput([base_path + 'val_imgs_list.txt', base_path + 'val_imgs_counts.txt'],
                'val', type='video', id='videos',
                repeat_set=num_captions_val)
    ds.setInput([base_path + 'test_imgs_list.txt', base_path + 'test_imgs_counts.txt'],
                'test', type='video', id='videos',
                repeat_set=num_captions_test)

    # Now let's set the dataset mean image for preprocessing the data.
    # The identifier is passed as 'data_id', the keyword used by the other
    # setTrainMean calls in this file.
    ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067], data_id='videos')

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, 'Datasets')

    # We can easily recover it with a single line
    ds = loadDataset('Datasets/Dataset_' + name + '.pkl')

    # Let's recover the first batch of data (only to check that loading works)
    [X, Y] = ds.getXY('train', 10)
    logging.info('Sample data loaded correctly.')
Beispiel #2
0
def loadMSVD():
    """Assemble, persist and sanity-check the MSVD video-description dataset.

    Caption files become 'text' outputs (id 'descriptions'), frame lists
    become 'video' inputs (id 'videos'). The dataset is saved under
    'Datasets', reloaded, and 10 training samples are requested from it.

    :return: None
    """
    logging.info('Loading MSVD dataset')

    root = '/media/HDD_2TB/DATASETS/MSVD/'
    name = 'MSVD_VideoDescription'
    splits = ('train', 'val', 'test')
    caption_len_limit = 35

    # Basic dataset structure: a name plus the folder holding the data.
    ds = Dataset(name, root)

    # Outputs: one description per line, with a variable number of
    # descriptions per video.
    for split in splits:
        # Only the training captions are used to build the vocabulary.
        vocab_kwargs = {'build_vocabulary': True} if split == 'train' else {}
        ds.setOutput(root + split + '_descriptions.txt', split,
                     type='text', id='descriptions',
                     tokenization='tokenize_basic',
                     max_text_len=caption_len_limit,
                     **vocab_kwargs)

    # Number of captions of every video, per split. All three files are read
    # before any input is registered.
    captions_per_video = {
        split: np.load(root + split + '_descriptions_counts.npy')
        for split in splits
    }

    # Inputs: every video is repeated once per caption through 'repeat_set'.
    for split in splits:
        frame_files = [root + split + '_imgs_list.txt',
                       root + split + '_imgs_counts.txt']
        ds.setInput(frame_files, split,
                    type='video', id='videos',
                    repeat_set=captions_per_video[split])

    # Mean image used when preprocessing the data.
    ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067], data_id='videos')

    # Store the finished dataset and read it back right away.
    saveDataset(ds, 'Datasets')
    ds = loadDataset('Datasets/Dataset_%s.pkl' % name)

    # Fetch a first batch to check that everything loads.
    X, Y = ds.getXY('train', 10)
    logging.info('Sample data loaded correctly.')
Beispiel #3
0
def loadFood101():
    """Assemble, persist and sanity-check the Food101 classification dataset.

    Image lists are registered as 'image' inputs (id 'images') and label files
    as 'categorical' outputs (id 'labels'). The dataset is saved under
    'Datasets', reloaded, and 10 training samples are requested from it.

    :return: None
    """
    logging.info('Loading Food101 dataset')
    logging.info(
        'INFO: in order to load this dataset it must be placed in ../data/Food101/images/ after downloading it form https://www.vision.ee.ethz.ch/datasets_extra/food-101/'
    )

    root = '../data/Food101/'
    name = 'Food101'
    ds = Dataset(name, root + 'images')

    # Inputs (images): (split, list file, extra keyword arguments).
    # Only the training split sets a crop size.
    image_lists = (
        ('train', 'meta/train_split.txt', {'img_size_crop': [227, 227, 3]}),
        ('val', 'meta/val_split.txt', {}),
        ('test', 'meta/test.txt', {}),
    )
    for split, list_file, extra in image_lists:
        ds.setInput(root + list_file, split, type='image', id='images', **extra)

    # Outputs (labels)
    for split in ('train', 'val', 'test'):
        ds.setOutput(root + 'meta/' + split + '_labels.txt', split,
                     type='categorical', id='labels')

    # List of class names (strings) attached to the 'labels' output
    ds.setClasses(root + 'meta/classes.txt', 'labels')

    # Mean image used when preprocessing the data
    ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067], data_id='images')

    # Store the finished dataset and read it back right away
    saveDataset(ds, 'Datasets')
    ds = loadDataset('Datasets/Dataset_%s.pkl' % name)

    # Fetch a first batch to check that everything loads
    X, Y = ds.getXY('train', 10)
    logging.info('Sample data loaded correctly.')
def loadFlickr8k():
    """Build the Flickr8k image-description dataset, store it and reload it as a check.

    Registers the train/val/test caption files as 'text' outputs (id
    'descriptions') and the image lists as 'image' inputs (id 'images'),
    repeating every image 5 times to match its 5 captions. Sets the mean used
    for preprocessing, saves the dataset under 'Datasets' and finally reloads
    it and requests 10 training samples.

    All paths are hard-coded relative to '/media/HDD_2TB/DATASETS/Flickr8k/'.

    :return: None
    """
    logging.info('Loading Flickr8k dataset')

    # Build basic dataset structure
    #    we assign it a name and the path where the images are stored

    base_path = '/media/HDD_2TB/DATASETS/Flickr8k/'
    name = 'Flickr8k_ImageDescription'
    ds = Dataset(name, base_path + 'Flicker8k_Dataset')
    max_text_len = 35

    # Let's load the train, val and test splits of the descriptions (outputs)
    #    the files include a description per line
    #    and a set of 5 consecutive descriptions correspond to a single input image.
    #    Only the 'train' split builds the vocabulary.

    ds.setOutput(base_path + 'text/train_descriptions.txt', 'train',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', build_vocabulary=True, max_text_len=max_text_len)
    ds.setOutput(base_path + 'text/val_descriptions.txt', 'val',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', max_text_len=max_text_len)
    ds.setOutput(base_path + 'text/test_descriptions.txt', 'test',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', max_text_len=max_text_len)

    # Let's load the associated images (inputs)
    #    we must take into account that in this dataset we have 5 sentences per image,
    #    for this reason we introduce the parameter 'repeat_set'=5

    ds.setInput(base_path + 'text/Flickr_8k.trainImages.txt', 'train',
                type='image', id='images', repeat_set=5)
    ds.setInput(base_path + 'text/Flickr_8k.devImages.txt', 'val',
                type='image', id='images', repeat_set=5)
    ds.setInput(base_path + 'text/Flickr_8k.testImages.txt', 'test',
                type='image', id='images', repeat_set=5)

    # Now let's set the dataset mean image for preprocessing the data.
    # The identifier is passed as 'data_id', the keyword used by the other
    # setTrainMean calls in this file.
    ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067], data_id='images')

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, 'Datasets')

    # We can easily recover it with a single line
    ds = loadDataset('Datasets/Dataset_' + name + '.pkl')

    # Let's recover the first batch of data (only to check that loading works)
    [X, Y] = ds.getXY('train', 10)
    logging.info('Sample data loaded correctly.')
Beispiel #5
0
def loadFlickr8k():
    """Assemble, persist and sanity-check the Flickr8k image-description dataset.

    Caption files become 'text' outputs (id 'descriptions'), image lists
    become 'image' inputs (id 'images') repeated 5 times each. The dataset is
    saved under 'Datasets', reloaded, and 10 training samples are requested;
    the number of returned input and output samples is logged.

    :return: None
    """
    logging.info('Loading Flickr8k dataset')

    root = '/media/HDD_2TB/DATASETS/Flickr8k/'
    name = 'Flickr8k_ImageDescription'
    caption_len_limit = 35

    # Basic dataset structure: a name plus the folder holding the images.
    ds = Dataset(name, root + 'Flicker8k_Dataset')

    # Outputs: one description per line; 5 consecutive descriptions belong to
    # the same image.
    for split in ('train', 'val', 'test'):
        # Only the training captions are used to build the vocabulary.
        vocab_kwargs = {'build_vocabulary': True} if split == 'train' else {}
        ds.setOutput(root + 'text/' + split + '_descriptions.txt', split,
                     type='text', id='descriptions',
                     tokenization='tokenize_basic',
                     max_text_len=caption_len_limit,
                     **vocab_kwargs)

    # Inputs: the image list of each split ('val' reads the 'dev' list).
    # Every image is repeated 5 times, once per sentence.
    image_lists = (
        ('train', 'text/Flickr_8k.trainImages.txt'),
        ('val', 'text/Flickr_8k.devImages.txt'),
        ('test', 'text/Flickr_8k.testImages.txt'),
    )
    for split, list_file in image_lists:
        ds.setInput(root + list_file, split,
                    type='image', id='images', repeat_set=5)

    # Mean image used when preprocessing the data.
    ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067], data_id='images')

    # Store the finished dataset and read it back right away.
    saveDataset(ds, 'Datasets')
    ds = loadDataset('Datasets/Dataset_%s.pkl' % name)

    # Fetch a first batch and report how many samples came back.
    X, Y = ds.getXY('train', 10)
    sample_counts = (len(X), len(Y))
    logging.info('Sample data loaded correctly. %d input samples. %d output samples' % sample_counts)
Beispiel #6
0
def loadFood101():
    """Create the Food101 dataset, store it, and reload it as a smoke check.

    Inputs are the image lists under 'meta/' (id 'images'); outputs are the
    categorical label files (id 'labels'). After saving under 'Datasets' the
    dataset is loaded again and 10 training samples are requested.

    :return: None
    """
    logging.info('Loading Food101 dataset')
    logging.info(
        'INFO: in order to load this dataset it must be placed in ../data/Food101/images/ after downloading it form https://www.vision.ee.ethz.ch/datasets_extra/food-101/')

    data_dir = '../data/Food101/'
    meta_dir = data_dir + 'meta/'
    name = 'Food101'
    ds = Dataset(name, data_dir + 'images')

    # Images (inputs). The crop size is only given for the training split.
    ds.setInput(meta_dir + 'train_split.txt', 'train', type='image', id='images',
                img_size_crop=[227, 227, 3])
    for split, list_name in (('val', 'val_split.txt'), ('test', 'test.txt')):
        ds.setInput(meta_dir + list_name, split, type='image', id='images')

    # Labels (outputs)
    for split in ('train', 'val', 'test'):
        label_file = meta_dir + split + '_labels.txt'
        ds.setOutput(label_file, split, type='categorical', id='labels')

    # Class names (strings) for the 'labels' output
    ds.setClasses(meta_dir + 'classes.txt', 'labels')

    # Mean image for preprocessing
    mean_image = [122.6795, 116.6690, 104.0067]
    ds.setTrainMean(mean_image=mean_image, data_id='images')

    # Persist the dataset, then recover it from disk
    saveDataset(ds, 'Datasets')
    ds = loadDataset('Datasets/Dataset_' + name + '.pkl')

    # Request a first batch to confirm the stored dataset is usable
    X, Y = ds.getXY('train', 10)
    logging.info('Sample data loaded correctly.')
def build_dataset(params):
    """
    Builds (or loads) a Dataset instance.

    When params['REBUILD_DATASET'] is truthy, a new Dataset is assembled from
    the files in params['TEXT_FILES'] (source files end in params['SRC_LAN'],
    target files in params['TRG_LAN']), its references are prepared and it is
    stored in params['DATASET_STORE_PATH']. Otherwise the stored
    'Dataset_<name>.pkl' is loaded from that path and its references are
    prepared again.

    :param params: Parameters specifying Dataset options
    :return: Dataset object
    """
    # Name shared by the log message, the Dataset constructor and the stored file.
    name = params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN']

    if params['REBUILD_DATASET']:  # We build a new dataset instance
        silence = not params['VERBOSE'] > 0
        if not silence:
            logger.info('Building ' + name + ' dataset')

        base_path = params['DATA_ROOT_PATH']
        ds = Dataset(name, base_path, silence=silence)

        # Options shared by several of the calls below.
        tokenization = params.get('TOKENIZATION_METHOD', 'tokenize_none')
        pad_on_batch = params.get('PAD_ON_BATCH', True)
        fill = params.get('FILL', 'end')
        bpe_codes = params.get('BPE_CODES_PATH', None)
        output_id = params['OUTPUTS_IDS_DATASET'][0]

        # OUTPUT DATA
        # Load the train, val and test splits of the target language sentences (outputs).
        # The files include a sentence per line.
        ds.setOutput(
            os.path.join(base_path, params['TEXT_FILES']['train'] + params['TRG_LAN']),
            'train',
            type=params.get(
                'OUTPUTS_TYPES_DATASET',
                ['dense-text'] if 'sparse' in params['LOSS'] else ['text'])[0],
            id=output_id,
            tokenization=tokenization,
            build_vocabulary=True,
            pad_on_batch=pad_on_batch,
            sample_weights=params.get('SAMPLE_WEIGHTS', True),
            fill=fill,
            max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
            max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
            min_occ=params.get('MIN_OCCURRENCES_OUTPUT_VOCAB', 0),
            bpe_codes=bpe_codes,
            label_smoothing=params.get('LABEL_SMOOTHING', 0.))

        for split in ['val', 'test']:
            if params['TEXT_FILES'].get(split) is not None:
                ds.setOutput(
                    os.path.join(base_path, params['TEXT_FILES'][split] + params['TRG_LAN']),
                    split,
                    type='text',  # The type of the references should be always 'text'
                    id=output_id,
                    pad_on_batch=pad_on_batch,
                    tokenization=tokenization,
                    sample_weights=params.get('SAMPLE_WEIGHTS', True),
                    max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                    max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                    bpe_codes=bpe_codes,
                    label_smoothing=0.)

        # INPUT DATA
        # The 'train' split must be processed first, because it is the one that
        # builds the vocabulary. The stable sort moves 'train' to the front and
        # keeps the remaining splits in their original relative order.
        ordered_splits = sorted(params['TEXT_FILES'],
                                key=lambda split_name: split_name != 'train')
        input_types = params.get('INPUTS_TYPES_DATASET', ['text', 'text'])
        input_ids = params['INPUTS_IDS_DATASET']

        for split in ordered_splits:
            source_file = os.path.join(
                base_path, params['TEXT_FILES'][split] + params['SRC_LAN'])
            build_vocabulary = split == 'train'
            ds.setInput(source_file,
                        split,
                        type=input_types[0],
                        id=input_ids[0],
                        pad_on_batch=pad_on_batch,
                        tokenization=tokenization,
                        build_vocabulary=build_vocabulary,
                        fill=fill,
                        max_text_len=params.get('MAX_INPUT_TEXT_LEN', 70),
                        max_words=params.get('INPUT_VOCABULARY_SIZE', 0),
                        min_occ=params.get('MIN_OCCURRENCES_INPUT_VOCAB', 0),
                        bpe_codes=bpe_codes)

            if len(input_ids) > 1:
                if 'train' in split:
                    # Second input: the target sentences, registered with offset=1.
                    # build_vocabulary is given the output id rather than a boolean.
                    ds.setInput(
                        os.path.join(
                            base_path,
                            params['TEXT_FILES'][split] + params['TRG_LAN']),
                        split,
                        type=input_types[1],
                        id=input_ids[1],
                        required=False,
                        tokenization=tokenization,
                        pad_on_batch=pad_on_batch,
                        build_vocabulary=output_id,
                        offset=1,
                        fill=fill,
                        max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                        max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                        bpe_codes=bpe_codes)
                    if params.get('TIE_EMBEDDINGS', False):
                        ds.merge_vocabularies([input_ids[1], input_ids[0]])
                else:
                    # Non-training splits only get a 'ghost' placeholder input.
                    ds.setInput(None,
                                split,
                                type='ghost',
                                id=input_ids[-1],
                                required=False)

            if params.get('ALIGN_FROM_RAW', True) and not params.get(
                    'HOMOGENEOUS_BATCHES', False):
                # Also register the source file itself as a raw 'file-name' input.
                ds.setRawInput(source_file,
                               split,
                               type='file-name',
                               id='raw_' + input_ids[0])

        if params.get('POS_UNK', False):
            if params.get('HEURISTIC', 0) > 0:
                ds.loadMapping(params['MAPPING'])

        # Prepare references
        prepare_references(ds, repeat=1, n=1, set_names=params['EVAL_ON_SETS'])

        # We have finished loading the dataset, now we can store it for using it in the future
        saveDataset(ds, params['DATASET_STORE_PATH'])

    else:
        # We can easily recover it with a single line
        ds = loadDataset(
            os.path.join(params['DATASET_STORE_PATH'],
                         'Dataset_' + name + '.pkl'))

        # Prepare references
        prepare_references(ds, repeat=1, n=1, set_names=params['EVAL_ON_SETS'])

    return ds
Beispiel #8
0
            max_text_len=30,
            max_words=30000)

# Placeholder ('ghost') input for the validation split, registered with a None path.
ds.setInput(None, 'val',
            id='state_below', type='ghost',
            required=False, pad_on_batch=True)

# Training replies: this split builds the vocabulary (at most 30000 words).
ds.setOutput('data/Cornell_train_reply.en', 'train',
             id='target_text', type='text',
             tokenization='tokenize_basic', build_vocabulary=True,
             max_text_len=30, max_words=30000, min_occ=0,
             pad_on_batch=True, sample_weights=True)

# Validation replies: same output id, no vocabulary building.
ds.setOutput('data/Cornell_valid_reply.en', 'val',
             id='target_text', type='text',
             tokenization='tokenize_basic',
             max_text_len=30, max_words=0,
             pad_on_batch=True, sample_weights=True)
Beispiel #9
0
def build_dataset(params):
    """
    Builds (or loads) the Dataset instance for visual question answering.

    When params['REBUILD_DATASET'] is truthy, a new Dataset is assembled from
    the question, image-feature and answer files listed in params
    ('QST_FILES', 'IMG_FILES', 'ANS_FILES'), optionally filtered, and stored in
    params['DATA_ROOT_PATH']. Otherwise the stored
    'Dataset_<DATASET_NAME>.pkl' is loaded from that same path.

    The 'train' and 'val' answers are required; 'test' answers are only
    registered when params['ANS_FILES'] has a 'test' entry.

    :param params: Parameters specifying Dataset options
    :return: Dataset object
    """
    if not params['REBUILD_DATASET']:
        # We can easily recover it with a single line
        return loadDataset(params['DATA_ROOT_PATH'] + '/Dataset_' +
                           params['DATASET_NAME'] + '.pkl')

    # We build a new dataset instance
    silence = not params['VERBOSE'] > 0
    if not silence:
        logging.info('Building ' + params['DATASET_NAME'] + ' dataset')

    base_path = params['DATA_ROOT_PATH']
    name = params['DATASET_NAME']
    ds = Dataset(name, base_path, silence=silence)

    splits = ('train', 'val', 'test')
    # Answers for 'test' are optional.
    answer_splits = [split for split in splits
                     if split != 'test' or 'test' in params['ANS_FILES']]
    qst_id = params['INPUTS_IDS_DATASET'][0]
    img_id = params['INPUTS_IDS_DATASET'][1]
    ans_id = params['OUTPUTS_IDS_DATASET'][0]

    def data_file(files_key, split, index):
        # Path of the index-th file configured for `split` under params[files_key].
        return base_path + '/' + params[files_key][split][index]

    ##### INPUT DATA
    ### QUESTIONS
    for split in splits:
        # Only the 'train' split builds the vocabulary.
        vocab_kwargs = {'build_vocabulary': True} if split == 'train' else {}
        ds.setInput(data_file('QST_FILES', split, 0),
                    split,
                    type='text',
                    id=qst_id,
                    tokenization=params['TOKENIZATION_METHOD'],
                    fill=params['FILL'],
                    max_text_len=params['MAX_INPUT_TEXT_LEN'],
                    max_words=params['INPUT_VOCABULARY_SIZE'],
                    repeat_set=params['REPEAT_QST'],
                    **vocab_kwargs)
    ### QUESTIONS' associated IDs
    for split in splits:
        ds.setInput(data_file('QST_FILES', split, 1),
                    split,
                    type='id',
                    id=qst_id + '_ids',
                    repeat_set=params['REPEAT_QST'])

    ### IMAGES
    for split in splits:
        ds.setInput(data_file('IMG_FILES', split, 0),
                    split,
                    type='image-features',
                    id=img_id,
                    feat_len=params['IMG_FEAT_SIZE'],
                    repeat_set=params['REPEAT_IMG'])
    ### IMAGES' associated IDs
    for split in splits:
        ds.setInput(data_file('IMG_FILES', split, 1),
                    split,
                    type='id',
                    id=img_id + '_ids',
                    repeat_set=params['REPEAT_IMG'])

    ##### OUTPUT DATA
    ### ANSWERS
    for split in answer_splits:
        vocab_kwargs = {'build_vocabulary': True} if split == 'train' else {}
        ds.setOutput(data_file('ANS_FILES', split, 0),
                     split,
                     type='text',
                     id=ans_id,
                     tokenization=params['TOKENIZATION_METHOD'],
                     fill=params['FILL'],
                     max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
                     max_words=params['OUTPUT_VOCABULARY_SIZE'],
                     **vocab_kwargs)

    # Load extra variables (we need the original path to questions and annotations for VQA evaluation)
    for split in splits:
        ds.extra_variables[split] = dict()
    for split in splits:
        ds.extra_variables[split]['quesFile'] = data_file('QST_FILES', split, 2)
    # The 'test' annotation file is read from ANS_FILES['test'].
    for split in answer_splits:
        ds.extra_variables[split]['annFile'] = data_file('ANS_FILES', split, 1)

    # Remove all samples of the train set not belonging to the top classes chosen
    if params['KEEP_TOP_ANSWERS']:
        ds.keepTopOutputs('train', ans_id, params['OUTPUT_VOCABULARY_SIZE'])
    # Filter top K answers per question-image pair
    if params['FILTER_ANSWERS']:
        filter_k_frequent_answers(ds, params)

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, params['DATA_ROOT_PATH'])

    return ds
Beispiel #10
0
from keras_wrapper.dataset import Dataset, saveDataset
from data_engine.prepare_data import keep_n_captions
# Dataset named 'HB_dataset' with 'HB' as its second argument; verbose output enabled.
ds = Dataset('HB_dataset', 'HB', silence=False)

# Training targets (English): this split builds the vocabulary (at most 30000 words).
ds.setOutput('examples/ZhEnTrans/training.en', 'train',
             id='target_text', type='text',
             tokenization='tokenize_none', build_vocabulary=True,
             max_text_len=30, max_words=30000, min_occ=0,
             pad_on_batch=True, sample_weights=True)

# Validation targets: same output id, no vocabulary building.
ds.setOutput('examples/ZhEnTrans/dev.en', 'val',
             id='target_text', type='text',
             tokenization='tokenize_none',
             max_text_len=30, max_words=0,
             pad_on_batch=True, sample_weights=True)
ds.setInput('examples/ZhEnTrans/training.zh',
            'train',
            type='text',
            id='source_text',
            pad_on_batch=True,
            tokenization='tokenize_none',
            build_vocabulary=True,
Beispiel #11
0
def build_dataset(params):
    """Build, or reload from disk, the Dataset described by ``params``.

    When ``params['REBUILD_DATASET']`` is truthy a new ``Dataset`` is created,
    filled with raw-image inputs and ``params['TYPE_OUT']`` outputs for the
    'train' split (and for 'val' / 'test' whenever ``params['IMG_FILES']``
    holds a non-empty entry for them) and finally stored with ``saveDataset``.
    Otherwise the pickle found under ``params['DATASET_STORE_PATH']`` is
    loaded with ``loadDataset``.

    :param params: configuration dictionary.
    :return: the built or loaded Dataset instance.
    """
    if not params['REBUILD_DATASET']:
        # Nothing to build: recover the stored instance with a single call.
        ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' +
                         params['DATASET_NAME'] + '.pkl')
        return ds

    verbose = params['VERBOSE'] > 0
    if verbose:
        logging.info('Building ' + params['DATASET_NAME'] + ' dataset')

    base_path = params['DATA_ROOT_PATH']
    ds = Dataset(params['DATASET_NAME'], base_path, silence=not verbose)

    all_splits = ('train', 'val', 'test')

    def split_is_configured(split_name):
        # 'train' is always processed; the other splits need a non-empty
        # entry in params['IMG_FILES'].
        if split_name == 'train':
            return True
        return (split_name in params['IMG_FILES'] and
                params['IMG_FILES'][split_name])

    # ---- Inputs: raw images (first file of every split entry) ----
    for split_name in all_splits:
        if not split_is_configured(split_name):
            continue
        image_list = base_path + '/' + params['IMG_FILES'][split_name][0]
        ds.setInput(image_list,
                    split_name,
                    type='raw-image',
                    id=params['INPUTS_IDS_DATASET'][0],
                    img_size=params['IMG_SIZE'],
                    img_size_crop=params['IMG_CROP_SIZE'],
                    use_RGB=params['RGB'])

    # ---- Training mean: taken from the config if given, computed otherwise ----
    if params['MEAN_IMAGE']:
        ds.setTrainMean(params['MEAN_IMAGE'], params['INPUTS_IDS_DATASET'][0])
    else:
        ds.calculateTrainMean(params['INPUTS_IDS_DATASET'][0])

    # ---- Class names ----
    output_kind = params['TYPE_OUT']
    if output_kind == '3DSemanticLabel':
        # Only the first comma-separated field of every line is the class
        # name; the same file is then handed over to setSemanticClasses.
        classes_path = base_path + '/' + params['CLASSES_PATH']
        with open(classes_path, 'r') as classes_file:
            classes_names = [row.rstrip('\n').split(',')[0]
                             for row in classes_file]
        ds.setClasses(classes_names, params['OUTPUTS_IDS_DATASET'][0])
        ds.setSemanticClasses(base_path + '/' + params['CLASSES_PATH'],
                              params['OUTPUTS_IDS_DATASET'][0])
    elif output_kind == '3DLabel':
        ds.setClasses(base_path + '/' + params['CLASSES_PATH'],
                      params['OUTPUTS_IDS_DATASET'][0])

    # ---- Outputs: label files (second file of every split entry) ----
    for split_name in all_splits:
        if not split_is_configured(split_name):
            continue
        ds.setOutput(base_path + '/' + params['IMG_FILES'][split_name][1],
                     split_name,
                     type=params['TYPE_OUT'],
                     id=params['OUTPUTS_IDS_DATASET'][0],
                     associated_id_in=params['INPUTS_IDS_DATASET'][0],
                     num_poolings=params['NUM_MODEL_POOLINGS'])

    # ---- Class weights ----
    weights_key = 'class_weights_' + params['OUTPUTS_IDS_DATASET'][0]
    if params['DISCARD_CLASSES']:
        # Discarded classes get weight 0, every other class weight 1.
        weights = np.ones((params['NUM_CLASSES'], ))
        for discarded in params['DISCARD_CLASSES']:
            weights[discarded] = 0.0
        ds.extra_variables[weights_key] = weights

    if params['WEIGHT_CLASSES']:
        # Explicit weights replace whatever was stored just above.
        ds.extra_variables[weights_key] = params['WEIGHT_CLASSES']

    # ---- Optional extra multi-label (binary) output ----
    if params['APPLY_MULTILABEL_CLASSIFICATION']:
        n_classes = len(ds.classes[params['OUTPUTS_IDS_DATASET'][0]])
        for split_name in all_splits:
            if not split_is_configured(split_name):
                continue
            multilabel = convert3DLabels2multilabel(
                base_path + '/' + params['IMG_FILES'][split_name][1],
                n_classes)
            ds.setOutput(multilabel,
                         split_name,
                         type='binary',
                         id=params['OUTPUTS_IDS_DATASET'][1])

    # The dataset is complete: store it so later runs can simply reload it.
    saveDataset(ds, params['DATASET_STORE_PATH'])
    return ds
def build_dataset(params):
    """Build, or reload from disk, the video-description Dataset.

    When ``params['REBUILD_DATASET']`` is truthy a new ``Dataset`` is created
    with the description sentences as outputs and the per-feature frame lists
    as inputs, post-processed with ``keep_n_captions`` for the 'val' and
    'test' splits, and stored with ``saveDataset``.  Otherwise the pickle
    found under ``params['DATASET_STORE_PATH']`` is loaded with
    ``loadDataset``.

    The text options (tokenization, fill, lengths, vocabulary limits) are
    read once, with defaults, and shared between the output sentences and the
    optional shifted copy of them that is registered as the last input, so
    that both are always processed with the same settings.

    :param params: configuration dictionary.
    :return: the built or loaded Dataset instance.
    """
    if not params['REBUILD_DATASET']:
        # We can easily recover it with a single line
        return loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' +
                           params['DATASET_NAME'] + '.pkl')

    if params['VERBOSE'] > 0:
        silence = False
        logging.info('Building ' + params['DATASET_NAME'] + ' dataset')
    else:
        silence = True

    base_path = params['DATA_ROOT_PATH']
    name = params['DATASET_NAME']
    ds = Dataset(name, base_path, silence=silence)

    # Text options shared by every sentence-based input/output below.
    tokenization = params.get('TOKENIZATION_METHOD', 'tokenize_none')
    pad_on_batch = params.get('PAD_ON_BATCH', True)
    sample_weights = params.get('SAMPLE_WEIGHTS', True)
    fill = params.get('FILL', 'end')
    max_text_len = params.get('MAX_OUTPUT_TEXT_LEN', 70)
    max_words = params.get('OUTPUT_VOCABULARY_SIZE', 0)
    min_occ = params.get('MIN_OCCURRENCES_OUTPUT_VOCAB', 0)
    bpe_codes = params.get('BPE_CODES_PATH', None)

    # OUTPUT DATA
    # Let's load the train, val and test splits of the descriptions (outputs)
    #    the files include a description per line. In this dataset a variable number
    #    of descriptions per video are provided.
    ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['train'],
                 'train',
                 type=params['OUTPUTS_TYPES_DATASET'][0],
                 id=params['OUTPUTS_IDS_DATASET'][0],
                 tokenization=tokenization,
                 build_vocabulary=True,
                 pad_on_batch=pad_on_batch,
                 sample_weights=sample_weights,
                 fill=fill,
                 max_text_len=max_text_len,
                 max_words=max_words,
                 min_occ=min_occ,
                 bpe_codes=bpe_codes,
                 label_smoothing=params.get('LABEL_SMOOTHING', 0.))

    # 'val' and 'test' descriptions are optional and never label-smoothed.
    for split in ['val', 'test']:
        if params['DESCRIPTION_FILES'].get(split) is not None:
            ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES'][split],
                         split,
                         type=params['OUTPUTS_TYPES_DATASET'][0],
                         id=params['OUTPUTS_IDS_DATASET'][0],
                         pad_on_batch=pad_on_batch,
                         tokenization=tokenization,
                         sample_weights=sample_weights,
                         max_text_len=max_text_len,
                         max_words=max_words,
                         bpe_codes=bpe_codes,
                         label_smoothing=0.)

    # INPUT DATA
    # Let's load the associated videos (inputs)
    # we must take into account that in this dataset we have a different number of sentences per video,
    # for this reason we introduce the parameter 'repeat_set'=num_captions, where num_captions is a list
    # containing the number of captions in each video.
    variable_captions = params['LABELS_PER_SAMPLE'] == 0
    if variable_captions:
        num_captions_train = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['train'])
        num_captions_val = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['val'])
        num_captions_test = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['test'])
    else:
        # A fixed number of captions per sample is used for every split.
        num_captions_train = params['LABELS_PER_SAMPLE']
        num_captions_val = params['LABELS_PER_SAMPLE']
        num_captions_test = params['LABELS_PER_SAMPLE']

    for n_feat, feat_type in enumerate(params['FEATURE_NAMES']):
        for split, num_cap in zip(['train', 'val', 'test'],
                                  [num_captions_train, num_captions_val, num_captions_test]):
            # The configured file names carry a '%s'-style placeholder that
            # is filled with the feature name.
            list_files = base_path + '/' + params['FRAMES_LIST_FILES'][split] % feat_type

            if variable_captions:
                counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][split] % feat_type
                input_dataset = [list_files, counts_files]
            else:
                input_dataset = list_files

            ds.setInput(input_dataset,
                        split,
                        type=params['INPUTS_TYPES_DATASET'][n_feat],
                        id=params['INPUTS_IDS_DATASET'][0],
                        repeat_set=num_cap,
                        # video-features parameters
                        max_video_len=params['NUM_FRAMES'],
                        # image-features parameters
                        feat_len=params['FEATURE_DIMENSION'],
                        # raw-image parameters
                        img_size=params['IMG_SIZE'],
                        img_size_crop=params['IMG_CROP_SIZE'])

    if len(params['INPUTS_IDS_DATASET']) > 1:
        # The last input is the training descriptions shifted by one position
        # (offset=1); it reuses the vocabulary of the output and therefore
        # must be processed with exactly the same text options.
        ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['train'],
                    'train',
                    type=params['INPUTS_TYPES_DATASET'][-1],
                    id=params['INPUTS_IDS_DATASET'][-1],
                    required=False,
                    tokenization=tokenization,
                    pad_on_batch=True,
                    build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
                    offset=1,
                    fill=fill,
                    max_text_len=max_text_len,
                    max_words=max_words,
                    bpe_codes=bpe_codes,
                    min_occ=min_occ)

        # That input has no data for 'val'/'test'; register 'ghost'
        # placeholders under the same id.
        ds.setInput(None, 'val', type='ghost',
                    id=params['INPUTS_IDS_DATASET'][-1], required=False)
        ds.setInput(None, 'test', type='ghost',
                    id=params['INPUTS_IDS_DATASET'][-1], required=False)

    # Process dataset for keeping only one caption per video and storing the rest in a dict() with the following format:
    #        ds.extra_variables[set_name][id_output][img_position] = [cap1, cap2, cap3, ..., capN]
    keep_n_captions(ds, repeat=[num_captions_val, num_captions_test], n=1,
                    set_names=['val', 'test'])

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, params['DATASET_STORE_PATH'])
    return ds
def build_dataset_val_test(params):
    """Create a Dataset holding only the 'val' and 'test' splits.

    Three inputs (a text input and two 'image-features' inputs) and one
    'real' output are registered for each of the two splits, reading the
    files found under ``params['DATA_ROOT_PATH'] + '/data/'``.

    :param params: configuration dictionary.
    :return: the populated Dataset instance (it is not stored on disk here).
    """
    base_path = params['DATA_ROOT_PATH']
    ds = Dataset(params['DATASET_NAME'], base_path, silence=False)

    # (split, relative file) pairs; 'val' always precedes 'test'.
    text_files = (('val', '/data/new_dishes_val.txt'),
                  ('test', '/data/new_dishes_test.txt'))
    links_files = (('val', '/data/new_links_val.txt'),
                   ('test', '/data/new_links_test.txt'))
    cnn_files = (('val', '/data/new_cnn_val.txt'),
                 ('test', '/data/new_cnn_test.txt'))
    output_files = (('val', '/data/new_outs_val.txt'),
                    ('test', '/data/new_outs_test.txt'))

    # INPUT DATA: text (second input id); a vocabulary is built for each split.
    for split, rel_path in text_files:
        ds.setInput(base_path + rel_path,
                    split,
                    type='text',
                    id=params['INPUTS_IDS_DATASET'][1],
                    build_vocabulary=True,
                    pad_on_batch=True,
                    tokenization=params['TOKENIZATION_METHOD'],
                    max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'],
                    min_occ=params['MIN_OCCURRENCES_VOCAB'])

    # INPUT DATA: image features (first input id).
    for split, rel_path in links_files:
        ds.setInput(base_path + rel_path,
                    split,
                    type='image-features',
                    id=params['INPUTS_IDS_DATASET'][0],
                    feat_len=params['IMG_FEAT_SIZE'])

    # INPUT DATA: image features of length CNN_SIZE (third input id).
    for split, rel_path in cnn_files:
        ds.setInput(base_path + rel_path,
                    split,
                    type='image-features',
                    id=params['INPUTS_IDS_DATASET'][2],
                    feat_len=params['CNN_SIZE'])

    # OUTPUT DATA: real-valued targets.
    for split, rel_path in output_files:
        ds.setOutput(base_path + rel_path,
                     split,
                     type='real',
                     id=params['OUTPUTS_IDS_DATASET'][0])

    return ds
Beispiel #14
0
def build_dataset(params):
    """Build, or reload from disk, the text-classification Dataset.

    When ``params['REBUILD_DATASET']`` is truthy a new ``Dataset`` is created
    with one categorical output per split listed in ``params['CLASS_FILES']``
    and one text input per (split, input id) pair listed in
    ``params['TEXT_FILES']``; it is then post-processed with
    ``keep_n_captions`` and stored with ``saveDataset``.  Otherwise the pickle
    found under ``params['DATASET_STORE_PATH']`` is loaded with
    ``loadDataset``.

    :param params: configuration dictionary.
    :return: the built or loaded Dataset instance.
    """
    if not params['REBUILD_DATASET']:
        # We can easily recover it with a single line
        return loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '.pkl')

    if params['VERBOSE'] > 0:
        silence = False
        logging.info('Building ' + params['DATASET_NAME'] + ' dataset')
    else:
        silence = True
    # NOTE: base_path is only handed to the Dataset constructor; the file
    # names found in params are passed to setInput/setOutput unchanged.
    base_path = params['DATA_ROOT_PATH'] + '/'
    name = params['DATASET_NAME']
    ds = Dataset(name, base_path, silence=silence)

    # OUTPUT DATA
    # One file of class labels per split.
    # print(...) with a single argument behaves the same on Python 2 and 3,
    # whereas the statement form is a SyntaxError on Python 3.
    print(params['CLASS_FILES'])
    for split in params['CLASS_FILES']:
        ds.setOutput(params['CLASS_FILES'][split],
                     split,
                     type='categorical',
                     id=params['OUTPUTS_IDS_DATASET'][0],
                     sample_weights=params['SAMPLE_WEIGHTS'])

    # INPUT DATA
    # One text file per (split, input id); build_vocabulary is only
    # requested for the 'train' split.
    for split in params['TEXT_FILES']:
        build_vocabulary = split == 'train'
        for i, input_id in enumerate(params['INPUTS_IDS_DATASET']):
            ds.setInput(params['TEXT_FILES'][split][i],
                        split,
                        type='text',
                        id=input_id,
                        pad_on_batch=params['PAD_ON_BATCH'],
                        tokenization=params['TOKENIZATION_METHOD'],
                        build_vocabulary=build_vocabulary,
                        fill=params['FILL'],
                        max_text_len=params['MAX_INPUT_TEXT_LEN'],
                        max_words=params['INPUT_VOCABULARY_SIZE'],
                        min_occ=params['MIN_OCCURRENCES_VOCAB'])

    # In semi-supervised mode the pool files are registered as the 'test'
    # split of every input.
    if 'semisupervised' in params['MODE']:
        for i, input_id in enumerate(params['INPUTS_IDS_DATASET']):
            ds.setInput(params['POOL_FILENAME'][i],
                        'test',
                        type='text',
                        id=input_id,
                        pad_on_batch=params['PAD_ON_BATCH'],
                        tokenization=params['TOKENIZATION_METHOD'],
                        fill=params['FILL'],
                        max_text_len=params['MAX_INPUT_TEXT_LEN'],
                        max_words=params['INPUT_VOCABULARY_SIZE'],
                        min_occ=params['MIN_OCCURRENCES_VOCAB'])

    keep_n_captions(ds, repeat=1, n=1, set_names=params['EVAL_ON_SETS'])

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, params['DATASET_STORE_PATH'])
    return ds
def start_training(use_gpu):
    """Build the tutorial dataset, store it and train a Transformer NMT model.

    The dataset is assembled from the ``train_*`` / ``val_*`` text files
    under ``DATA_PATH``, saved under ``MODEL_PATH + "/dataset"`` and then
    handed to ``train_model`` together with the default hyperparameters
    (``load_parameters()``) overridden below.

    :param use_gpu: value assigned to ``params['USE_CUDNN']``.
    """
    # ---------------------------------------------------------------
    # 1. Building the dataset
    # ---------------------------------------------------------------
    ds = Dataset('tutorial_dataset', 'tutorial', silence=False)

    # Target sentences (outputs); the target vocabulary is built on 'train'.
    ds.setOutput(DATA_PATH + "train_y.txt",
                 'train',
                 type='text',
                 id='target_text',
                 tokenization='tokenize_basic',
                 build_vocabulary=True,
                 pad_on_batch=True,
                 sample_weights=True,
                 max_text_len=30,
                 max_words=30000,
                 min_occ=0)

    ds.setOutput(DATA_PATH + "val_y.txt",
                 'val',
                 type='text',
                 id='target_text',
                 pad_on_batch=True,
                 tokenization='tokenize_basic',
                 sample_weights=True,
                 max_text_len=30,
                 max_words=0)

    # Source sentences (inputs); the source vocabulary is built on 'train'.
    ds.setInput(DATA_PATH + "train_x.txt",
                'train',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                build_vocabulary=True,
                fill='end',
                max_text_len=30,
                max_words=30000,
                min_occ=0)

    ds.setInput(DATA_PATH + "val_x.txt",
                'val',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                fill='end',
                max_text_len=30,
                min_occ=0)

    # 'state_below': the target sentences shifted by one position (offset=1),
    # sharing the 'target_text' vocabulary.  It has no data for 'val', so a
    # 'ghost' input is registered for that split.
    ds.setInput(DATA_PATH + "train_y.txt",
                'train',
                type='text',
                id='state_below',
                required=False,
                tokenization='tokenize_basic',
                pad_on_batch=True,
                build_vocabulary='target_text',
                offset=1,
                fill='end',
                max_text_len=30,
                max_words=30000)

    ds.setInput(None, 'val', type='ghost', id='state_below', required=False)

    # Keep a reference to the raw source files as well.
    for split, input_text_filename in zip(
            ['train', 'val'],
            [DATA_PATH + "train_x.txt", DATA_PATH + "val_x.txt"]):
        ds.setRawInput(input_text_filename,
                       split,
                       type='file-name',
                       id='raw_source_text',
                       overwrite_split=True)

    # We also need to match the references with the inputs. Since we only
    # have one reference per input sample, we set `repeat=1`.
    keep_n_captions(ds, repeat=1, n=1, set_names=['val'])

    # Finally, we can save our dataset instance for using in other experiments.
    saveDataset(ds, MODEL_PATH + "/dataset")

    # ---------------------------------------------------------------
    # 2. Creating and training a Neural Translation Model
    # ---------------------------------------------------------------
    # We start from the default hyperparameters returned by
    # load_parameters() and override the ones listed below; the model
    # trained here is a Transformer.
    params = load_parameters()

    # Reloading the dataset is not necessary (it is still in memory); it is
    # done as a demonstration that the stored file can be read back.
    dataset = loadDataset(MODEL_PATH + "/dataset/Dataset_tutorial_dataset.pkl")

    params['MODEL_TYPE'] = 'Transformer'
    params['USE_CUDNN'] = use_gpu
    params['EARLY_STOP'] = True
    params['PATIENCE'] = 10
    params['SAVE_EACH_EVALUATION'] = True
    params['STORE_PATH'] = MODEL_PATH
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['N_HEADS'] = 100
    params['POS_UNK'] = False  # current Transformer model requires this
    params['ATTEND_ON_OUTPUT'] = True  # current Transformer model requires this
    params['MODEL_SIZE'] = 100
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 100
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 100
    params['SKIP_VECTORS_HIDDEN_SIZE'] = 100
    params['ENCODER_HIDDEN_SIZE'] = 100
    params['DECODER_HIDDEN_SIZE'] = 100
    params['APPLY_DETOKENIZATION'] = True
    params['LENGTH_PENALTY'] = True
    params['LENGTH_NORM_FACTOR'] = 0.8
    params['MAX_INPUT_TEXT_LEN'] = 128
    params['MAX_OUTPUT_TEXT_LEN'] = 128
    params['STOP_METRIC'] = 'perplexity'
    params['BEAM_SIZE'] = 20
    params['N_GPUS'] = 2
    params['START_EVAL_ON_EPOCH'] = 1
    params['BATCH_SIZE'] = 128
    params['EVAL_EACH'] = 1
    params['MAX_EPOCH'] = 100
    # The key used to be misspelled ('PLOT_EVALULATION'), which only added an
    # unused entry and left the real option at its default value.
    params['PLOT_EVALUATION'] = True
    params['MODE'] = 'training'
    params['BEAM_SEARCH'] = True
    params['TENSORBOARD'] = True
    train_model(params,
                load_dataset=MODEL_PATH +
                "/dataset/Dataset_tutorial_dataset.pkl")
Beispiel #16
0
def start_training(use_gpu):
    """Build the tutorial dataset, store it and train the model from scratch.

    The dataset is assembled from the ``*_error.txt`` (source) and
    ``*_correct.txt`` (target) files under ``PATH``, saved under
    ``PATH + "dataset"`` and then handed to ``train_model`` together with the
    default hyperparameters (``load_parameters()``) overridden below.

    :param use_gpu: value assigned to ``params['USE_CUDNN']``.
    """
    # ---------------------------------------------------------------
    # 1. Building the dataset
    # ---------------------------------------------------------------
    ds = Dataset('tutorial_dataset', 'tutorial', silence=False)

    # Target sentences (outputs); the target vocabulary is built on 'train'.
    ds.setOutput(PATH + "train_correct.txt",
                 'train',
                 type='text',
                 id='target_text',
                 tokenization='tokenize_basic',
                 build_vocabulary=True,
                 pad_on_batch=True,
                 sample_weights=True,
                 max_text_len=100,
                 max_words=55000,
                 min_occ=1)

    ds.setOutput(PATH + "validation_correct.txt",
                 'val',
                 type='text',
                 id='target_text',
                 pad_on_batch=True,
                 tokenization='tokenize_basic',
                 sample_weights=True,
                 max_text_len=100,
                 max_words=0)

    # Source sentences (inputs); the source vocabulary is built on 'train'.
    ds.setInput(PATH + "train_error.txt",
                'train',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                build_vocabulary=True,
                fill='end',
                max_text_len=100,
                max_words=55000,
                min_occ=1)

    ds.setInput(PATH + "validation_error.txt",
                'val',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                fill='end',
                max_text_len=100,
                min_occ=1)

    # ...and for the 'state_below' data. Note that: 1) The offset flag is set
    # to 1, which means that the text will be shifted to the right 1
    # position. 2) During sampling time, we won't have this input. Hence, we
    # 'hack' the dataset model by inserting an artificial input, of type
    # 'ghost' for the validation split.
    ds.setInput(PATH + "train_correct.txt",
                'train',
                type='text',
                id='state_below',
                required=False,
                tokenization='tokenize_basic',
                pad_on_batch=True,
                build_vocabulary='target_text',
                offset=1,
                fill='end',
                max_text_len=100,
                max_words=55000)
    ds.setInput(None, 'val', type='ghost', id='state_below', required=False)

    # We can also keep the literal source words (for replacing unknown words).
    for split, input_text_filename in zip(
            ['train', 'val'],
            [PATH + "train_error.txt", PATH + "validation_error.txt"]):
        ds.setRawInput(input_text_filename,
                       split,
                       type='file-name',
                       id='raw_source_text',
                       overwrite_split=True)

    # We also need to match the references with the inputs. Since we only
    # have one reference per input sample, we set `repeat=1`.
    keep_n_captions(ds, repeat=1, n=1, set_names=['val'])

    # Finally, we can save our dataset instance for using in other experiments.
    saveDataset(ds, PATH + "dataset")
    # Single source of truth for the stored file, so that the reload below
    # and train_model read exactly what has just been written.
    dataset_file = PATH + "dataset/Dataset_tutorial_dataset.pkl"

    # ---------------------------------------------------------------
    # 2. Creating and training a Neural Translation Model
    # ---------------------------------------------------------------
    # We start from the default hyperparameters returned by
    # load_parameters() and override the ones listed below.  The dataset is
    # reloaded from disk (not strictly necessary, as it is still in memory)
    # to read the vocabulary sizes from the stored instance.
    params = load_parameters()
    dataset = loadDataset(dataset_file)

    # Since the number of words in the dataset may be unknown beforehand, we
    # must update the params information according to the dataset instance.
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['source_text']
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['target_text']
    params['USE_CUDNN'] = use_gpu
    params['N_GPUS'] = 2
    params['MAX_EPOCH'] = 1000
    params['EARLY_STOP'] = True
    params['PATIENCE'] = 10
    params['SAVE_EACH_EVALUATION'] = True
    params['STORE_PATH'] = PATH + "model/"
    params['BATCH_SIZE'] = 128
    params['ATTENTION_MODE'] = "add"
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 512
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 512
    params['SKIP_VECTORS_HIDDEN_SIZE'] = 512
    params['ATTENTION_SIZE'] = 512
    params['ENCODER_HIDDEN_SIZE'] = 512
    params['DECODER_HIDDEN_SIZE'] = 512
    params['ENCODER_RNN_TYPE'] = "LSTM"
    params['DECODER_RNN_TYPE'] = "ConditionalLSTM"
    params['METRICS'] = ['coco']
    params['KERAS_METRICS'] = ['perplexity']
    params['APPLY_DETOKENIZATION'] = True
    params['LENGTH_PENALTY'] = True
    params['LENGTH_NORM_FACTOR'] = 1.0
    params['BEAM_SIZE'] = 1
    params['BEAM_SEARCH'] = True
    params['PLOT_EVALUATION'] = True
    params['MAX_PLOT_Y'] = 1.
    params['MODE'] = 'training'
    params['TENSORBOARD'] = True

    # The banner text has no placeholders, so it is rendered as is (the
    # former `.format(mode)` only added a dependency on an outside name).
    result = pyfiglet.figlet_format("START TRAINING FROM SCRATCH",
                                    font="digital")
    print(result)
    train_model(params, load_dataset=dataset_file)
Beispiel #17
0
def build_dataset(params):
    """
    Builds (or loads) the Dataset instance holding questions, image features and answers.

    If params['REBUILD_DATASET'] is set, a new Dataset is populated from the files listed
    in params['QST_FILES'], params['IMG_FILES'] and params['ANS_FILES'] (all of them relative
    to params['DATA_ROOT_PATH']) and it is stored with saveDataset. Otherwise, the instance
    previously stored under params['DATA_ROOT_PATH'] is loaded.

    :param params: Parameters specifying Dataset options
    :return: Dataset object
    """
    if not params['REBUILD_DATASET']:
        # We can easily recover it with a single line
        return loadDataset(params['DATA_ROOT_PATH']+'/Dataset_'+params['DATASET_NAME']+'.pkl')

    # We build a new dataset instance
    verbose = params['VERBOSE'] > 0
    if verbose:
        logging.info('Building ' + params['DATASET_NAME'] + ' dataset')

    base_path = params['DATA_ROOT_PATH']
    ds = Dataset(params['DATASET_NAME'], base_path, silence=not verbose)

    splits = ('train', 'val', 'test')
    # Answers for the test split are optional
    answer_splits = [split for split in splits
                     if split != 'test' or 'test' in params['ANS_FILES']]

    ##### INPUT DATA
    ### QUESTIONS
    for split in splits:
        qst_options = dict(type='text', id=params['INPUTS_IDS_DATASET'][0],
                           tokenization=params['TOKENIZATION_METHOD'], fill=params['FILL'],
                           max_text_len=params['MAX_INPUT_TEXT_LEN'],
                           max_words=params['INPUT_VOCABULARY_SIZE'],
                           repeat_set=params['REPEAT_QST'])
        if split == 'train':
            # Only the train split requests building the vocabulary
            qst_options['build_vocabulary'] = True
        ds.setInput(base_path+'/'+params['QST_FILES'][split][0], split, **qst_options)

    ### QUESTIONS' associated IDs
    for split in splits:
        ds.setInput(base_path+'/'+params['QST_FILES'][split][1], split,
                    type='id', id=params['INPUTS_IDS_DATASET'][0]+'_ids',
                    repeat_set=params['REPEAT_QST'])

    ### IMAGES
    for split in splits:
        ds.setInput(base_path+'/'+params['IMG_FILES'][split][0], split,
                    type='image-features', id=params['INPUTS_IDS_DATASET'][1],
                    feat_len=params['IMG_FEAT_SIZE'],
                    repeat_set=params['REPEAT_IMG'])

    ### IMAGES' associated IDs
    for split in splits:
        ds.setInput(base_path+'/'+params['IMG_FILES'][split][1], split,
                    type='id', id=params['INPUTS_IDS_DATASET'][1]+'_ids',
                    repeat_set=params['REPEAT_IMG'])

    ##### OUTPUT DATA
    ### ANSWERS
    for split in answer_splits:
        ans_options = dict(type='text', id=params['OUTPUTS_IDS_DATASET'][0],
                           tokenization=params['TOKENIZATION_METHOD'], fill=params['FILL'],
                           max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
                           max_words=params['OUTPUT_VOCABULARY_SIZE'])
        if split == 'train':
            # Only the train split requests building the vocabulary
            ans_options['build_vocabulary'] = True
        ds.setOutput(base_path+'/'+params['ANS_FILES'][split][0], split, **ans_options)

    # Load extra variables (we need the original path to questions and annotations for VQA evaluation)
    for split in splits:
        ds.extra_variables[split] = dict()
    for split in splits:
        ds.extra_variables[split]['quesFile'] = base_path+'/'+params['QST_FILES'][split][2]
    # The annotation file of 'test' is read from the 'test' entry of ANS_FILES
    # (only when that entry exists)
    for split in answer_splits:
        ds.extra_variables[split]['annFile'] = base_path+'/'+params['ANS_FILES'][split][1]

    # Remove all samples of the train set not belonging to the top classes chosen
    if params['KEEP_TOP_ANSWERS']:
        ds.keepTopOutputs('train', params['OUTPUTS_IDS_DATASET'][0], params['OUTPUT_VOCABULARY_SIZE'])
    # Filter top K answers per question-image pair
    if params['FILTER_ANSWERS']:
        filter_k_frequent_answers(ds, params)

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, params['DATA_ROOT_PATH'])

    return ds
Beispiel #18
0
def build_dataset(params, vocabulary=None, vocabulary_len=None):
    """
    Builds (or loads) a Dataset instance.

    If params['REBUILD_DATASET'] is set, a new Dataset is populated with the outputs and
    inputs required by params['MODEL_TYPE'] and it is stored in params['DATASET_STORE_PATH'].
    Otherwise, the previously stored instance is loaded from that same path.

    Note that, when rebuilding, params['MAX_SRC_INPUT_TEXT_LEN'] and
    params['MAX_TRG_INPUT_TEXT_LEN'] are written into `params` (taking the value of
    params['MAX_INPUT_TEXT_LEN']) if they are missing or None.

    :param params: Parameters specifying Dataset options
    :param vocabulary: Vocabulary dictionary handed to the Dataset constructor.
                       A new empty dictionary is used if it is not given.
    :param vocabulary_len: Vocabulary sizes dictionary handed to the Dataset constructor.
                           A new empty dictionary is used if it is not given.
    :return: Dataset object
    """
    if not params['REBUILD_DATASET']:
        # We can easily recover it with a single line
        return loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' +
                           params['DATASET_NAME'] + '_' + params['SRC_LAN'] +
                           params['TRG_LAN'] + '.pkl')

    # We build a new dataset instance
    base_path = params['DATA_ROOT_PATH']
    name = params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN']

    verbose = params['VERBOSE'] > 0
    if verbose:
        logging.info('Building ' + name + ' dataset')

    # The dictionaries are created here (and not as default arguments) so that
    # separate calls never share the same dict objects.
    ds = Dataset(name,
                 base_path,
                 silence=not verbose,
                 vocabulary={} if vocabulary is None else vocabulary,
                 vocabulary_len={} if vocabulary_len is None else vocabulary_len,
                 doc_size=params.get('SECOND_DIM_SIZE', 0))

    # Options shared by most of the setInput / setOutput calls below
    tokenization = params.get('TOKENIZATION_METHOD', 'tokenize_none')
    fill = params.get('FILL', 'end')
    bpe_codes = params.get('BPE_CODES_PATH', None)
    align_from_raw = params.get('ALIGN_FROM_RAW', True) and not params.get(
        'HOMOGENEOUS_BATCHES', False)

    def text_file(split, suffix):
        # Path of the text file of `split` ending with `suffix`
        return base_path + '/' + params['TEXT_FILES'][split] + suffix

    # OUTPUT DATA
    # Let's load the train, val and test splits of the target language sentences (outputs)
    #    the files include a sentence per line.
    # The kind of output depends on the model type; `output_spec` stays None
    # for model types without an associated output.
    model_type = params['MODEL_TYPE']
    output_spec = None
    if model_type == 'Predictor':
        # if you want new vocabulary set build_vocabulary to True
        if 'PRED_VOCAB' in params:
            train_vocabulary = params['OUTPUTS_IDS_DATASET'][0]
        else:
            train_vocabulary = True
        output_spec = dict(suffix=params['TRG_LAN'], type='text',
                           train_vocabulary=train_vocabulary,
                           train_pad=True, weights=True)
    elif (model_type in ('EstimatorSent', 'EncSent')
          or 'EstimatorDoc' in model_type or 'EncDoc' in model_type):
        output_spec = dict(suffix=params['PRED_SCORE'], type='real',
                           train_vocabulary=False,
                           train_pad=False, weights=False)
    elif model_type in ('EstimatorWord', 'EncWord', 'EncWordAtt',
                        'EncPhraseAtt', 'EstimatorPhrase'):
        output_spec = dict(suffix=params['PRED_SCORE'], type='text',
                           train_vocabulary=True,
                           train_pad=True, weights=False)

    if output_spec is not None:
        ds.setOutput(
            text_file('train', output_spec['suffix']),
            'train',
            type=output_spec['type'],
            id=params['OUTPUTS_IDS_DATASET'][0],
            tokenization=tokenization,
            build_vocabulary=output_spec['train_vocabulary'],
            pad_on_batch=params.get('PAD_ON_BATCH', output_spec['train_pad']),
            sample_weights=params.get('SAMPLE_WEIGHTS', output_spec['weights']),
            fill=fill,
            max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
            max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
            min_occ=params.get('MIN_OCCURRENCES_OUTPUT_VOCAB', 0),
            bpe_codes=bpe_codes)

    if align_from_raw:
        ds.setRawOutput(text_file('train', params['TRG_LAN']),
                        'train',
                        type='file-name',
                        id='raw_' + params['OUTPUTS_IDS_DATASET'][0])

    # No evaluation outputs are loaded when there are no references
    if params.get('NO_REF', False):
        val_test_list = []
    else:
        val_test_list = params.get('EVAL_ON_SETS', ['val'])

    for split in val_test_list:
        if params['TEXT_FILES'].get(split) is None:
            continue

        if output_spec is not None:
            ds.setOutput(
                text_file(split, output_spec['suffix']),
                split,
                type=output_spec['type'],
                id=params['OUTPUTS_IDS_DATASET'][0],
                pad_on_batch=params.get('PAD_ON_BATCH', True),
                tokenization=tokenization,
                sample_weights=params.get('SAMPLE_WEIGHTS', output_spec['weights']),
                max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                bpe_codes=bpe_codes)

        if align_from_raw:
            ds.setRawOutput(text_file(split, params['TRG_LAN']),
                            split,
                            type='file-name',
                            id='raw_' + params['OUTPUTS_IDS_DATASET'][0])

    # INPUT DATA
    # We must ensure that the 'train' split is the first (for building the vocabulary)

    if params.get('MAX_SRC_INPUT_TEXT_LEN', None) is None:
        params['MAX_SRC_INPUT_TEXT_LEN'] = params['MAX_INPUT_TEXT_LEN']

    if params.get('MAX_TRG_INPUT_TEXT_LEN', None) is None:
        params['MAX_TRG_INPUT_TEXT_LEN'] = params['MAX_INPUT_TEXT_LEN']

    data_type_src = 'text'
    data_type_trg = 'text'

    if 'EstimatorDoc' in model_type or 'EncDoc' in model_type:
        data_type_src = 'doc'
        data_type_trg = 'doc'

    # here we set to doc meaning just the 3d input
    if model_type in ('EstimatorPhrase', 'EncPhraseAtt'):
        data_type_trg = 'doc'

    ext = params['TRG_LAN']
    target_dict = 'target_text'

    def set_target_input(split, input_index, build_vocabulary, offset):
        # Adds the target-side text of `split` as the input with position
        # `input_index` in params['INPUTS_IDS_DATASET']
        ds.setInput(
            text_file(split, ext),
            split,
            type=data_type_trg,
            id=params['INPUTS_IDS_DATASET'][input_index],
            required=False,
            tokenization=tokenization,
            pad_on_batch=params.get('PAD_ON_BATCH', True),
            build_vocabulary=build_vocabulary,
            offset=offset,
            fill=fill,
            max_text_len=params.get('MAX_TRG_INPUT_TEXT_LEN', 3),
            max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
            bpe_codes=bpe_codes)

    for split in ['train', 'val', 'test']:
        if params['TEXT_FILES'].get(split) is None:
            continue

        build_vocabulary = split == 'train'

        # Source-side input: with 'PRED_VOCAB' the id of the input is passed as
        # build_vocabulary instead of a boolean
        if 'PRED_VOCAB' in params:
            src_vocabulary = params['INPUTS_IDS_DATASET'][0]
        else:
            src_vocabulary = build_vocabulary
        ds.setInput(
            text_file(split, params['SRC_LAN']),
            split,
            type=data_type_src,
            id=params['INPUTS_IDS_DATASET'][0],
            pad_on_batch=params.get('PAD_ON_BATCH', True),
            tokenization=tokenization,
            build_vocabulary=src_vocabulary,
            fill=fill,
            max_text_len=params.get('MAX_SRC_INPUT_TEXT_LEN', 70),
            max_words=params.get('INPUT_VOCABULARY_SIZE', 0),
            min_occ=params.get('MIN_OCCURRENCES_INPUT_VOCAB', 0),
            bpe_codes=bpe_codes)

        # Target-side inputs: the first one builds its vocabulary only for the
        # train split without 'PRED_VOCAB'; every other case uses `target_dict`
        if 'PRED_VOCAB' not in params and build_vocabulary:
            first_trg_vocabulary = build_vocabulary
        else:
            first_trg_vocabulary = target_dict

        n_inputs = len(params['INPUTS_IDS_DATASET'])
        if n_inputs == 2:
            set_target_input(split, 1, first_trg_vocabulary, 0)
        elif n_inputs > 2:
            set_target_input(split, 1, first_trg_vocabulary, 1)
            set_target_input(split, 2, target_dict, -1)
            set_target_input(split, 3, target_dict, 0)

        if align_from_raw:
            ds.setRawInput(text_file(split, params['SRC_LAN']),
                           split,
                           type='file-name',
                           id='raw_' + params['INPUTS_IDS_DATASET'][0])

    if params.get('POS_UNK', False):
        if params.get('HEURISTIC', 0) > 0:
            ds.loadMapping(params['MAPPING'])

    # If we had multiple references per sentence
    if not params.get('NO_REF', False):
        # Same default as the one used above for loading the evaluation outputs
        keep_n_captions(ds,
                        repeat=1,
                        n=1,
                        set_names=params.get('EVAL_ON_SETS', ['val']))

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, params['DATASET_STORE_PATH'])

    return ds
Beispiel #19
0
def build_dataset(params):
    """
    Builds (or loads) the Dataset instance used for image classification.

    If params['REBUILD_DATASET'] is set, a new Dataset is populated with the raw images
    listed in params['IMG_FILES'] and with the labels selected by
    params['CLASSIFICATION_TYPE'] ('single-label' or 'multi-label'), and it is stored in
    params['STORE_PATH']. Otherwise, the previously stored instance is loaded.

    :param params: Parameters specifying Dataset options
    :return: Dataset object
    """
    if not params['REBUILD_DATASET']:
        # We can easily recover it with a single line
        return loadDataset(params['STORE_PATH']+'/Dataset_'+params['DATASET_NAME']+'.pkl')

    # We build a new dataset instance
    verbose = params['VERBOSE'] > 0
    if verbose:
        logging.info('Building ' + params['DATASET_NAME'] + ' dataset')

    root = params['DATA_ROOT_PATH']
    images_path = root + params.get('SUFFIX_DATASET', '/images')
    ds = Dataset(params['DATASET_NAME'], images_path, silence=not verbose)

    split_names = ('train', 'val', 'test')

    ##### INPUT DATA
    ### IMAGES
    for split in split_names:
        ds.setInput(root + '/' + params['IMG_FILES'][split], split,
                    type='raw-image', id=params['INPUTS_IDS_DATASET'][0],
                    img_size=params['IMG_SIZE'], img_size_crop=params['IMG_SIZE_CROP'])
    # Set train mean
    ds.setTrainMean(mean_image=params['MEAN_IMAGE'], id=params['INPUTS_IDS_DATASET'][0])

    ##### OUTPUT DATA
    classification_type = params['CLASSIFICATION_TYPE']
    if classification_type == 'single-label':
        # One categorical labels file per split
        for split in split_names:
            ds.setOutput(root + '/' + params['LABELS_FILES'][split], split,
                         type='categorical', id=params['OUTPUTS_IDS_DATASET'][0])

    elif classification_type == 'multi-label':
        # Convert list of ingredients into classes
        logging.info('Preprocessing list of ingredients for assigning vocabulary as image classes.')
        labels_type = params.get('LABELS_TYPE_LIST', 'identifiers')
        classes, word2idx, idx2word = convertIngredientsList2BinaryClasses(root,
                                                                           params['LABELS_FILES'],
                                                                           params['CLASSES_PATH'],
                                                                           type_list=labels_type)
        # Insert them as outputs
        for split in split_names:
            ds.setOutput(classes[split], split, type='binary', id=params['OUTPUTS_IDS_DATASET'][0])

        # Insert vocabularies
        ds.extra_variables['word2idx_binary'] = word2idx
        ds.extra_variables['idx2word_binary'] = idx2word

        if 'Food_and_Ingredients' in params['DATASET_NAME']:
            # Additional categorical output, read from LABELS_FILES_FOOD
            for split in split_names:
                ds.setOutput(root + '/' + params['LABELS_FILES_FOOD'][split], split,
                             type='categorical', id=params['OUTPUTS_IDS_DATASET'][1])

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, params['STORE_PATH'])

    return ds
def build_dataset(params):
    """
    Builds (or loads) a Dataset instance.

    When params['REBUILD_DATASET'] is truthy a new Dataset is created and, for
    each of the 'train', 'val' and 'test' splits, one text input, two
    'image-features' inputs and one 'real' output are registered; the dataset
    is then stored with saveDataset(). Otherwise the previously stored dataset
    is loaded from params['DATASET_STORE_PATH'].

    :param params: Parameters specifying Dataset options
    :return: Dataset object
    """
    if not params['REBUILD_DATASET']:
        # Nothing to build: recover the stored instance with a single call
        stored_file = (params['DATASET_STORE_PATH'] + '/Dataset_' +
                       params['DATASET_NAME'] + '.pkl')
        return loadDataset(stored_file)

    root = params['DATA_ROOT_PATH']
    ds = Dataset(params['DATASET_NAME'], root, silence=False)
    splits = ('train', 'val', 'test')

    # INPUT DATA (text)
    # Only the 'train' split receives the 'fill' option; 'val' and 'test' use
    # the *_TEST maximum text length instead.
    for split in splits:
        text_options = dict(type='text',
                            id=params['INPUTS_IDS_DATASET'][1],
                            build_vocabulary=True,
                            tokenization=params['TOKENIZATION_METHOD'],
                            pad_on_batch=True,
                            min_occ=params['MIN_OCCURRENCES_VOCAB'])
        if split == 'train':
            text_options['fill'] = params['FILL']
            text_options['max_text_len'] = params['MAX_OUTPUT_TEXT_LEN']
        else:
            text_options['max_text_len'] = params['MAX_OUTPUT_TEXT_LEN_TEST']
        ds.setInput(root + '/' + params['DISHES_FILES'][split],
                    split,
                    **text_options)

    # INPUT DATA (image features)
    # Each entry is (files key, position in INPUTS_IDS_DATASET, feature size key)
    feature_inputs = (('IMAGES_LIST_FILES', 0, 'IMG_FEAT_SIZE'),
                      ('CNN_FILES', 2, 'CNN_SIZE'))
    for files_key, id_position, size_key in feature_inputs:
        for split in splits:
            ds.setInput(root + '/' + params[files_key][split],
                        split,
                        type='image-features',
                        id=params['INPUTS_IDS_DATASET'][id_position],
                        feat_len=params[size_key])

    # OUTPUT DATA
    # Sample weights are attached to the 'train' output unless
    # params['sample_weight'] is present and falsy.
    train_output_options = dict(type='real',
                                id=params['OUTPUTS_IDS_DATASET'][0])
    weights_disabled = "sample_weight" in params and not params['sample_weight']
    if not weights_disabled:
        train_output_options['sample_weights'] = np.load(Path.DATA_FOLDER +
                                                         "/data/weights.npy")
    ds.setOutput(root + '/' + params['OUT_FILES']['train'],
                 'train',
                 **train_output_options)

    for split in ('val', 'test'):
        ds.setOutput(root + '/' + params['OUT_FILES'][split],
                     split,
                     type='real',
                     id=params['OUTPUTS_IDS_DATASET'][0])

    # The dataset is complete: store it so it can be re-used in the future
    saveDataset(ds, params['DATASET_STORE_PATH'])
    return ds
def build_dataset(params):
    """
    Builds (or loads) a Dataset instance.

    When params['REBUILD_DATASET'] is truthy a new Dataset is created with the
    text descriptions as outputs and the per-feature frame lists as inputs for
    the 'train', 'val' and 'test' splits, and it is stored with saveDataset().
    Otherwise the previously stored dataset is loaded.

    :param params: Parameters specifying Dataset options
    :return: Dataset object
    """
    if not params['REBUILD_DATASET']:
        # We can easily recover it with a single line
        return loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' +
                           params['DATASET_NAME'] + '.pkl')

    # We build a new dataset instance
    verbose = params['VERBOSE'] > 0
    if verbose:
        logging.info('Building ' + params['DATASET_NAME'] + ' dataset')

    root = params['DATA_ROOT_PATH']
    ds = Dataset(params['DATASET_NAME'], root, silence=not verbose)
    splits = ('train', 'val', 'test')

    ##### OUTPUT DATA
    # Load the train, val and test splits of the descriptions (outputs).
    #    The files include a description per line. In this dataset a variable number
    #    of descriptions per video are provided.
    # Only 'train' receives the 'fill' option; 'val' and 'test' use the *_TEST length.
    for split in splits:
        caption_options = dict(type='text',
                               id=params['OUTPUTS_IDS_DATASET'][0],
                               build_vocabulary=True,
                               tokenization=params['TOKENIZATION_METHOD'],
                               pad_on_batch=True,
                               sample_weights=params['SAMPLE_WEIGHTS'],
                               min_occ=params['MIN_OCCURRENCES_VOCAB'])
        if split == 'train':
            caption_options['fill'] = params['FILL']
            caption_options['max_text_len'] = params['MAX_OUTPUT_TEXT_LEN']
        else:
            caption_options['max_text_len'] = params['MAX_OUTPUT_TEXT_LEN_TEST']
        ds.setOutput(root + '/' + params['DESCRIPTION_FILES'][split],
                     split,
                     **caption_options)

    ##### INPUT DATA
    # Load the associated videos (inputs).
    #    Since each video has a different number of sentences, the loaded counts
    #    are passed as 'repeat_set' (number of captions of each video).
    captions_per_video = {}
    for split in splits:
        captions_per_video[split] = np.load(
            root + '/' + params['DESCRIPTION_COUNTS_FILES'][split])

    for feat_type in params['FEATURE_NAMES']:
        for split in splits:
            # The file name templates are filled in with the feature type
            frames_list = root + '/' + (params['FRAMES_LIST_FILES'][split] % feat_type)
            frames_counts = root + '/' + (params['FRAMES_COUNTS_FILES'][split] % feat_type)

            ds.setInput([frames_list, frames_counts],
                        split,
                        type=params['INPUT_DATA_TYPE'],
                        id=params['INPUTS_IDS_DATASET'][0],
                        repeat_set=captions_per_video[split],
                        max_video_len=params['NUM_FRAMES'],
                        feat_len=params['IMG_FEAT_SIZE'])

    if len(params['INPUTS_IDS_DATASET']) > 1:
        # The train descriptions are also registered as a non-required input
        # (offset=1) that re-uses the vocabulary of the output id.
        ds.setInput(root + '/' + params['DESCRIPTION_FILES']['train'],
                    'train',
                    type='text',
                    id=params['INPUTS_IDS_DATASET'][-1],
                    required=False,
                    tokenization=params['TOKENIZATION_METHOD'],
                    pad_on_batch=True,
                    build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
                    offset=1,
                    fill=params['FILL'],
                    max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
                    max_words=params['OUTPUT_VOCABULARY_SIZE'],
                    min_occ=params['MIN_OCCURRENCES_VOCAB'])

        # 'val' and 'test' only get a 'ghost' placeholder for that input id
        for split in ('val', 'test'):
            ds.setInput(None,
                        split,
                        type='ghost',
                        id=params['INPUTS_IDS_DATASET'][-1],
                        required=False)

    # Process dataset for keeping only one caption per video and storing the rest in a dict() with the following format:
    #        ds.extra_variables[set_name][id_output][img_position] = [cap1, cap2, cap3, ..., capN]
    keep_n_captions(ds,
                    repeat=[captions_per_video['val'], captions_per_video['test']],
                    n=1,
                    set_names=['val', 'test'])

    # The dataset is complete: store it so it can be re-used in the future
    saveDataset(ds, params['DATASET_STORE_PATH'])
    return ds
Beispiel #22
0
def build_dataset(params):
    """
    Builds (or loads) a Dataset instance.

    The dataset is named DATASET_NAME + '_' + SRC_LAN + TRG_LAN. When
    params['REBUILD_DATASET'] is truthy it is built from the text files
    (target language as outputs, source language as inputs) and stored with
    saveDataset(); otherwise the stored instance is loaded.

    :param params: Parameters specifying Dataset options
    :return: Dataset object
    """
    if not params['REBUILD_DATASET']:
        # We can easily recover it with a single line
        return loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' +
                           params['DATASET_NAME'] + '_' + params['SRC_LAN'] +
                           params['TRG_LAN'] + '.pkl')

    # We build a new dataset instance
    verbose = params['VERBOSE'] > 0
    root = params['DATA_ROOT_PATH']
    name = params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN']
    if verbose:
        logging.info('Building ' + name + ' dataset')
    ds = Dataset(name, root, silence=not verbose)

    text_files = params['TEXT_FILES']
    output_id = params['OUTPUTS_IDS_DATASET'][0]
    tokenization = params.get('TOKENIZATION_METHOD', 'tokenize_none')
    pad_on_batch = params.get('PAD_ON_BATCH', True)
    # Raw (file-name) inputs/outputs are only registered when aligning from raw
    # text and homogeneous batches are not requested.
    register_raw = params.get('ALIGN_FROM_RAW', True) and \
        not params.get('HOMOGENEOUS_BATCHES', False)

    def corpus_file(split, language):
        # Path of the text file of `split` for the given language suffix
        return root + '/' + text_files[split] + language

    # OUTPUT DATA
    # Load the train, val and test splits of the target language sentences (outputs);
    #    the files include a sentence per line.
    ds.setOutput(corpus_file('train', params['TRG_LAN']),
                 'train',
                 type='text',
                 id=output_id,
                 tokenization=tokenization,
                 build_vocabulary=True,
                 pad_on_batch=pad_on_batch,
                 sample_weights=params.get('SAMPLE_WEIGHTS', True),
                 fill=params.get('FILL', 'end'),
                 max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                 max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                 min_occ=params.get('MIN_OCCURRENCES_OUTPUT_VOCAB', 0))
    if register_raw:
        ds.setRawOutput(corpus_file('train', params['TRG_LAN']),
                        'train',
                        type='file-name',
                        id='raw_' + output_id)

    for split in ['val', 'test']:
        if text_files.get(split) is None:
            continue
        ds.setOutput(corpus_file(split, params['TRG_LAN']),
                     split,
                     type='text',
                     id=output_id,
                     pad_on_batch=pad_on_batch,
                     tokenization=tokenization,
                     sample_weights=params.get('SAMPLE_WEIGHTS', True),
                     max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                     max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0))
        if register_raw:
            ds.setRawOutput(corpus_file(split, params['TRG_LAN']),
                            split,
                            type='file-name',
                            id='raw_' + output_id)

    # INPUT DATA
    # The 'train' split must go first, since it is the one building the vocabulary
    for split in ['train', 'val', 'test']:
        if text_files.get(split) is None:
            continue
        is_train = split == 'train'

        ds.setInput(corpus_file(split, params['SRC_LAN']),
                    split,
                    type='text',
                    id=params['INPUTS_IDS_DATASET'][0],
                    pad_on_batch=pad_on_batch,
                    tokenization=tokenization,
                    build_vocabulary=is_train,
                    fill=params.get('FILL', 'end'),
                    max_text_len=params.get('MAX_INPUT_TEXT_LEN', 70),
                    max_words=params.get('INPUT_VOCABULARY_SIZE', 0),
                    min_occ=params.get('MIN_OCCURRENCES_INPUT_VOCAB', 0))

        if len(params['INPUTS_IDS_DATASET']) > 1:
            if is_train:
                # Target sentences as a non-required input (offset=1) that
                # re-uses the vocabulary of the output id
                ds.setInput(corpus_file(split, params['TRG_LAN']),
                            split,
                            type='text',
                            id=params['INPUTS_IDS_DATASET'][1],
                            required=False,
                            tokenization=tokenization,
                            pad_on_batch=pad_on_batch,
                            build_vocabulary=output_id,
                            offset=1,
                            fill=params.get('FILL', 'end'),
                            max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                            max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0))
            else:
                # Other splits only get a 'ghost' placeholder
                ds.setInput(None,
                            split,
                            type='ghost',
                            id=params['INPUTS_IDS_DATASET'][-1],
                            required=False)

        if register_raw:
            ds.setRawInput(corpus_file(split, params['SRC_LAN']),
                           split,
                           type='file-name',
                           id='raw_' + params['INPUTS_IDS_DATASET'][0])

    if params.get('POS_UNK', False) and params.get('HEURISTIC', 0) > 0:
        ds.loadMapping(params['MAPPING'])

    # If we had multiple references per sentence
    keep_n_captions(ds, repeat=1, n=1, set_names=params['EVAL_ON_SETS'])

    # The dataset is complete: store it so it can be re-used in the future
    saveDataset(ds, params['DATASET_STORE_PATH'])
    return ds
Beispiel #23
0
def build_dataset(params):
    """
    Builds (or loads) a Dataset instance and optionally re-uses a previous vocabulary.

    When params['REBUILD_DATASET'] is truthy a new Dataset is built and stored with
    saveDataset(); otherwise the stored instance is loaded. Substrings of
    params['DATASET_NAME'] select what is built:
        '-vidtext-embed': descriptions are loaded as inputs and the outputs are set by
                          insertVidTextEmbedNegativeSamples().
        '-linked':        inputs for temporally-linked samples are added; the variant is
                          refined by '-upperbound', '-vidtext', '-video', '-copy',
                          '-nocopy' and '-prev'.

    After building/loading, if params['PRE_TRAINED_DATASET_NAME'] (or else
    params['PRE_TRAINED_VOCABULARY_NAME']) is not None, the vocabularies listed in
    params['VOCABULARIES_MAPPING'] ({new_id: old_id}) are replaced by deep copies of
    the pre-trained ones.

    :param params: Parameters specifying Dataset options
    :return: Dataset object
    """

    def set_empty_prev_descriptions(dataset, split):
        # Insert empty prev_descriptions on the given split
        dataset.setInput([],
                         split,
                         type='text',
                         id=params['INPUTS_IDS_DATASET'][2],
                         build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
                         tokenization=params['TOKENIZATION_METHOD'],
                         fill=params['FILL'],
                         pad_on_batch=True,
                         max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
                         min_occ=params['MIN_OCCURRENCES_VOCAB'],
                         required=False,
                         overwrite_split=True)

    if params['REBUILD_DATASET']:  # We build a new dataset instance
        if params['VERBOSE'] > 0:
            silence = False
            logging.info('Building ' + params['DATASET_NAME'] + ' dataset')
        else:
            silence = True

        base_path = params['DATA_ROOT_PATH']
        name = params['DATASET_NAME']
        ds = Dataset(name, base_path, silence=silence)

        # Variant flags encoded in the dataset name
        vidtext_embed = '-vidtext-embed' in name
        linked = '-linked' in name
        upperbound = '-upperbound' in name
        video = '-video' in name

        if not vidtext_embed:
            # OUTPUT DATA
            # Let's load the train, val and test splits of the descriptions (outputs)
            #    the files include a description per line. In this dataset a variable number
            #    of descriptions per video are provided.
            ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['train'],
                         'train',
                         type='text',
                         id=params['OUTPUTS_IDS_DATASET'][0],
                         build_vocabulary=True,
                         tokenization=params['TOKENIZATION_METHOD'],
                         fill=params['FILL'],
                         pad_on_batch=True,
                         max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
                         sample_weights=params['SAMPLE_WEIGHTS'],
                         min_occ=params['MIN_OCCURRENCES_VOCAB'])

            for split in ('val', 'test'):
                ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES'][split],
                             split,
                             type='text',
                             id=params['OUTPUTS_IDS_DATASET'][0],
                             build_vocabulary=True,
                             pad_on_batch=True,
                             tokenization=params['TOKENIZATION_METHOD'],
                             sample_weights=params['SAMPLE_WEIGHTS'],
                             max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'],
                             min_occ=params['MIN_OCCURRENCES_VOCAB'])

        else:
            # Use descriptions as inputs instead --> 'matching'/'non-matching' as output
            ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['train'],
                        'train',
                        type='text',
                        id=params['INPUTS_IDS_DATASET'][1],
                        build_vocabulary=True,
                        tokenization=params['TOKENIZATION_METHOD'],
                        fill=params['FILL'],
                        pad_on_batch=True,
                        max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
                        min_occ=params['MIN_OCCURRENCES_VOCAB'])

            for split in ('val', 'test'):
                ds.setInput(base_path + '/' + params['DESCRIPTION_FILES'][split],
                            split,
                            type='text',
                            id=params['INPUTS_IDS_DATASET'][1],
                            build_vocabulary=True,
                            pad_on_batch=True,
                            tokenization=params['TOKENIZATION_METHOD'],
                            max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'],
                            min_occ=params['MIN_OCCURRENCES_VOCAB'])

        # INPUT DATA
        # Let's load the associated videos (inputs)
        #    we must take into account that in this dataset we have a different number of sentences per video,
        #    for this reason we introduce the parameter 'repeat_set'=num_captions, where num_captions is a list
        #    containing the number of captions in each video.

        num_captions_train = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['train'])
        num_captions_val = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['val'])
        num_captions_test = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['test'])

        for feat_type in params['FEATURE_NAMES']:
            for split, num_cap in zip(['train', 'val', 'test'],
                                      [num_captions_train, num_captions_val, num_captions_test]):
                # The file name templates are filled in with the feature type
                list_files = base_path + '/' + params['FRAMES_LIST_FILES'][split] % feat_type
                counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][split] % feat_type

                ds.setInput([list_files, counts_files],
                            split,
                            type=params['INPUT_DATA_TYPE'],
                            id=params['INPUTS_IDS_DATASET'][0],
                            repeat_set=num_cap,
                            max_video_len=params['NUM_FRAMES'],
                            feat_len=params['IMG_FEAT_SIZE'],
                            data_augmentation_types=params['DATA_AUGMENTATION_TYPE'])

        if not vidtext_embed and len(params['INPUTS_IDS_DATASET']) > 1:
            # Train descriptions as a non-required input (offset=1) that re-uses the
            # vocabulary of the output id; val/test only get 'ghost' placeholders.
            ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['train'],
                        'train',
                        type='text',
                        id=params['INPUTS_IDS_DATASET'][1],
                        required=False,
                        tokenization=params['TOKENIZATION_METHOD'],
                        pad_on_batch=True,
                        build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
                        offset=1,
                        fill=params['FILL'],
                        max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
                        max_words=params['OUTPUT_VOCABULARY_SIZE'],
                        min_occ=params['MIN_OCCURRENCES_VOCAB'])

            ds.setInput(None, 'val', type='ghost', id=params['INPUTS_IDS_DATASET'][1], required=False)
            ds.setInput(None, 'test', type='ghost', id=params['INPUTS_IDS_DATASET'][1], required=False)

        # Set inputs for temporally-linked samples
        if not vidtext_embed and linked:
            # Set input captions from previous event/video
            if not upperbound:
                if '-vidtext' in name:  # use both previous video and previous description
                    ds, repeat_images = insertTemporallyLinkedCaptionsVidText(ds, params,
                                                                              vidtext_set_names={
                                                                                  'video': ['train', 'val', 'test'],
                                                                                  'text': ['train']})
                    del repeat_images['test']
                    del repeat_images['val']
                    set_empty_prev_descriptions(ds, 'val')
                    set_empty_prev_descriptions(ds, 'test')

                elif video:
                    ds, repeat_images = insertTemporallyLinkedCaptions(ds, params,
                                                                       set_names=['train', 'val', 'test'],
                                                                       video=True)
                    num_captions_val = repeat_images['val']
                    num_captions_test = repeat_images['test']
                else:
                    ds, repeat_images = insertTemporallyLinkedCaptions(ds, params)
                    set_empty_prev_descriptions(ds, 'val')
                    set_empty_prev_descriptions(ds, 'test')
            else:
                ds, repeat_images = insertTemporallyLinkedCaptions(ds,
                                                                   params,
                                                                   set_names=['train', 'val', 'test'],
                                                                   upperbound=True,
                                                                   video=video,
                                                                   copy='-copy' in name,
                                                                   force_nocopy='-nocopy' in name,
                                                                   prev='-prev' in name)
                num_captions_val = repeat_images['val']
                num_captions_test = repeat_images['test']

        if not vidtext_embed:
            # Process dataset for keeping only one caption per video and storing the rest in a dict() with the following format:
            #        ds.extra_variables[set_name][id_output][img_position] = [cap1, cap2, cap3, ..., capN]
            keep_n_captions(ds, repeat=[num_captions_val, num_captions_test], n=1, set_names=['val', 'test'])

        else:
            # Set outputs for -vidtext-embed model
            insertVidTextEmbedNegativeSamples(ds, params,
                                              repeat=[num_captions_train, num_captions_val, num_captions_test])

        if not vidtext_embed and linked and not upperbound and not video:
            # Set previous data indices
            # .items() (instead of the Python 2-only .iteritems()) works on both Python 2 and 3
            for split, link_file in params['LINK_SAMPLE_FILES'].items():
                # Splits without an entry in repeat_images are not repeated
                rep = repeat_images[split] if split in repeat_images else 1
                ds.setInput(base_path + '/' + link_file,
                            split,
                            type='id',
                            id=params['INPUTS_IDS_DATASET'][-1],
                            repeat_set=rep)

        # We have finished loading the dataset, now we can store it for using it in the future
        saveDataset(ds, params['DATASET_STORE_PATH'])
    else:
        # We can easily recover it with a single line
        ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '.pkl')

    # Load vocabulary-related parameters of dataset used for pre-training
    # The paths below include an explicit '/' separator, as the load above does, so that
    # they also resolve when DATASET_STORE_PATH has no trailing slash.
    if params['PRE_TRAINED_DATASET_NAME'] is not None:
        logging.info('Re-using previous dataset vocabulary ' + params['PRE_TRAINED_DATASET_NAME'])
        dataset_pretrained = loadDataset(
            params['DATASET_STORE_PATH'] + '/Dataset_' + params['PRE_TRAINED_DATASET_NAME'] + '.pkl')
        for id_new, id_old in params['VOCABULARIES_MAPPING'].items():
            ds.vocabulary[id_new] = copy.deepcopy(dataset_pretrained.vocabulary[id_old])
            ds.vocabulary_len[id_new] = copy.deepcopy(dataset_pretrained.vocabulary_len[id_old])
    elif params['PRE_TRAINED_VOCABULARY_NAME'] is not None:
        logging.info('Re-using previous vocabulary ' + params['PRE_TRAINED_VOCABULARY_NAME'])
        dataset_pretrained_vocabulary = pkl2dict(
            params['DATASET_STORE_PATH'] + '/' + params['PRE_TRAINED_VOCABULARY_NAME'] + '.pkl')
        for id_new, id_old in params['VOCABULARIES_MAPPING'].items():
            ds.vocabulary[id_new] = copy.deepcopy(dataset_pretrained_vocabulary[id_old])
            ds.vocabulary_len[id_new] = len(dataset_pretrained_vocabulary[id_old]['idx2words'])

    return ds