Exemple #1
0
def build_dataset(params):
    """
    Creates a new Dataset instance from the text files, or restores a stored one.
    :param params: Dictionary with the Dataset options. If params['REBUILD_DATASET']
                   is true, the dataset is built and saved to params['DATASET_STORE_PATH'];
                   otherwise it is loaded from the pickle file located there.
    :return: Dataset object
    """
    if not params['REBUILD_DATASET']:
        # Nothing to build: recover the stored instance.
        return loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' +
                           params['DATASET_NAME'] + '_' + params['SRC_LAN'] +
                           params['TRG_LAN'] + '.pkl')

    verbose = params['VERBOSE'] > 0
    if verbose:
        logging.info('Building ' + params['DATASET_NAME'] + '_' +
                     params['SRC_LAN'] + params['TRG_LAN'] + ' dataset')

    root = params['DATA_ROOT_PATH']
    dataset_name = (params['DATASET_NAME'] + '_' + params['SRC_LAN'] +
                    params['TRG_LAN'])
    ds = Dataset(dataset_name, root, silence=not verbose)

    def split_file(split_name, language):
        # <root>/<file prefix of the split><language extension>
        return root + '/' + params['TEXT_FILES'][split_name] + language

    # Options shared by several of the calls below.
    keep_raw = (params.get('ALIGN_FROM_RAW', True)
                and not params.get('HOMOGENEOUS_BATCHES', False))
    tokenizer = params.get('TOKENIZATION_METHOD', 'tokenize_none')
    pad = params.get('PAD_ON_BATCH', True)
    fill = params.get('FILL', 'end')
    weights = params.get('SAMPLE_WEIGHTS', True)
    out_len = params.get('MAX_OUTPUT_TEXT_LEN', 70)
    out_vocab_size = params.get('OUTPUT_VOCABULARY_SIZE', 0)
    source = params['SRC_LAN']
    target = params['TRG_LAN']
    output_id = params['OUTPUTS_IDS_DATASET'][0]

    # OUTPUT DATA
    # Target language sentences, one per line. The 'train' split is mandatory
    # and is the one that builds the vocabulary; 'val' and 'test' are optional.
    for split in ['train', 'val', 'test']:
        if split != 'train' and params['TEXT_FILES'].get(split) is None:
            continue
        output_options = dict(type='text',
                              id=output_id,
                              tokenization=tokenizer,
                              pad_on_batch=pad,
                              sample_weights=weights,
                              max_text_len=out_len,
                              max_words=out_vocab_size)
        if split == 'train':
            output_options.update(
                build_vocabulary=True,
                fill=fill,
                min_occ=params.get('MIN_OCCURRENCES_OUTPUT_VOCAB', 0))
        ds.setOutput(split_file(split, target), split, **output_options)
        if keep_raw:
            ds.setRawOutput(split_file(split, target),
                            split,
                            type='file-name',
                            id='raw_' + output_id)

    # INPUT DATA
    # 'train' goes first, since it is the split that builds the vocabulary.
    input_ids = params['INPUTS_IDS_DATASET']
    for split in ['train', 'val', 'test']:
        if params['TEXT_FILES'].get(split) is None:
            continue
        is_train = split == 'train'
        ds.setInput(split_file(split, source),
                    split,
                    type='text',
                    id=input_ids[0],
                    pad_on_batch=pad,
                    tokenization=tokenizer,
                    build_vocabulary=is_train,
                    fill=fill,
                    max_text_len=params.get('MAX_INPUT_TEXT_LEN', 70),
                    max_words=params.get('INPUT_VOCABULARY_SIZE', 0),
                    min_occ=params.get('MIN_OCCURRENCES_INPUT_VOCAB', 0))

        if len(input_ids) > 1:
            if is_train:
                # Second input: the target text with offset=1, sharing the
                # vocabulary of the output.
                ds.setInput(split_file(split, target),
                            split,
                            type='text',
                            id=input_ids[1],
                            required=False,
                            tokenization=tokenizer,
                            pad_on_batch=pad,
                            build_vocabulary=output_id,
                            offset=1,
                            fill=fill,
                            max_text_len=out_len,
                            max_words=out_vocab_size)
            else:
                # Outside training this input is registered as a 'ghost' one.
                ds.setInput(None,
                            split,
                            type='ghost',
                            id=input_ids[-1],
                            required=False)

        if keep_raw:
            ds.setRawInput(split_file(split, source),
                           split,
                           type='file-name',
                           id='raw_' + input_ids[0])

    if params.get('POS_UNK', False) and params.get('HEURISTIC', 0) > 0:
        ds.loadMapping(params['MAPPING'])

    # In case there were multiple references per sentence
    keep_n_captions(ds, repeat=1, n=1, set_names=params['EVAL_ON_SETS'])

    # The dataset is complete: store it so it can be reused later
    saveDataset(ds, params['DATASET_STORE_PATH'])

    return ds
def start_training(use_gpu):
    """
    Builds and stores the tutorial dataset, then trains a Transformer model on it.

    The text files are read from the module-level DATA_PATH. The dataset is
    saved under MODEL_PATH + "/dataset" and train_model() is pointed to the
    file stored there.

    :param use_gpu: Value assigned to params['USE_CUDNN'].
    :return: None
    """
    # ---- 1. Building the dataset ----
    ds = Dataset('tutorial_dataset', 'tutorial', silence=False)

    # Outputs (target text). Only the training split builds the vocabulary.
    ds.setOutput(DATA_PATH + "train_y.txt",
                 'train',
                 type='text',
                 id='target_text',
                 tokenization='tokenize_basic',
                 build_vocabulary=True,
                 pad_on_batch=True,
                 sample_weights=True,
                 max_text_len=30,
                 max_words=30000,
                 min_occ=0)
    ds.setOutput(DATA_PATH + "val_y.txt",
                 'val',
                 type='text',
                 id='target_text',
                 pad_on_batch=True,
                 tokenization='tokenize_basic',
                 sample_weights=True,
                 max_text_len=30,
                 max_words=0)

    # Inputs (source text).
    ds.setInput(DATA_PATH + "train_x.txt",
                'train',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                build_vocabulary=True,
                fill='end',
                max_text_len=30,
                max_words=30000,
                min_occ=0)
    ds.setInput(DATA_PATH + "val_x.txt",
                'val',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                fill='end',
                max_text_len=30,
                min_occ=0)

    # 'state_below' input: the target text with offset=1, reusing the
    # 'target_text' vocabulary. For validation a 'ghost' input is registered.
    ds.setInput(DATA_PATH + "train_y.txt",
                'train',
                type='text',
                id='state_below',
                required=False,
                tokenization='tokenize_basic',
                pad_on_batch=True,
                build_vocabulary='target_text',
                offset=1,
                fill='end',
                max_text_len=30,
                max_words=30000)
    ds.setInput(None, 'val', type='ghost', id='state_below', required=False)

    # Raw source files of each split.
    for split, input_text_filename in zip(
        ['train', 'val'],
        [DATA_PATH + "train_x.txt", DATA_PATH + "val_x.txt"]):
        ds.setRawInput(input_text_filename,
                       split,
                       type='file-name',
                       id='raw_source_text',
                       overwrite_split=True)

    # Match the references with the inputs: there is a single reference per
    # input sample, hence repeat=1.
    keep_n_captions(ds, repeat=1, n=1, set_names=['val'])

    # Save the dataset instance so that it can be used in other experiments.
    saveDataset(ds, MODEL_PATH + "/dataset")

    # ---- 2. Creating and training the model ----
    # Start from the default hyperparameters and override the ones below.
    params = load_parameters()
    overrides = {
        'MODEL_TYPE': 'Transformer',
        'USE_CUDNN': use_gpu,
        'EARLY_STOP': True,
        'PATIENCE': 10,
        'SAVE_EACH_EVALUATION': True,
        'STORE_PATH': MODEL_PATH,
        'N_LAYERS_ENCODER': 2,
        'N_LAYERS_DECODER': 2,
        'N_HEADS': 100,
        'POS_UNK': False,  # current Transformer model requires this
        'ATTEND_ON_OUTPUT': True,  # current Transformer model requires this
        'MODEL_SIZE': 100,
        'SOURCE_TEXT_EMBEDDING_SIZE': 100,
        'TARGET_TEXT_EMBEDDING_SIZE': 100,
        'SKIP_VECTORS_HIDDEN_SIZE': 100,
        'ENCODER_HIDDEN_SIZE': 100,
        'DECODER_HIDDEN_SIZE': 100,
        'APPLY_DETOKENIZATION': True,
        'LENGTH_PENALTY': True,
        'LENGTH_NORM_FACTOR': 0.8,
        'MAX_INPUT_TEXT_LEN': 128,
        'MAX_OUTPUT_TEXT_LEN': 128,
        'STOP_METRIC': 'perplexity',
        'BEAM_SIZE': 20,
        # NOTE(review): fixed to 2 regardless of use_gpu - confirm this is intended.
        'N_GPUS': 2,
        'START_EVAL_ON_EPOCH': 1,
        'BATCH_SIZE': 128,
        'EVAL_EACH': 1,
        'MAX_EPOCH': 100,
        # This key was previously misspelled as 'PLOT_EVALULATION', so the
        # option never reached the code that reads 'PLOT_EVALUATION'.
        'PLOT_EVALUATION': True,
        'MODE': 'training',
        'BEAM_SEARCH': True,
        'TENSORBOARD': True,
    }
    for key, value in overrides.items():
        params[key] = value

    # The dataset is not passed in memory: train_model() receives the path of
    # the file written by saveDataset() above.
    train_model(params,
                load_dataset=MODEL_PATH +
                "/dataset/Dataset_tutorial_dataset.pkl")
Exemple #3
0
def _model_output_family(model_type):
    """Map MODEL_TYPE to 'predictor', 'score', 'tags' or None (no known family)."""
    if model_type == 'Predictor':
        return 'predictor'
    if (model_type == 'EstimatorSent' or model_type == 'EncSent'
            or 'EstimatorDoc' in model_type or 'EncDoc' in model_type):
        return 'score'
    if model_type in ('EstimatorWord', 'EncWord', 'EncWordAtt',
                      'EncPhraseAtt', 'EstimatorPhrase'):
        return 'tags'
    return None


def _split_file_path(params, split, suffix):
    """Return DATA_ROOT_PATH + '/' + TEXT_FILES[split] + suffix."""
    return params['DATA_ROOT_PATH'] + '/' + params['TEXT_FILES'][split] + suffix


def _register_output(ds, params, split, family, training):
    """Call ds.setOutput() for `split` with the options of the given model family."""
    if family == 'predictor':
        # Output is the target-language text itself.
        suffix, data_type, weights_default = params['TRG_LAN'], 'text', True
    elif family == 'score':
        # Output is read from the PRED_SCORE file as real values.
        suffix, data_type, weights_default = params['PRED_SCORE'], 'real', False
    else:
        # Output is read from the PRED_SCORE file as text.
        suffix, data_type, weights_default = params['PRED_SCORE'], 'text', False

    # Padding defaults to True everywhere except for the training split of
    # the 'score' family.
    pad_default = not (training and family == 'score')

    options = dict(
        type=data_type,
        id=params['OUTPUTS_IDS_DATASET'][0],
        tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
        pad_on_batch=params.get('PAD_ON_BATCH', pad_default),
        sample_weights=params.get('SAMPLE_WEIGHTS', weights_default),
        max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
        max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
        bpe_codes=params.get('BPE_CODES_PATH', None))

    if training:
        if family == 'predictor':
            # With PRED_VOCAB the vocabulary registered under the output id
            # is reused; otherwise a new one is built.
            if 'PRED_VOCAB' in params:
                build_vocabulary = params['OUTPUTS_IDS_DATASET'][0]
            else:
                build_vocabulary = True
        else:
            build_vocabulary = family == 'tags'
        options.update(
            build_vocabulary=build_vocabulary,
            fill=params.get('FILL', 'end'),
            min_occ=params.get('MIN_OCCURRENCES_OUTPUT_VOCAB', 0))

    ds.setOutput(_split_file_path(params, split, suffix), split, **options)


def _register_target_input(ds, params, split, data_type, input_index,
                           build_vocabulary, offset):
    """Call ds.setInput() on the target-language file of `split` for one input id."""
    ds.setInput(
        _split_file_path(params, split, params['TRG_LAN']),
        split,
        type=data_type,
        id=params['INPUTS_IDS_DATASET'][input_index],
        required=False,
        tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
        pad_on_batch=params.get('PAD_ON_BATCH', True),
        build_vocabulary=build_vocabulary,
        offset=offset,
        fill=params.get('FILL', 'end'),
        max_text_len=params.get('MAX_TRG_INPUT_TEXT_LEN', 3),
        max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
        bpe_codes=params.get('BPE_CODES_PATH', None))


def build_dataset(params, vocabulary=None, vocabulary_len=None):
    """
    Builds (or loads) a Dataset instance.

    :param params: Parameters specifying Dataset options. When the dataset is
                   rebuilt, params['MAX_SRC_INPUT_TEXT_LEN'] and
                   params['MAX_TRG_INPUT_TEXT_LEN'] are set in place from
                   params['MAX_INPUT_TEXT_LEN'] if they are missing or None.
    :param vocabulary: Dictionary forwarded to the Dataset constructor.
                       A new empty dictionary is created on each call if omitted.
    :param vocabulary_len: Dictionary forwarded to the Dataset constructor.
                           A new empty dictionary is created on each call if omitted.
    :return: Dataset object
    """
    if not params['REBUILD_DATASET']:
        # We can easily recover it with a single line
        return loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' +
                           params['DATASET_NAME'] + '_' + params['SRC_LAN'] +
                           params['TRG_LAN'] + '.pkl')

    # Fresh containers per call: default dict() objects in the signature
    # would be shared by every call that relies on the defaults.
    if vocabulary is None:
        vocabulary = {}
    if vocabulary_len is None:
        vocabulary_len = {}

    verbose = params['VERBOSE'] > 0
    base_path = params['DATA_ROOT_PATH']
    name = params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN']
    if verbose:
        logging.info('Building ' + name + ' dataset')

    ds = Dataset(name,
                 base_path,
                 silence=not verbose,
                 vocabulary=vocabulary,
                 vocabulary_len=vocabulary_len,
                 doc_size=params.get('SECOND_DIM_SIZE', 0))

    model_type = params['MODEL_TYPE']
    family = _model_output_family(model_type)
    align_from_raw = (params.get('ALIGN_FROM_RAW', True)
                      and not params.get('HOMOGENEOUS_BATCHES', False))
    no_ref = params.get('NO_REF', False)
    eval_sets = params.get('EVAL_ON_SETS', ['val'])

    # OUTPUT DATA
    # Train split first, then the evaluation splits (none when NO_REF is set).
    # A MODEL_TYPE outside the known families registers no output at all.
    if family is not None:
        _register_output(ds, params, 'train', family, training=True)
    if align_from_raw:
        ds.setRawOutput(_split_file_path(params, 'train', params['TRG_LAN']),
                        'train',
                        type='file-name',
                        id='raw_' + params['OUTPUTS_IDS_DATASET'][0])

    for split in ([] if no_ref else eval_sets):
        if params['TEXT_FILES'].get(split) is None:
            continue
        if family is not None:
            _register_output(ds, params, split, family, training=False)
        if align_from_raw:
            ds.setRawOutput(_split_file_path(params, split, params['TRG_LAN']),
                            split,
                            type='file-name',
                            id='raw_' + params['OUTPUTS_IDS_DATASET'][0])

    # INPUT DATA
    # We must ensure that the 'train' split is the first (for building the vocabulary)
    for length_key in ('MAX_SRC_INPUT_TEXT_LEN', 'MAX_TRG_INPUT_TEXT_LEN'):
        if params.get(length_key) is None:
            params[length_key] = params['MAX_INPUT_TEXT_LEN']

    data_type_src = 'text'
    data_type_trg = 'text'
    if 'EstimatorDoc' in model_type or 'EncDoc' in model_type:
        data_type_src = 'doc'
        data_type_trg = 'doc'
    # here we set to doc meaning just the 3d input
    if model_type == 'EstimatorPhrase' or model_type == 'EncPhraseAtt':
        data_type_trg = 'doc'

    target_dict = 'target_text'
    reuse_vocabulary = 'PRED_VOCAB' in params

    for split in ['train', 'val', 'test']:
        if params['TEXT_FILES'].get(split) is None:
            continue
        is_train = split == 'train'

        # Source-side input. With PRED_VOCAB the vocabulary registered under
        # the first input id is reused; otherwise only 'train' builds one.
        if reuse_vocabulary:
            source_vocabulary = params['INPUTS_IDS_DATASET'][0]
        else:
            source_vocabulary = is_train
        ds.setInput(
            _split_file_path(params, split, params['SRC_LAN']),
            split,
            type=data_type_src,
            id=params['INPUTS_IDS_DATASET'][0],
            pad_on_batch=params.get('PAD_ON_BATCH', True),
            tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'),
            build_vocabulary=source_vocabulary,
            fill=params.get('FILL', 'end'),
            max_text_len=params.get('MAX_SRC_INPUT_TEXT_LEN', 70),
            max_words=params.get('INPUT_VOCABULARY_SIZE', 0),
            min_occ=params.get('MIN_OCCURRENCES_INPUT_VOCAB', 0),
            bpe_codes=params.get('BPE_CODES_PATH', None))

        # Target-side inputs. The first one builds a vocabulary on 'train'
        # when PRED_VOCAB is absent; every other case reuses target_dict.
        if not reuse_vocabulary and is_train:
            first_vocabulary = True
        else:
            first_vocabulary = target_dict

        n_inputs = len(params['INPUTS_IDS_DATASET'])
        if n_inputs == 2:
            _register_target_input(ds, params, split, data_type_trg, 1,
                                   first_vocabulary, 0)
        if n_inputs > 2:
            # NOTE(review): this branch indexes INPUTS_IDS_DATASET[3], so a
            # list with exactly three ids raises IndexError - confirm whether
            # four ids are always expected here.
            _register_target_input(ds, params, split, data_type_trg, 1,
                                   first_vocabulary, 1)
            _register_target_input(ds, params, split, data_type_trg, 2,
                                   target_dict, -1)
            _register_target_input(ds, params, split, data_type_trg, 3,
                                   target_dict, 0)

        if align_from_raw:
            ds.setRawInput(_split_file_path(params, split, params['SRC_LAN']),
                           split,
                           type='file-name',
                           id='raw_' + params['INPUTS_IDS_DATASET'][0])

    if params.get('POS_UNK', False) and params.get('HEURISTIC', 0) > 0:
        ds.loadMapping(params['MAPPING'])

    # If we had multiple references per sentence. The same default as in the
    # output loop is used, so a missing EVAL_ON_SETS no longer raises KeyError.
    if not no_ref:
        keep_n_captions(ds, repeat=1, n=1, set_names=eval_sets)

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, params['DATASET_STORE_PATH'])

    return ds
Exemple #4
0
def start_training(use_gpu):
    """
    Builds and stores the tutorial dataset, then trains an NMT model on it from scratch.

    Source sentences are read from the `*_error.txt` files and target sentences from the
    `*_correct.txt` files under `PATH`. Hyperparameters start from `load_parameters()` and
    are overridden below before calling `train_model`.

    :param use_gpu: Value assigned to params['USE_CUDNN']
    :return: None
    """
    # ## 1. Building the dataset
    ds = Dataset('tutorial_dataset', 'tutorial', silence=False)

    # Target sentences (outputs). The vocabulary is only built on the 'train' split.
    ds.setOutput(PATH + "train_correct.txt",
                 'train',
                 type='text',
                 id='target_text',
                 tokenization='tokenize_basic',
                 build_vocabulary=True,
                 pad_on_batch=True,
                 sample_weights=True,
                 max_text_len=100,
                 max_words=55000,
                 min_occ=1)

    ds.setOutput(PATH + "validation_correct.txt",
                 'val',
                 type='text',
                 id='target_text',
                 pad_on_batch=True,
                 tokenization='tokenize_basic',
                 sample_weights=True,
                 max_text_len=100,
                 max_words=0)

    # Source sentences (inputs). Again, the vocabulary is only built on the 'train' split.
    ds.setInput(PATH + "train_error.txt",
                'train',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                build_vocabulary=True,
                fill='end',
                max_text_len=100,
                max_words=55000,
                min_occ=1)

    ds.setInput(PATH + "validation_error.txt",
                'val',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                fill='end',
                max_text_len=100,
                min_occ=1)

    # ...and for the 'state_below' data. Note that:
    # 1) The offset flag is set to 1, which means that the text will be shifted to the right 1 position.
    # 2) During sampling time, we won't have this input. Hence, we 'hack' the dataset model by
    #    inserting an artificial input, of type 'ghost', for the validation split.
    ds.setInput(PATH + "train_correct.txt",
                'train',
                type='text',
                id='state_below',
                required=False,
                tokenization='tokenize_basic',
                pad_on_batch=True,
                build_vocabulary='target_text',
                offset=1,
                fill='end',
                max_text_len=100,
                max_words=55000)
    ds.setInput(None, 'val', type='ghost', id='state_below', required=False)

    # We can also keep the literal source words (for replacing unknown words).
    raw_sources = (('train', PATH + "train_error.txt"),
                   ('val', PATH + "validation_error.txt"))
    for split, input_text_filename in raw_sources:
        ds.setRawInput(input_text_filename,
                       split,
                       type='file-name',
                       id='raw_source_text',
                       overwrite_split=True)

    # We also need to match the references with the inputs. Since we only have one reference
    # per input sample, we set `repeat=1`.
    keep_n_captions(ds, repeat=1, n=1, set_names=['val'])

    # Finally, we can save our dataset instance for using in other experiments.
    saveDataset(ds, PATH + "dataset")

    # ## 2. Creating and training a Neural Translation Model
    # We use the default hyperparameters from `config.py` (through `load_parameters`) and
    # override the ones below. The dataset is reloaded from disk (not necessary as it is in
    # memory, but as a demonstration).
    params = load_parameters()
    dataset = loadDataset(PATH + "dataset/Dataset_tutorial_dataset.pkl")

    overrides = {
        # Since the number of words in the dataset may be unknown beforehand, we must update
        # the params information according to the dataset instance.
        'INPUT_VOCABULARY_SIZE': dataset.vocabulary_len['source_text'],
        'OUTPUT_VOCABULARY_SIZE': dataset.vocabulary_len['target_text'],
        'USE_CUDNN': use_gpu,
        'N_GPUS': 2,
        'MAX_EPOCH': 1000,
        'EARLY_STOP': True,
        'PATIENCE': 10,
        'SAVE_EACH_EVALUATION': True,
        'STORE_PATH': PATH + "model/",
        'BATCH_SIZE': 128,
        'ATTENTION_MODE': "add",
        'N_LAYERS_ENCODER': 2,
        'N_LAYERS_DECODER': 2,
        'SOURCE_TEXT_EMBEDDING_SIZE': 512,
        'TARGET_TEXT_EMBEDDING_SIZE': 512,
        'SKIP_VECTORS_HIDDEN_SIZE': 512,
        'ATTENTION_SIZE': 512,
        'ENCODER_HIDDEN_SIZE': 512,
        'DECODER_HIDDEN_SIZE': 512,
        'ENCODER_RNN_TYPE': "LSTM",
        'DECODER_RNN_TYPE': "ConditionalLSTM",
        'METRICS': ['coco'],
        'KERAS_METRICS': ['perplexity'],
        'APPLY_DETOKENIZATION': True,
        'LENGTH_PENALTY': True,
        'LENGTH_NORM_FACTOR': 1.0,
        'BEAM_SIZE': 1,
        'BEAM_SEARCH': True,
        'PLOT_EVALUATION': True,
        'MAX_PLOT_Y': 1.,
        'MODE': 'training',
        'TENSORBOARD': True,
    }
    # Item assignment (rather than `update`) keeps the same access pattern on `params`
    # regardless of the mapping type returned by `load_parameters`.
    for key, value in overrides.items():
        params[key] = value

    # The banner text has no placeholders, so it is passed as-is. Formatting it with an
    # undefined name would only raise a NameError.
    result = pyfiglet.figlet_format("START TRAINING FROM SCRATCH",
                                    font="digital")
    print(result)

    # NOTE(review): the dataset was stored under PATH + "dataset" above, but is handed to
    # `train_model` through the current working directory. Both locations only coincide if
    # PATH points at os.getcwd() + "/" -- confirm.
    train_model(params,
                load_dataset=os.getcwd() +
                "/dataset/Dataset_tutorial_dataset.pkl")
def build_dataset(params):
    """
    Builds (or loads) a Dataset instance.

    If params['REBUILD_DATASET'] is truthy, a new Dataset is assembled from the files listed in
    params['TEXT_FILES'] (located under params['DATA_ROOT_PATH']) and stored in
    params['DATASET_STORE_PATH']. Otherwise, the previously stored pickle is loaded.
    In both cases, `prepare_references` is applied on params['EVAL_ON_SETS'] before returning.

    :param params: Parameters specifying Dataset options
    :return: Dataset object
    """
    name = params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN']

    if not params['REBUILD_DATASET']:
        # We can easily recover it with a single line
        ds = loadDataset(
            os.path.join(params['DATASET_STORE_PATH'],
                         'Dataset_' + name + '.pkl'))

        # Prepare references
        prepare_references(ds, repeat=1, n=1, set_names=params['EVAL_ON_SETS'])
        return ds

    # We build a new dataset instance
    silence = not params['VERBOSE'] > 0
    if not silence:
        logger.info('Building ' + name + ' dataset')

    base_path = params['DATA_ROOT_PATH']
    ds = Dataset(name, base_path, silence=silence)

    text_files = params['TEXT_FILES']
    src_lan = params['SRC_LAN']
    trg_lan = params['TRG_LAN']
    output_id = params['OUTPUTS_IDS_DATASET'][0]

    # Options shared by several of the calls below
    tokenization = params.get('TOKENIZATION_METHOD', 'tokenize_none')
    pad_on_batch = params.get('PAD_ON_BATCH', True)
    sample_weights = params.get('SAMPLE_WEIGHTS', True)
    fill = params.get('FILL', 'end')
    bpe_codes = params.get('BPE_CODES_PATH', None)
    max_output_len = params.get('MAX_OUTPUT_TEXT_LEN', 70)
    output_vocabulary_size = params.get('OUTPUT_VOCABULARY_SIZE', 0)

    # The default output type depends on params['LOSS']. It is only looked up when
    # 'OUTPUTS_TYPES_DATASET' is not given, so that 'LOSS' is not required in that case.
    if 'OUTPUTS_TYPES_DATASET' in params:
        train_output_type = params['OUTPUTS_TYPES_DATASET'][0]
    elif 'sparse' in params['LOSS']:
        train_output_type = 'dense-text'
    else:
        train_output_type = 'text'

    # OUTPUT DATA
    # Load the train, val and test splits of the target language sentences (outputs).
    # The files include a sentence per line.
    ds.setOutput(
        os.path.join(base_path, text_files['train'] + trg_lan),
        'train',
        type=train_output_type,
        id=output_id,
        tokenization=tokenization,
        build_vocabulary=True,
        pad_on_batch=pad_on_batch,
        sample_weights=sample_weights,
        fill=fill,
        max_text_len=max_output_len,
        max_words=output_vocabulary_size,
        min_occ=params.get('MIN_OCCURRENCES_OUTPUT_VOCAB', 0),
        bpe_codes=bpe_codes,
        label_smoothing=params.get('LABEL_SMOOTHING', 0.))

    for split in ['val', 'test']:
        if text_files.get(split) is None:
            continue
        ds.setOutput(
            os.path.join(base_path, text_files[split] + trg_lan),
            split,
            type='text',  # The type of the references should be always 'text'
            id=output_id,
            pad_on_batch=pad_on_batch,
            tokenization=tokenization,
            sample_weights=sample_weights,
            max_text_len=max_output_len,
            max_words=output_vocabulary_size,
            bpe_codes=bpe_codes,
            label_smoothing=0.)

    # INPUT DATA
    # We must ensure that the 'train' split is the first (for building the vocabulary).
    # The sort is stable, so the remaining splits keep their original relative order.
    input_splits = sorted(text_files, key=lambda split_name: split_name != 'train')
    inputs_ids = params['INPUTS_IDS_DATASET']
    align_from_raw = params.get('ALIGN_FROM_RAW', True) and not params.get(
        'HOMOGENEOUS_BATCHES', False)

    for split in input_splits:
        source_file = os.path.join(base_path, text_files[split] + src_lan)
        ds.setInput(source_file,
                    split,
                    type=params.get('INPUTS_TYPES_DATASET',
                                    ['text', 'text'])[0],
                    id=inputs_ids[0],
                    pad_on_batch=pad_on_batch,
                    tokenization=tokenization,
                    build_vocabulary=split == 'train',
                    fill=fill,
                    max_text_len=params.get('MAX_INPUT_TEXT_LEN', 70),
                    max_words=params.get('INPUT_VOCABULARY_SIZE', 0),
                    min_occ=params.get('MIN_OCCURRENCES_INPUT_VOCAB', 0),
                    bpe_codes=bpe_codes)

        if len(inputs_ids) > 1:
            if 'train' in split:
                # Second input: the target sentence shifted one position (offset=1),
                # sharing the vocabulary of the output.
                ds.setInput(
                    os.path.join(base_path, text_files[split] + trg_lan),
                    split,
                    type=params.get('INPUTS_TYPES_DATASET',
                                    ['text', 'text'])[1],
                    id=inputs_ids[1],
                    required=False,
                    tokenization=tokenization,
                    pad_on_batch=pad_on_batch,
                    build_vocabulary=output_id,
                    offset=1,
                    fill=fill,
                    max_text_len=max_output_len,
                    max_words=output_vocabulary_size,
                    bpe_codes=bpe_codes)
                if params.get('TIE_EMBEDDINGS', False):
                    ds.merge_vocabularies([inputs_ids[1], inputs_ids[0]])
            else:
                # This input is not loaded from a file on the other splits:
                # a 'ghost' input is registered in its place.
                ds.setInput(None,
                            split,
                            type='ghost',
                            id=inputs_ids[-1],
                            required=False)

        if align_from_raw:
            ds.setRawInput(source_file,
                           split,
                           type='file-name',
                           id='raw_' + inputs_ids[0])

    if params.get('POS_UNK', False) and params.get('HEURISTIC', 0) > 0:
        ds.loadMapping(params['MAPPING'])

    # Prepare references
    prepare_references(ds, repeat=1, n=1, set_names=params['EVAL_ON_SETS'])

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, params['DATASET_STORE_PATH'])

    return ds