Example #1
def get_train_stream(configuration, sfiles, tfiles, svocab_dict, tvocab_dict):

	s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,\
		unk_token='<unk>', level='word', preprocess=None, encoding='utf8')
	t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,\
		unk_token='<unk>', level='word', preprocess=None, encoding='utf8')

	# Merge 
	stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))
	# Filter -- TODO 
	stream = Filter(stream, predicate=_too_long(seq_len=configuration['seq_len']))

	# Map - no need 

	# Batch - Sort 
	stream = Batch(stream, 
		iteration_scheme=ConstantScheme(
			configuration['batch_size']*configuration['sort_k_batches']))
	stream = Mapping(stream, SortMapping(_length))
	stream = Unpack(stream)
	stream = Batch(
        stream, iteration_scheme=ConstantScheme(configuration['batch_size']))

	# Pad 
	# Note that </s>=0; Fuel's Padding pads with 0 by default
	masked_stream = Padding(stream)

	return masked_stream
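All of these snippets assume the Fuel data pipeline API plus cPickle for the pickled vocabularies; the project-specific helpers (`_too_long`, `_oov_to_unk`, `_ensure_special_tokens`, ...) are sketched further below. For orientation, here are the imports the examples rely on and a minimal, hypothetical usage of the stream built in Example #1 (`Padding` adds a `*_mask` source for every original source):

# Imports assumed throughout these examples (Python 2 / Fuel).
import cPickle
from fuel.datasets import TextFile, IterableDataset
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import (Merge, Batch, Filter, Mapping, SortMapping,
                               Unpack, Padding)

# Hypothetical usage of get_train_stream from Example #1; file names and
# vocabulary paths are placeholders.
configuration = {'seq_len': 50, 'batch_size': 80, 'sort_k_batches': 12}
svocab_dict = cPickle.load(open('vocab.src.pkl', 'rb'))   # hypothetical path
tvocab_dict = cPickle.load(open('vocab.trg.pkl', 'rb'))   # hypothetical path

train_stream = get_train_stream(configuration,
                                ['train.src'], ['train.trg'],
                                svocab_dict, tvocab_dict)
for source, source_mask, target, target_mask in train_stream.get_epoch_iterator():
    # source/target are (batch, time) integer arrays; the masks flag real tokens.
    pass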
Example #2
def _get_align_stream(src_data, trg_data, src_vocab_size, trg_vocab_size,
                      seq_len, **kwargs):
    """Creates the stream which is used for the main loop.
    
    Args:
        src_data (string): Path to the source sentences
        trg_data (string): Path to the target sentences
        src_vocab_size (int): Size of the source vocabulary in the NMT
                              model
        trg_vocab_size (int): Size of the target vocabulary in the NMT
                              model
        seq_len (int): Maximum length of any source or target sentence
    
    Returns:
        ExplicitNext. Alignment data stream which can be iterated
        explicitly
    """
    # Build dummy vocabulary to make TextFile happy
    src_vocab = _add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = _add_special_ids({str(i): i for i in xrange(trg_vocab_size)})
    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)
    # Merge them to get a source, target pair
    s = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))
    s = Batch(s, iteration_scheme=ConstantScheme(1))
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])
    return ExplicitNext(masked_stream)
Example #3
def get_dev_stream_with_topicalq(test_set=None,
                                 src_vocab=None,
                                 src_vocab_size=30000,
                                 topical_test_set=None,
                                 topical_vocab=None,
                                 topical_vocab_size=2000,
                                 unk_id=1,
                                 **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if test_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=src_vocab_size - 1,
                                           unk_idx=unk_id)
        print test_set, type(src_vocab)
        topical_vocab = cPickle.load(open(topical_vocab, 'rb'))
        # special tokens are not ensured for the topical vocabulary
        topical_dataset = TextFile([topical_test_set], topical_vocab, None,
                                   None, '10')
        dev_dataset = TextFile([test_set], src_vocab, None)
        #dev_stream = DataStream(dev_dataset)
        # Merge them to get a source, target pair
        dev_stream = Merge([
            dev_dataset.get_example_stream(),
            topical_dataset.get_example_stream()
        ], ('source', 'source_topical'))
    return dev_stream
Example #4
def _get_align_stream(src_data, 
                      trg_data, 
                      src_vocab_size, 
                      trg_vocab_size, 
                      seq_len, 
                      **kwargs):
    """Creates the stream which is used for the main loop.
    
    Args:
        src_data (string): Path to the source sentences
        trg_data (string): Path to the target sentences
        src_vocab_size (int): Size of the source vocabulary in the NMT
                              model
        trg_vocab_size (int): Size of the target vocabulary in the NMT
                              model
        seq_len (int): Maximum length of any source or target sentence
    
    Returns:
        ExplicitNext. Alignment data stream which can be iterated
        explicitly
    """
    # Build dummy vocabulary to make TextFile happy
    src_vocab = _add_special_ids({str(i) : i for i in xrange(src_vocab_size)})
    trg_vocab = _add_special_ids({str(i) : i for i in xrange(trg_vocab_size)})
    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)
    # Merge them to get a source, target pair
    s = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))
    s = Batch(s, iteration_scheme=ConstantScheme(1))
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])
    return ExplicitNext(masked_stream)
Example #5
def get_dev_stream(val_set=None,
                   valid_sent_dict=None,
                   src_vocab=None,
                   trg_vocab=None,
                   src_vocab_size=30000,
                   trg_vocab_size=30000,
                   unk_id=1,
                   **kwargs):
    """Setup development set stream if necessary."""

    dev_stream = None
    if val_set is not None and src_vocab is not None:
        # Load dictionaries and ensure special tokens exist
        src_vocab = ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab)),
                                          bos_idx=0,
                                          eos_idx=src_vocab_size - 1,
                                          unk_idx=unk_id)

        trg_vocab = ensure_special_tokens(trg_vocab if isinstance(
            trg_vocab, dict) else cPickle.load(open(trg_vocab)),
                                          bos_idx=0,
                                          eos_idx=trg_vocab_size - 1,
                                          unk_idx=unk_id)

        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_dictset = TextFile([valid_sent_dict], trg_vocab, None)
        #dev_stream = DataStream(dev_dataset)
        # Merge them to get a source, target pair
        dev_stream = Merge([
            dev_dataset.get_example_stream(),
            dev_dictset.get_example_stream()
        ], ('source', 'valid_sent_trg_dict'))
    return dev_stream
Example #6
def get_logprob_streams(config):
    if 'log_prob_sets' not in config:
        return None

    cgs = config['cgs']
    enc_ids, dec_ids = get_enc_dec_ids(cgs)
    datasets = config['log_prob_sets']

    # Prepare source vocabs and files, make sure special tokens are there
    src_vocabs = {k: cPickle.load(open(v))
                  for k, v in config['src_vocabs'].iteritems()}
    for k in src_vocabs.keys():
        src_vocabs[k]['<S>'] = 0
        src_vocabs[k]['</S>'] = config['src_eos_idxs'][k]
        src_vocabs[k]['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_vocabs = {k: cPickle.load(open(v))
                  for k, v in config['trg_vocabs'].iteritems()}
    for k in trg_vocabs.keys():
        trg_vocabs[k]['<S>'] = 0
        trg_vocabs[k]['</S>'] = config['trg_eos_idxs'][k]
        trg_vocabs[k]['<UNK>'] = config['unk_id']

    # Build the preprocessing pipeline for individual streams
    ind_streams = {}
    for cg in cgs:
        eid, did = p_(cg)
        if cg not in datasets:
            continue
        logger.info('Building logprob stream for cg:[{}]'.format(cg))
        src_dataset = TextFile([datasets[cg][0]], src_vocabs[p_(cg)[0]], None)
        trg_dataset = TextFile([datasets[cg][1]], trg_vocabs[p_(cg)[1]], None)
        stream = Merge([src_dataset.get_example_stream(),
                        trg_dataset.get_example_stream()],
                       ('source', 'target'))

        stream = Mapping(stream, _oov_to_unk(
                         src_vocab_size=config['src_vocab_sizes'][eid],
                         trg_vocab_size=config['trg_vocab_sizes'][did],
                         unk_id=config['unk_id']))
        bs = 100
        if 'log_prob_bs' in config:
            if isinstance(config['log_prob_bs'], dict):
                bs = config['log_prob_bs'][cg]
            else:
                bs = config['log_prob_bs']

        stream = Batch(stream, iteration_scheme=ConstantScheme(bs))

        masked_stream = Padding(stream)
        masked_stream = Mapping(
            masked_stream, _remapWordIdx(
                [(0, 0, config['src_eos_idxs'][eid]),
                 (2, 0, config['trg_eos_idxs'][did])]))
        ind_streams[cg] = masked_stream

    return ind_streams
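The `_remapWordIdx` helper is not shown in these snippets. Judging from how it is called above, each mapping tuple is (position in the data tuple, old value, new value), and it turns the 0-padding produced by `Padding` into the real end-of-sequence index. A rough reconstruction (an assumption, not the original code):

class _remapWordIdx(object):
    """Replace `old` with `new` in the array at position `idx` of each batch."""
    def __init__(self, mappings):
        # mappings: list of (data_index, old_value, new_value) tuples
        self.mappings = mappings

    def __call__(self, data):
        data = list(data)
        for idx, old, new in self.mappings:
            arr = data[idx].copy()
            arr[arr == old] = new
            data[idx] = arr
        return tuple(data)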
Example #7
def get_src_trg_stream(cg, config, src_datasets=None, trg_datasets=None,
                       is_training=True, src_vocabs=None, trg_vocabs=None,
                       logprob_datasets=None):
    eid, did = p_(cg)
    if is_training:
        logger.info(' ... src:[{}] - [{}]'.format(
            eid, src_datasets[cg].files[0]))
        logger.info(' ... trg:[{}] - [{}]'.format(
            did, trg_datasets[cg].files[0]))
        stream = Merge([src_datasets[cg].get_example_stream(),
                        trg_datasets[cg].get_example_stream()],
                       ('source', 'target'))
        stream = Filter(stream, predicate=_too_long(config['src_seq_len'],
                                                    config['tgt_seq_len']))

        if 'min_seq_lens' in config and config['min_seq_lens'][cg] > 0:
            stream = Filter(stream,
                            predicate=_too_short(config['min_seq_lens'][cg]))

        stream = Mapping(stream, _oov_to_unk(
                         src_vocab_size=config['src_vocab_sizes'][eid],
                         trg_vocab_size=config['trg_vocab_sizes'][did],
                         unk_id=config['unk_id']))
        stream = Batch(
            stream, iteration_scheme=ConstantScheme(
                config['batch_sizes'][cg]*config['sort_k_batches']))

        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)
        stream = Batch(stream, iteration_scheme=ConstantScheme(
            config['batch_sizes'][cg]))
    else:  # logprob stream
        src_dataset = TextFile([logprob_datasets[cg][0]],
                               src_vocabs[p_(cg)[0]], None)
        trg_dataset = TextFile([logprob_datasets[cg][1]],
                               trg_vocabs[p_(cg)[1]], None)
        stream = Merge([src_dataset.get_example_stream(),
                        trg_dataset.get_example_stream()],
                       ('source', 'target'))
        stream = Mapping(stream, _oov_to_unk(
                         src_vocab_size=config['src_vocab_sizes'][eid],
                         trg_vocab_size=config['trg_vocab_sizes'][did],
                         unk_id=config['unk_id']))
        bs = 100
        if 'log_prob_bs' in config:
            if isinstance(config['log_prob_bs'], dict):
                bs = config['log_prob_bs'][cg]
            else:
                bs = config['log_prob_bs']
        stream = Batch(stream, iteration_scheme=ConstantScheme(bs))

    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream, _remapWordIdx(
            [(0, 0, config['src_eos_idxs'][eid]),
             (2, 0, config['trg_eos_idxs'][did])]))
    return masked_stream
Example #8
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream
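None of the snippets define the small helpers they rely on (`_too_long`, `_oov_to_unk`, `_length`, `_ensure_special_tokens`). The sketches below follow the blocks-examples machine_translation stream module fairly closely, but treat them as an illustrative reconstruction; signatures vary slightly between repositories (Example #7, for instance, calls `_too_long` with two length arguments).

# Illustrative reconstruction of the helper callables used in these examples.

class _too_long(object):
    """Filter predicate: keep a pair only if every sequence fits in seq_len."""
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        return all([len(sentence) <= self.seq_len
                    for sentence in sentence_pair])


class _oov_to_unk(object):
    """Map word ids that fall outside the vocabulary to the <UNK> id."""
    def __init__(self, src_vocab_size=30000, trg_vocab_size=30000, unk_id=1):
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.unk_id = unk_id

    def __call__(self, sentence_pair):
        return ([x if x < self.src_vocab_size else self.unk_id
                 for x in sentence_pair[0]],
                [x if x < self.trg_vocab_size else self.unk_id
                 for x in sentence_pair[1]])


def _length(sentence_pair):
    """Sort key: length of the target (last) sequence."""
    return len(sentence_pair[-1])


def _ensure_special_tokens(vocab, bos_idx=0, eos_idx=0, unk_idx=1):
    """Make sure the vocabulary maps <S>, </S>, <UNK> to the reserved indices."""
    # Remove whatever currently occupies the reserved indices.
    for word in [w for w, idx in vocab.items()
                 if idx in (bos_idx, eos_idx, unk_idx)]:
        del vocab[word]
    vocab['<S>'] = bos_idx
    vocab['</S>'] = eos_idx
    vocab['<UNK>'] = unk_idx
    return vocab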
Example #9
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream
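`PaddingWithEOS`, used by Examples #8 and #9, pads each sequence with a given end-of-sequence index instead of Fuel's default 0; its implementation is not included in this listing. One way to get the same effect with stock Fuel transformers is to pad with 0 and then remap the padded positions, much like the `Padding` + `_remapWordIdx` combination in Example #6. A hedged sketch for a batch stream with exactly the sources ('source', 'target'):

# Hedged sketch: emulating PaddingWithEOS with plain Padding plus a remap step.
from fuel.transformers import Padding, Mapping

def _pad_with_eos(stream, src_eos_idx, trg_eos_idx):
    # Padding pads with 0 and adds 'source_mask'/'target_mask' sources.
    padded = Padding(stream)

    def remap(data):
        source, source_mask, target, target_mask = data
        source = source.copy()
        target = target.copy()
        # Positions with mask == 0 are padding; overwrite them with the EOS id.
        source[source_mask == 0] = src_eos_idx
        target[target_mask == 0] = trg_eos_idx
        return source, source_mask, target, target_mask

    return Mapping(padded, remap)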
Example #10
def get_sgnmt_tr_stream(src_data,
                        trg_data,
                        src_vocab_size=30000,
                        trg_vocab_size=30000,
                        unk_id=1,
                        seq_len=50,
                        batch_size=80,
                        sort_k_batches=12,
                        **kwargs):
    """Prepares the unshuffled training data stream. This corresponds 
    to ``get_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(
        s,
        stream._oov_to_unk(src_vocab_size=src_vocab_size,
                           trg_vocab_size=trg_vocab_size,
                           unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
Example #11
def get_test_stream_withContext_grdTruth(test_ctx_datas=None,
                                         test_set_source=None,
                                         test_set_target=None,
                                         src_vocab=None,
                                         src_vocab_size=30000,
                                         trg_vocab=None,
                                         trg_vocab_size=30000,
                                         batch_size=128,
                                         unk_id=1,
                                         ctx_num=3,
                                         **kwargs):
    """Setup development set stream if necessary."""
    masked_stream = None
    if test_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=src_vocab_size - 1,
                                           unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
            trg_vocab, dict) else cPickle.load(open(trg_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=trg_vocab_size - 1,
                                           unk_idx=unk_id)
        print test_set_source, type(src_vocab)
        # Get text files from both source and target
        ctx_datasets = []
        for i in range(ctx_num):
            ctx_datasets.append(TextFile([test_ctx_datas[i]], src_vocab, None))
        dev_dataset = TextFile([test_set_source], src_vocab, None)
        dev_target = TextFile([test_set_target], trg_vocab, None)
        dev_stream = Merge([i.get_example_stream() for i in ctx_datasets] + [
            dev_dataset.get_example_stream(),
            dev_target.get_example_stream()
        ],
                           tuple('context_' + str(i) for i in range(ctx_num)) +
                           ('source', 'target'))
        stream = Mapping(
            dev_stream,
            _oov_to_unk(ctx_num=ctx_num,
                        src_vocab_size=src_vocab_size,
                        trg_vocab_size=trg_vocab_size,
                        unk_id=unk_id))

        # Build a batched version of stream to read k batches ahead
        stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
        masked_stream = PaddingWithEOSContext(
            stream, [src_vocab_size - 1
                     for i in range(ctx_num + 1)] + [trg_vocab_size - 1])

    return masked_stream
Example #12
def get_dev_stream_with_grdTruth(val_set_source=None,
                                 val_set_target=None,
                                 src_vocab=None,
                                 src_vocab_size=30000,
                                 trg_vocab=None,
                                 trg_vocab_size=30000,
                                 batch_size=128,
                                 unk_id=1,
                                 seq_len=50,
                                 **kwargs):
    """Setup development set stream if necessary."""
    masked_stream = None
    if val_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=src_vocab_size - 1,
                                           unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
            trg_vocab, dict) else cPickle.load(open(trg_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=trg_vocab_size - 1,
                                           unk_idx=unk_id)

        print val_set_source, type(src_vocab)
        dev_dataset = TextFile([val_set_source], src_vocab, None)
        trg_dataset = TextFile([val_set_target], trg_vocab, None)
        # Merge them to get a source, target pair
        dev_stream = Merge([
            dev_dataset.get_example_stream(),
            trg_dataset.get_example_stream()
        ], ('dev_source', 'dev_target'))
        # Filter sequences that are too long
        stream = Filter(dev_stream, predicate=_too_long(seq_len=seq_len))

        # Replace out of vocabulary tokens with unk token
        stream = Mapping(
            stream,
            _oov_to_unk(src_vocab_size=src_vocab_size,
                        trg_vocab_size=trg_vocab_size,
                        unk_id=unk_id))

        # Build a batched version of stream to read k batches ahead
        stream = Batch(stream, iteration_scheme=ConstantScheme(1))
        # Pad sequences that are short
        masked_stream = PaddingWithEOS(
            stream, [src_vocab_size - 1, trg_vocab_size - 1])
    return masked_stream
Example #13
def get_dev_stream_with_context_features(val_context_features=None, val_set=None, src_vocab=None,
                                         src_vocab_size=30000, unk_id=1, **kwargs):
    """Setup development set stream if necessary."""

    def _get_np_array(filename):
        return numpy.load(filename)['arr_0']


    dev_stream = None
    if val_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict) else
            cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        dev_dataset = TextFile([val_set], src_vocab, None)

        # now add the source with the image features
        # create the image datastream (iterate over a file line-by-line)
        con_features = _get_np_array(val_context_features)
        con_feature_dataset = IterableDataset(con_features)
        valid_image_stream = DataStream(con_feature_dataset)

        # dev_stream = DataStream(dev_dataset)
        dev_stream = Merge([dev_dataset.get_example_stream(),
                            valid_image_stream], ('source', 'initial_context'))
    #         dev_stream = dev_stream.get_example_stream()

    return dev_stream
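The context-feature file read by `_get_np_array` is assumed to be a NumPy `.npz` archive whose first array (`arr_0`) holds one feature vector per validation sentence. A small sketch of how such a file could be produced (shape and file name are hypothetical):

import numpy

# Hypothetical: one 2048-dimensional context vector per validation sentence.
features = numpy.random.rand(1000, 2048).astype('float32')
numpy.savez('val_context_features.npz', features)  # stored under 'arr_0'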
Example #14
def get_log_prob_stream(cg, config):
    eid, did = p_(cg)
    dataset = config['log_prob_sets'][cg]

    # Prepare source vocabs and files, make sure special tokens are there
    src_vocab = cPickle.load(open(config['src_vocabs'][eid]))
    src_vocab['<S>'] = 0
    src_vocab['</S>'] = config['src_eos_idxs'][eid]
    src_vocab['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_vocab = cPickle.load(open(config['trg_vocabs'][did]))
    trg_vocab['<S>'] = 0
    trg_vocab['</S>'] = config['trg_eos_idxs'][did]
    trg_vocab['<UNK>'] = config['unk_id']

    # Build the preprocessing pipeline for individual streams
    logger.info('Building logprob stream for cg:[{}]'.format(cg))
    src_dataset = TextFile([dataset[0]], src_vocab, None)
    trg_dataset = TextFile([dataset[1]], trg_vocab, None)
    stream = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))

    stream = Mapping(
        stream,
        _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid],
                    trg_vocab_size=config['trg_vocab_sizes'][did],
                    unk_id=config['unk_id']))
    bs = 100
    if 'log_prob_bs' in config:
        if isinstance(config['log_prob_bs'], dict):
            bs = config['log_prob_bs'][cg]
        else:
            bs = config['log_prob_bs']
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(bs,
                                                   num_examples=get_num_lines(
                                                       dataset[0])))

    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream,
        _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                       (2, 0, config['trg_eos_idxs'][did])]))

    return masked_stream
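`get_num_lines` is not shown in these snippets; presumably it just counts the sentences in the data file so that `ConstantScheme` knows how many examples one epoch contains. A minimal sketch:

def get_num_lines(path):
    """Count the number of lines (= sentences) in a text file."""
    with open(path) as f:
        return sum(1 for _ in f)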
Example #15
def get_dev_stream_with_prefix_file(val_set=None, val_set_grndtruth=None, val_set_prefixes=None, val_set_suffixes=None,
                                    src_vocab=None, src_vocab_size=30000, trg_vocab=None, trg_vocab_size=30000, unk_id=1,
                                    return_vocab=False, **kwargs):
    """Setup development stream with user-provided source, target, prefixes, and suffixes"""

    dev_stream = None
    if val_set is not None and val_set_grndtruth is not None and val_set_prefixes is not None and val_set_suffixes is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict) else
            cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict) else
            cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        # Note: user should have already provided the EOS token in the data representation for the suffix
        # Note: The reason that we need EOS tokens in the reference file is that IMT systems need to evaluate metrics
        # Note: which count prediction of the </S> token, and evaluation scripts are called on the files
        dev_source_dataset = TextFile([val_set], src_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_target_dataset = TextFile([val_set_grndtruth], trg_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_prefix_dataset = TextFile([val_set_prefixes], trg_vocab,
                                      bos_token='<S>',
                                      eos_token=None,
                                      unk_token='<UNK>')
        dev_suffix_dataset = TextFile([val_set_suffixes], trg_vocab,
                                      bos_token=None,
                                      eos_token=None,
                                      unk_token='<UNK>')

        dev_stream = Merge([dev_source_dataset.get_example_stream(),
                            dev_target_dataset.get_example_stream(),
                            dev_prefix_dataset.get_example_stream(),
                            dev_suffix_dataset.get_example_stream()],
                           ('source', 'target','target_prefix','target_suffix'))

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    else:
        return dev_stream
Example #16
def get_test_stream(sfiles, svocab_dict): 
	dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,\
		unk_token='<unk>', level='word', preprocess=None, encoding='utf8')
	stream = Merge([dataset.get_example_stream(),], ('source', ))
	stream = Batch(
        stream, iteration_scheme=ConstantScheme(10))
	stream = Padding(stream)
	return stream
Example #17
def get_log_prob_stream(cg, config):
    eid, did = p_(cg)
    dataset = config['log_prob_sets'][cg]

    # Prepare source vocabs and files, make sure special tokens are there
    src_vocab = cPickle.load(open(config['src_vocabs'][eid]))
    src_vocab['<S>'] = 0
    src_vocab['</S>'] = config['src_eos_idxs'][eid]
    src_vocab['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_vocab = cPickle.load(open(config['trg_vocabs'][did]))
    trg_vocab['<S>'] = 0
    trg_vocab['</S>'] = config['trg_eos_idxs'][did]
    trg_vocab['<UNK>'] = config['unk_id']

    # Build the preprocessing pipeline for individual streams
    logger.info('Building logprob stream for cg:[{}]'.format(cg))
    src_dataset = TextFile([dataset[0]], src_vocab, None)
    trg_dataset = TextFile([dataset[1]], trg_vocab, None)
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    stream = Mapping(stream, _oov_to_unk(
                     src_vocab_size=config['src_vocab_sizes'][eid],
                     trg_vocab_size=config['trg_vocab_sizes'][did],
                     unk_id=config['unk_id']))
    bs = 100
    if 'log_prob_bs' in config:
        if isinstance(config['log_prob_bs'], dict):
            bs = config['log_prob_bs'][cg]
        else:
            bs = config['log_prob_bs']
    stream = Batch(
        stream,
        iteration_scheme=ConstantScheme(
            bs, num_examples=get_num_lines(dataset[0])))

    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream, _remapWordIdx(
            [(0, 0, config['src_eos_idxs'][eid]),
             (2, 0, config['trg_eos_idxs'][did])]))

    return masked_stream
Example #18
def get_sgnmt_tr_stream(src_data, trg_data,
                       src_vocab_size=30000, trg_vocab_size=30000,
                       unk_id=1, seq_len=50, batch_size=80, 
                       sort_k_batches=12, **kwargs):
    """Prepares the unshuffled training data stream. This corresponds 
    to ``get_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i) : i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i) : i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(s, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                               trg_vocab_size=trg_vocab_size,
                               unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
Example #19
def get_dev_stream_with_prefixes(val_set=None, val_set_grndtruth=None, src_vocab=None, src_vocab_size=30000,
                                 trg_vocab=None, trg_vocab_size=30000, unk_id=1, return_vocab=False, **kwargs):
    """Setup development set stream if necessary."""

    dev_stream = None
    if val_set is not None and val_set_grndtruth is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict) else
            cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict) else
            cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        dev_source_dataset = TextFile([val_set], src_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_target_dataset = TextFile([val_set_grndtruth], trg_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')

        dev_stream = Merge([dev_source_dataset.get_example_stream(),
                            dev_target_dataset.get_example_stream()],
                           ('source', 'target'))

        # now add prefix and suffixes to this stream
        dev_stream = Mapping(dev_stream, PrefixSuffixStreamTransformer(sample_ratio=kwargs.get('dev_sample_ratio', 1.)),
                             add_sources=('target_prefix', 'target_suffix'))

        dev_stream = Mapping(dev_stream, CopySourceAndTargetToMatchPrefixes(dev_stream))

        # changing stream.produces_examples is a little hack which lets us use Unpack to flatten
        dev_stream.produces_examples = False
        # flatten the stream back out into (source, target, target_prefix, target_suffix)
        dev_stream = Unpack(dev_stream)

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    else:
        return dev_stream
Example #20
def get_dev_tr_stream_with_topic_target(val_set_source=None, val_set_target=None,
                                        src_vocab=None, trg_vocab=None,
                                        src_vocab_size=30000, trg_vocab_size=30000,
                                        trg_topic_vocab_size=2000,
                                        source_topic_vocab_size=2000,
                                        topical_dev_set=None, topic_vocab_input=None,
                                        topic_vocab_output=None, topical_vocab_size=2000,
                                        unk_id=1, **kwargs):
    """Prepares the training data stream."""

    masked_stream = None
    if val_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict) else
            cPickle.load(open(trg_vocab, 'rb')),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
        topic_vocab_input = cPickle.load(open(topic_vocab_input, 'rb'))
        # topic_vocab_output already has <UNK> and </S> in it
        topic_vocab_output = cPickle.load(open(topic_vocab_output, 'rb'))
        topic_binary_vocab = {}
        for k, v in topic_vocab_output.items():
            if k == '<UNK>':
                topic_binary_vocab[k] = 0
            else:
                topic_binary_vocab[k] = 1
        # Get text files from both source and target
        src_dataset = TextFile([val_set_source], src_vocab, None)
        trg_dataset = TextFile([val_set_target], trg_vocab, None)
        src_topic_input = TextFile([topical_dev_set], topic_vocab_input, None, None, 'rt')
        trg_topic_dataset = TextFile([val_set_target], topic_vocab_output, None)
        trg_topic_binary_dataset = TextFile([val_set_target], topic_binary_vocab, None)

        # Merge them to get a source, target pair
        dev_stream = Merge([src_dataset.get_example_stream(),
                            trg_dataset.get_example_stream(),
                            src_topic_input.get_example_stream(),
                            trg_topic_dataset.get_example_stream(),
                            trg_topic_binary_dataset.get_example_stream()],
                           ('source', 'target', 'source_topical',
                            'target_topic', 'target_binary_topic'))
        stream = Batch(dev_stream, iteration_scheme=ConstantScheme(1))
        masked_stream = PaddingWithEOS(
            stream, [src_vocab_size - 1, trg_vocab_size - 1,
                     source_topic_vocab_size - 1, trg_topic_vocab_size - 1,
                     trg_topic_vocab_size - 1])

    return masked_stream
Example #21
def get_dev_stream(sfiles, tfiles, svocab_dict, tvocab_dict):

	s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,\
		unk_token='<unk>', level='word', preprocess=None, encoding='utf8')
	t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,\
		unk_token='<unk>', level='word', preprocess=None, encoding='utf8')

	# Merge 
	stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))
	# Batch - Sort 
	stream = Batch(stream, 
		iteration_scheme=ConstantScheme(1006))
	# Pad 
	# Note that </s>=0; Fuel's Padding pads with 0 by default
	masked_stream = Padding(stream)

	return masked_stream
Example #22
def get_stream(vocab, data, vocab_size, unk_id, eos_id, bos_id, noise=0):
    vocab = get_vocab(vocab, vocab_size, unk_id, eos_id, bos_id)

    # Maps words to their index in the vocabulary. OOV words are replaced by <UNK> index.
    # Also appends </S> index at the end. No <S> token (TODO: bos_id parameter useless).
    dataset = TextFile([data], vocab, None)

    stream = Mapping(dataset.get_example_stream(), _add_noise(noise))
    stream.dataset = dataset  # for backward-compatibility
    return stream
Example #23
def get_test_stream(test_set=None,
                    src_vocab=None,
                    trg_vocab=None,
                    src_vocab_size=200000,
                    trg_vocab_size=6540,
                    unk_id=1,
                    sort_k_batches=12):
    """Prepares the testing data stream."""
    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(src_vocab if isinstance(
        src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
                                       bos_idx=0,
                                       eos_idx=src_vocab_size - 1,
                                       unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
        trg_vocab, dict) else cPickle.load(open(trg_vocab, 'rb')),
                                       bos_idx=0,
                                       eos_idx=trg_vocab_size - 1,
                                       unk_idx=unk_id)
    # Get text files from both source and target
    src_dataset = TextFile([test_set], src_vocab, None)
    trg_dataset = TextFile(['./data/test.zh'], trg_vocab, None)
    # Merge them to get a source, target pair
    stream = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))
    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream, _oov_to_unk())
    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream, iteration_scheme=ConstantScheme(sort_k_batches))
    # Convert it into a stream again
    stream = Unpack(stream)
    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(1))
    # Pad sequences that are short
    masked_stream = PaddingWithEOS(stream,
                                   [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream
Example #24
def _get_text_stream(src_data,
                     trg_data,
                     src_vocab_size=30000,
                     trg_vocab_size=30000,
                     **kwargs):
    """Creates a parallel data stream from two text files without 
    random access. This stream cannot be used with reshuffling.
    
    The arguments to this method are given by the configuration dict.
    """

    # Build dummy vocabulary to make TextFile happy
    src_vocab = _add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = _add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    return Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))
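`_add_special_ids` (and the `add_special_ids` variant in Examples #10 and #18) completes the dummy vocabulary with the reserved tokens. Presumably it maps them to the ids defined in the project's `utils` module, roughly as below; apart from `EOS_ID` and `UNK_ID`, which appear elsewhere in these snippets, the attribute names are assumptions:

def _add_special_ids(vocab):
    """Add the reserved <S>, </S>, <UNK> entries expected by TextFile.
    The ids come from the project's utils module (assumed here)."""
    vocab['<S>'] = utils.GO_ID    # assumed name of the BOS id
    vocab['</S>'] = utils.EOS_ID
    vocab['<UNK>'] = utils.UNK_ID
    return vocab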
Example #25
def get_dev_stream_withContext_withPosTag(test_ctx_datas=None,
                                          test_posTag_datas=None,
                                          test_set_source=None,
                                          src_vocab=None,
                                          src_vocab_size=30000,
                                          unk_id=1,
                                          ctx_num=3,
                                          **kwargs):
    """Setup development set stream if necessary."""
    masked_stream = None
    if test_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=src_vocab_size - 1,
                                           unk_idx=unk_id)
        print test_set_source, type(src_vocab)
        # Get text files from both source and target
        ctx_datasets = []
        posTag_datasets = []
        for i in range(ctx_num):
            ctx_datasets.append(TextFile([test_ctx_datas[i]], src_vocab, None))
            posTag_datasets.append(
                TextFile([test_posTag_datas[i]], src_vocab, None))
        posTag_datasets.append(
            TextFile([test_posTag_datas[ctx_num]], src_vocab, None))
        src_dataset = TextFile([test_set_source], src_vocab, None)

        # Merge them to get a source, target pair
        dev_stream = Merge(
            [i.get_example_stream() for i in ctx_datasets] +
            [i.get_example_stream()
             for i in posTag_datasets] + [src_dataset.get_example_stream()],
            tuple('context_' + str(i) for i in range(ctx_num)) +
            tuple('context_posTag_' + str(i)
                  for i in range(ctx_num)) + ('source_posTag', 'source'))

        stream = Mapping(
            dev_stream,
            _oov_to_unk_posTag_dev(ctx_num=ctx_num,
                                   src_vocab_size=src_vocab_size,
                                   unk_id=unk_id))

        # Build a batched version of stream to read k batches ahead
        stream = Batch(stream, iteration_scheme=ConstantScheme(1))
        masked_stream = PaddingWithEOSContext(
            stream, [src_vocab_size - 1 for i in range(2 * ctx_num + 2)])

    return masked_stream
Example #26
def get_textfile_stream(source_file=None,
                        src_vocab=None,
                        src_vocab_size=30000,
                        unk_id=1,
                        bos_token=None):
    """Create a TextFile dataset from a single text file, and return a stream"""
    if type(bos_token) is str:
        bos_token = bos_token.decode('utf8')

    src_vocab = _ensure_special_tokens(src_vocab if isinstance(
        src_vocab, dict) else cPickle.load(open(src_vocab)),
                                       bos_idx=0,
                                       eos_idx=src_vocab_size - 1,
                                       unk_idx=unk_id)
    source_dataset = TextFile([source_file],
                              src_vocab,
                              bos_token=bos_token,
                              eos_token=u'</S>',
                              unk_token=u'<UNK>',
                              encoding='utf8')
    source_stream = source_dataset.get_example_stream()
    return source_stream
Example #27
Globals.read_alphabet(f_vocab)

#print("Vocab:",Globals.char2code, file=sys.stderr);

Globals.read_lookup(f_train)

m = MorphGen(100, len(Globals.char2code))

dataset_options = dict(dictionary=Globals.char2code,
                       level="word",
                       preprocess=_tokenise)

dataset = TextFile([f_train], **dataset_options)

data_stream = dataset.get_example_stream()

# Read examples and look up the right surface form
data_stream = Mapping(data_stream, morph_lookup, add_sources=("targets", ))

# Read in 10 samples at a time
data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))

# Pad the examples
data_stream = Padding(data_stream)
data_stream = Mapping(data_stream, _transpose)

# Initialisation settings

m.weights_init = IsotropicGaussian(0.1)
m.biases_init = Constant(0.0)
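`Globals`, `MorphGen`, `_tokenise` and `morph_lookup` are project-specific and are not reconstructed here. `_transpose`, however, is the usual blocks helper that turns each padded (batch, time) array into the time-major layout the recurrent bricks expect:

def _transpose(data):
    """Swap the batch and time axes of every array in the padded batch."""
    return tuple(array.T for array in data)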
Example #28
def get_tr_stream(src_vocab,
                  trg_vocab,
                  src_data,
                  trg_data,
                  dict_data,
                  src_vocab_size=30000,
                  trg_vocab_size=30000,
                  unk_id=1,
                  seq_len=50,
                  batch_size=80,
                  sort_k_batches=12,
                  **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    '''
    actual_src_vocab_num = len(src_vocab)
    actual_trg_vocab_num = len(trg_vocab)
    src_vocab = ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=(actual_src_vocab_num - 1) if
	actual_src_vocab_num - 3 <
	src_vocab_size else (src_vocab_size + 3 -
		1), unk_idx=unk_id)
    trg_vocab = ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=(actual_trg_vocab_num - 1) if
	actual_trg_vocab_num - 3 < trg_vocab_size else
	(trg_vocab_size + 3 - 1), unk_idx=unk_id)
    '''

    src_vocab = ensure_special_tokens(src_vocab if isinstance(src_vocab, dict)
                                      else cPickle.load(open(src_vocab)),
                                      bos_idx=0,
                                      eos_idx=src_vocab_size - 1,
                                      unk_idx=unk_id)
    trg_vocab = ensure_special_tokens(trg_vocab if isinstance(trg_vocab, dict)
                                      else cPickle.load(open(trg_vocab)),
                                      bos_idx=0,
                                      eos_idx=trg_vocab_size - 1,
                                      unk_idx=unk_id)

    # for example:
    # source: 第五 章 罚则
    # target: chapter v penalty regulations
    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab)
    trg_dataset = TextFile([trg_data], trg_vocab)
    dict_dataset = TextFile([dict_data], trg_vocab)
    # for data in DataStream(src_dataset).get_epoch_iterator():
    #    print(data)     # looks like: ([0, 1649, 1764, 7458, 29999],)

    # Merge them to get a source, target pair
    stream = Merge([
        src_dataset.get_example_stream(),
        trg_dataset.get_example_stream(),
        dict_dataset.get_example_stream()
    ], ('source', 'target',
        'dict'))  # data_stream.sources = 'source' or 'target'
    '''
    print 'init \n'
    num_before_filter = 0
    for data in stream.get_epoch_iterator():
        num_before_filter = num_before_filter + 1
        # print(data)
    '''
    # looks like: ([0, 1649, 1764, 7458, 29999], [0, 2662, 9329, 968, 200, 29999])

    # Filter sequences that are too long
    # Neither the source nor the target sentence may exceed seq_len tokens;
    # this length includes the start symbol <s> and the end symbol </s>, so the
    # actual sentence length cannot exceed (seq_len - 2)
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))
    '''
    num_after_filter = 0
    # print 'after filter ... \n'
    for data in stream.get_epoch_iterator():
        num_after_filter = num_after_filter + 1
        # print(data)

    logger.info('\tby filtering, sentence-pairs from {} to {}.'.format(num_before_filter, num_after_filter))
    logger.info('\tfilter {} sentence-pairs whose source or target sentence exceeds {} words'.format(
        (num_before_filter - num_after_filter), seq_len))
    '''

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))  # arguably not needed here
    '''
    print 'after mapping unk ...'
    for data in stream.get_epoch_iterator():
        print(data)
    '''

    # still looks like: ([0, 1649, 1764, 7458, 29999], [0, 2662, 9329, 968, 200, 29999])
    # Build a batched version of the stream to read k batches ahead.
    # The training data is not sorted as a whole; it is first split into blocks
    # of (batch_size * sort_k_batches) sentence pairs, and sorting is done
    # within each block (the last block may be a smaller remainder).
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))
    '''
    print 'after sorted batch ... '
    for data in stream.get_epoch_iterator():
        print(data)
    '''

    # Sort all samples in the read-ahead block by the length of the target
    # sentence, so that sentences of similar length end up in the same batch
    # (this reduces padding and speeds up training).
    stream = Mapping(stream, SortMapping(_length))
    '''
    print 'after sort ... '
    for data in stream.get_epoch_iterator():
        print(data)
    '''

    # Convert it into a stream again
    stream = Unpack(stream)
    '''
    print 'after unpack ... '
    for data in stream.get_epoch_iterator():
        print(data)
    '''
    # still looks like: ([0, 1649, 1764, 7458, 29999], [0, 2662, 9329, 968, 200, 29999])

    # remove the remainder ?
    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # after sort, each batch has batch_size sentence pairs
    '''
    print 'after final batch ... '
    i = 0
    for data in stream.get_epoch_iterator():
        i = i + 1
        print(data)
    print 'batchs: ', i
    '''

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream,
        bos_idx=[0, 0, 0],
        eos_idx=[src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1])
    # print 'after padding with mask ...'
    return masked_stream
Example #29
def get_tr_stream(src_vocab,
                  trg_vocab,
                  src_data,
                  trg_data,
                  src_vocab_size=30000,
                  trg_vocab_size=30000,
                  unk_id=1,
                  seq_len=50,
                  batch_size=80,
                  sort_k_batches=12,
                  bos_token=None,
                  **kwargs):
    """Prepares the training data stream."""
    if type(bos_token) is str:
        bos_token = bos_token.decode('utf8')

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(src_vocab if isinstance(
        src_vocab, dict) else cPickle.load(open(src_vocab)),
                                       bos_idx=0,
                                       eos_idx=src_vocab_size - 1,
                                       unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
        trg_vocab, dict) else cPickle.load(open(trg_vocab)),
                                       bos_idx=0,
                                       eos_idx=trg_vocab_size - 1,
                                       unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data],
                           src_vocab,
                           bos_token=bos_token,
                           eos_token=u'</S>',
                           unk_token=u'<UNK>',
                           encoding='utf8')
    trg_dataset = TextFile([trg_data],
                           trg_vocab,
                           bos_token=bos_token,
                           eos_token=u'</S>',
                           unk_token=u'<UNK>',
                           encoding='utf8')

    # Merge them to get a source, target pair
    stream = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(
        stream,
        _oov_to_unk(src_vocab_size=src_vocab_size,
                    trg_vocab_size=trg_vocab_size,
                    unk_id=unk_id))

    # Now make a very big batch that we can shuffle
    shuffle_batch_size = kwargs.get('shuffle_batch_size', 1000)
    stream = Batch(stream, iteration_scheme=ConstantScheme(shuffle_batch_size))

    stream = ShuffleBatchTransformer(stream)

    # unpack it again
    stream = Unpack(stream)
    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(stream,
                                   [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream, src_vocab, trg_vocab
Example #30
def get_logprob_streams(config):
    if 'log_prob_sets' not in config:
        return None

    cgs = config['cgs']
    enc_ids, dec_ids = get_enc_dec_ids(cgs)
    datasets = config['log_prob_sets']

    # Prepare source vocabs and files, make sure special tokens are there
    src_vocabs = {
        k: cPickle.load(open(v))
        for k, v in config['src_vocabs'].iteritems()
    }
    for k in src_vocabs.keys():
        src_vocabs[k]['<S>'] = 0
        src_vocabs[k]['</S>'] = config['src_eos_idxs'][k]
        src_vocabs[k]['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_vocabs = {
        k: cPickle.load(open(v))
        for k, v in config['trg_vocabs'].iteritems()
    }
    for k in trg_vocabs.keys():
        trg_vocabs[k]['<S>'] = 0
        trg_vocabs[k]['</S>'] = config['trg_eos_idxs'][k]
        trg_vocabs[k]['<UNK>'] = config['unk_id']

    # Build the preprocessing pipeline for individual streams
    ind_streams = {}
    for cg in cgs:
        eid, did = p_(cg)
        if cg not in datasets:
            continue
        logger.info('Building logprob stream for cg:[{}]'.format(cg))
        src_dataset = TextFile([datasets[cg][0]], src_vocabs[p_(cg)[0]], None)
        trg_dataset = TextFile([datasets[cg][1]], trg_vocabs[p_(cg)[1]], None)
        stream = Merge([
            src_dataset.get_example_stream(),
            trg_dataset.get_example_stream()
        ], ('source', 'target'))

        stream = Mapping(
            stream,
            _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid],
                        trg_vocab_size=config['trg_vocab_sizes'][did],
                        unk_id=config['unk_id']))
        bs = 100
        if 'log_prob_bs' in config:
            if isinstance(config['log_prob_bs'], dict):
                bs = config['log_prob_bs'][cg]
            else:
                bs = config['log_prob_bs']

        stream = Batch(stream, iteration_scheme=ConstantScheme(bs))

        masked_stream = Padding(stream)
        masked_stream = Mapping(
            masked_stream,
            _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                           (2, 0, config['trg_eos_idxs'][did])]))
        ind_streams[cg] = masked_stream

    return ind_streams
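The _remapWordIdx mapping applied after Padding above rewrites the zeros that Padding introduces into the proper EOS indices for the source (position 0) and target (position 2) arrays of each batch. A rough sketch of such a callable, assuming each rule is a (source_index, old_id, new_id) triple, might look like this; it is not the repository's exact code:

import numpy


class _remapWordIdx(object):
    """Hypothetical reconstruction of the _remapWordIdx helper used above."""

    def __init__(self, rules):
        # Each rule is assumed to be a (source_index, old_id, new_id) triple.
        self.rules = rules

    def __call__(self, batch):
        batch = [numpy.asarray(source) for source in batch]
        for source_idx, old_id, new_id in self.rules:
            batch[source_idx][batch[source_idx] == old_id] = new_id
        return tuple(batch)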
Beispiel #31
0
def get_tr_stream_with_context_features(src_vocab, trg_vocab, src_data, trg_data, context_features,
                                        src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                                        seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    def _get_np_array(filename):
        return numpy.load(filename)['arr_0']

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))


    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    # now add the source with the image features
    # create the image datastream (iterate over a file line-by-line)
    train_features = _get_np_array(context_features)
    train_feature_dataset = IterableDataset(train_features)
    train_image_stream = DataStream(train_feature_dataset)

    stream = Merge([stream, train_image_stream], ('source', 'target', 'initial_context'))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1], mask_sources=('source', 'target'))

    return masked_stream, src_vocab, trg_vocab
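Since context_features is read with numpy.load(filename)['arr_0'], it is expected to be an .npz file whose single array is aligned line-for-line with the parallel corpus. A hypothetical invocation (all paths and sizes below are placeholders, not values from this repository):

tr_stream, src_vocab, trg_vocab = get_tr_stream_with_context_features(
    src_vocab='vocab.src.pkl', trg_vocab='vocab.trg.pkl',
    src_data='train.src', trg_data='train.trg',
    context_features='train_context_features.npz',
    src_vocab_size=30000, trg_vocab_size=30000,
    seq_len=50, batch_size=80, sort_k_batches=12)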
Beispiel #32
0
def get_tr_stream_with_prefixes(src_vocab, trg_vocab, src_data, trg_data, src_vocab_size=30000,
                                trg_vocab_size=30000, unk_id=1, seq_len=50,
                                batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the IMT training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # TODO: should the training stream actually have begin and end tokens?
    # Note: this depends on how the system was pre-trained, but systems used
    # for initialization should _always_ have BOS tokens

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab,
                           bos_token='<S>',
                           eos_token='</S>',
                           unk_token='<UNK>')
    trg_dataset = TextFile([trg_data], trg_vocab,
                           bos_token='<S>',
                           eos_token='</S>',
                           unk_token='<UNK>')

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    stream = Mapping(stream, PrefixSuffixStreamTransformer(sample_ratio=kwargs.get('train_sample_ratio', 1.)),
                     add_sources=('target_prefix', 'target_suffix'))

    stream = Mapping(stream, CopySourceAndTargetToMatchPrefixes(stream))

    # changing stream.produces_examples is a little hack which lets us use Unpack to flatten
    stream.produces_examples = False
    # flatten the stream back out into (source, target, target_prefix, target_suffix)
    stream = Unpack(stream)

    # Now make a very big batch that we can shuffle
    shuffle_batch_size = kwargs['shuffle_batch_size']
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(shuffle_batch_size)
                   )

    stream = ShuffleBatchTransformer(stream)

    # unpack it again
    stream = Unpack(stream)

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches)
                   )

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    # TODO: is it correct to blindly pad the target_prefix and the target_suffix?
    configurable_padding_args = {
        'suffix_length': kwargs.get('suffix_length', None),
        'truncate_sources': kwargs.get('truncate_sources', [])
    }
    logger.info('Training suffix length is: {}'.format(configurable_padding_args['suffix_length']))
    logger.info('I will mask the following sources after <suffix_length>: {}'.format(configurable_padding_args['truncate_sources']))
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1],
        mask_sources=('source', 'target', 'target_prefix', 'target_suffix'), **configurable_padding_args)

    return masked_stream, src_vocab, trg_vocab
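Note that this constructor reads shuffle_batch_size with kwargs['shuffle_batch_size'], so it must be passed explicitly. A hypothetical call (placeholder paths and values only):

imt_stream, src_vocab, trg_vocab = get_tr_stream_with_prefixes(
    src_vocab='vocab.src.pkl', trg_vocab='vocab.trg.pkl',
    src_data='train.src', trg_data='train.trg',
    src_vocab_size=30000, trg_vocab_size=30000,
    seq_len=50, batch_size=80, sort_k_batches=12,
    shuffle_batch_size=1000, train_sample_ratio=1.0)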
Beispiel #33
0
def get_tr_stream_with_topic_target(src_vocab, trg_vocab,topic_vocab_input,topic_vocab_output, src_data, trg_data,topical_data,
                  src_vocab_size=30000, trg_vocab_size=30000,trg_topic_vocab_size=2000,source_topic_vocab_size=2000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist

    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
    topic_vocab_input = cPickle.load(open(topic_vocab_input, 'rb'))
    # topic_vocab_output already has <UNK> and </S> in it
    topic_vocab_output = cPickle.load(open(topic_vocab_output, 'rb'))
    topic_binary_vocab = {}
    for k, v in topic_vocab_output.items():
        if k == '<UNK>':
            topic_binary_vocab[k] = 0
        else:
            topic_binary_vocab[k] = 1


    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)
    src_topic_input = TextFile([topical_data], topic_vocab_input, None, None, 'rt')
    trg_topic_dataset = TextFile([trg_data], topic_vocab_output, None)
    trg_topic_binary_dataset = TextFile([trg_data], topic_binary_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream(),
                    src_topic_input.get_example_stream(),
                    trg_topic_dataset.get_example_stream(),
                    trg_topic_binary_dataset.get_example_stream()],
                   ('source', 'target','source_topical','target_topic','target_binary_topic'))


    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))

    # Replace out-of-vocabulary tokens with the unk token
    # Note: the topical sources are not covered by this mapping; double-check.
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 src_topic_vocab_size=source_topic_vocab_size,
                                 trg_topic_vocab_size=trg_topic_vocab_size,
                                 unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1, source_topic_vocab_size - 1,
                 trg_topic_vocab_size - 1, trg_topic_vocab_size - 1])

    return masked_stream
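The binary topic vocabulary built above simply collapses every topic token except <UNK> to the id 1. On a toy topic vocabulary (made-up entries) the construction behaves like this:

toy_topic_vocab_output = {'<UNK>': 0, '</S>': 1, 'sports': 2, 'finance': 3}
toy_binary_vocab = {}
for k, v in toy_topic_vocab_output.items():
    toy_binary_vocab[k] = 0 if k == '<UNK>' else 1
# toy_binary_vocab == {'<UNK>': 0, '</S>': 1, 'sports': 1, 'finance': 1}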
def main(config, tr_stream, dev_stream, use_bokeh=False):
    print("~def main")

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    print("~sampling_input = tensor.lmatrix")


    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2)
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    print("~source_sentence_mask, target_sentence, target_sentence_mask")

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    print("~ComputationGraph")

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()


    print("~decoder.initialize()")



    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout, as in GroundHog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])



    print("~cg = apply_dropout")

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(decoder.state_init).get_params().values()
        cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff'])


    print("~cg = apply_noise")

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    print("~logger.info")



    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)
    print("~training_model")


    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]
    print("~every_n_batches=config")

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    sample = Sampler(model=search_model, data_stream=tr_stream,
                hook_samples=config['hook_samples'],
                every_n_batches=config['sampling_freq'],
                src_vocab_size=config['src_vocab_size'])

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append( sample )

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input, samples=samples, config=config,
                          model=search_model, data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot('Cs-En', channels=[['decoder_cost_cost']],
                 after_batch=True))

    sampling_fn = search_model.get_theano_function()

    print(" - - - - - - - - - - - - - - ")

    sort_k_batches = 12
    batch_size = 80
    seq_len = 50
    trg_ivocab = None
    src_vocab_size = config['src_vocab_size']
    trg_vocab_size = config['trg_vocab_size']
    unk_id = config['unk_id'] 

    src_vocab = config['src_vocab']
    trg_vocab = config['trg_vocab']
    src_vocab = ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
    if not trg_ivocab:
        trg_ivocab = {v: k for k, v in trg_vocab.items()}


    src_data = config['src_data']
    trg_data = config['trg_data']
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    inputstringfile = "inputstringfile.cs"
    input_dataset = TextFile([inputstringfile], src_vocab, None)

    stream = Merge([input_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))
    stream2 = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))
    stream3 = Mapping(stream2,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))
    stream4 = Batch(stream3,
                    iteration_scheme=ConstantScheme(
                        batch_size * sort_k_batches))
    stream5 = Mapping(stream4, SortMapping(_length))
    stream6 = Unpack(stream5)
    stream7 = Batch(
        stream6, iteration_scheme=ConstantScheme(batch_size))

    input_stream = DataStream(input_dataset)

    print("dev_stream : ", type(dev_stream))
    print("input_stream : ", type(input_stream))

    epochone = input_stream.get_epoch_iterator()
    vocab = input_stream.dataset.dictionary
    unk_sym = input_stream.dataset.unk_token
    eos_sym = input_stream.dataset.eos_token

    for i, line in enumerate(epochone):
        seq = oov_to_unk(
            line[0], config['src_vocab_size'], unk_id)
        input_ = numpy.tile(seq, (1, 1))

        print("seq : ", type(seq), seq)
        print("input_ : ", type(input_), input_, inspect.getmembers(input_))

        _1, outputs, _2, _3, costs = sampling_fn(input_)

        outputs = outputs.flatten()
        costs = costs.T

        print(" outputs : ", outputs, type(outputs))
        print("idx_to_word: ", idx_to_word(outputs, trg_ivocab))

    print(" - - - - - - - - - - - - - - "  )
Beispiel #35
0
class _too_long(object):
    # Class header reconstructed from how this predicate is used below.
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        return all([len(sentence) <= self.seq_len
                    for sentence in sentence_pair])

fi_vocab = config['src_vocab']
en_vocab = config['trg_vocab']
fi_file = config['src_data']
en_file = config['trg_data']

fi_dataset = TextFile([fi_file], cPickle.load(open(fi_vocab)), None)
en_dataset = TextFile([en_file], cPickle.load(open(en_vocab)), None)

stream = Merge([fi_dataset.get_example_stream(),
                en_dataset.get_example_stream()],
               ('source', 'target'))

stream = Filter(stream, predicate=_too_long(config['seq_len']))
stream = Mapping(stream, _oov_to_unk(
                 src_vocab_size=config['src_vocab_size'],
                 trg_vocab_size=config['trg_vocab_size'],
                 unk_id=config['unk_id']))
stream = Batch(stream,
               iteration_scheme=ConstantScheme(
                   config['batch_size']*config['sort_k_batches']))

stream = Mapping(stream, SortMapping(_length))
stream = Unpack(stream)
stream = Batch(stream, iteration_scheme=ConstantScheme(config['batch_size']))
Beispiel #36
0
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost,  batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in params.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, params=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            application=generator.readout.readout,
            name="output")(cg.variables)
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(
                param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition("after_batch", _is_nan),
                Plot(os.path.basename(save_path),
                     [[average_monitoring.record_name(cost)],
                      [average_monitoring.record_name(cost_per_character)]],
                     every_n_batches=10),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_param_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(input_.shape[1], samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n" if mode == "sample"
                       else "Enter the beam size\n")
            batch_size = int(input(message))

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
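This pipeline also uses several one-line helpers (_lower, _transpose, _filter_long, _is_nan) that the snippet omits. The sketches below are reconstructions based on how they are used; the length cutoff and the monitored log channel are assumptions:

import math


def _lower(s):
    return s.lower()


def _transpose(data):
    # Put the time dimension first by transposing every array in the batch.
    return tuple(array.T for array in data)


def _filter_long(data):
    # Assumed cutoff: keep only reasonably short character sequences.
    return len(data[0]) <= 100


def _is_nan(log):
    # Assumed channel name: stop training once the gradient norm becomes NaN.
    return math.isnan(log.current_row['total_gradient_norm'])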
Beispiel #37
0
class _too_long(object):
    # Class header reconstructed from usage below; the __init__ storing
    # self.seq_len is shown in Beispiel #35 above.
    def __call__(self, sentence_pair):
        return all(
            [len(sentence) <= self.seq_len for sentence in sentence_pair])


fi_vocab = config['src_vocab']
en_vocab = config['trg_vocab']
fi_file = config['src_data']
en_file = config['trg_data']

fi_dataset = TextFile([fi_file], cPickle.load(open(fi_vocab)), None)
en_dataset = TextFile([en_file], cPickle.load(open(en_vocab)), None)

stream = Merge(
    [fi_dataset.get_example_stream(),
     en_dataset.get_example_stream()], ('source', 'target'))

stream = Filter(stream, predicate=_too_long(config['seq_len']))
stream = Mapping(
    stream,
    _oov_to_unk(src_vocab_size=config['src_vocab_size'],
                trg_vocab_size=config['trg_vocab_size'],
                unk_id=config['unk_id']))
stream = Batch(stream,
               iteration_scheme=ConstantScheme(config['batch_size'] *
                                               config['sort_k_batches']))

stream = Mapping(stream, SortMapping(_length))
stream = Unpack(stream)
stream = Batch(stream, iteration_scheme=ConstantScheme(config['batch_size']))
Beispiel #38
0
def get_src_trg_stream(cg,
                       config,
                       src_datasets=None,
                       trg_datasets=None,
                       is_training=True,
                       src_vocabs=None,
                       trg_vocabs=None,
                       logprob_datasets=None):
    eid, did = p_(cg)
    if is_training:
        logger.info(' ... src:[{}] - [{}]'.format(eid,
                                                  src_datasets[cg].files[0]))
        logger.info(' ... trg:[{}] - [{}]'.format(did,
                                                  trg_datasets[cg].files[0]))
        stream = Merge([
            src_datasets[cg].get_example_stream(),
            trg_datasets[cg].get_example_stream()
        ], ('source', 'target'))
        stream = Filter(stream, predicate=_too_long(config['seq_len']))

        if 'min_seq_lens' in config and config['min_seq_lens'][cg] > 0:
            stream = Filter(stream,
                            predicate=_too_short(config['min_seq_lens'][cg]))

        stream = Mapping(
            stream,
            _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid],
                        trg_vocab_size=config['trg_vocab_sizes'][did],
                        unk_id=config['unk_id']))
        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(config['batch_sizes'][cg] *
                                            config['sort_k_batches']))

        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           config['batch_sizes'][cg]))
    else:  # logprob stream
        src_dataset = TextFile([logprob_datasets[cg][0]],
                               src_vocabs[p_(cg)[0]], None)
        trg_dataset = TextFile([logprob_datasets[cg][1]],
                               trg_vocabs[p_(cg)[1]], None)
        stream = Merge([
            src_dataset.get_example_stream(),
            trg_dataset.get_example_stream()
        ], ('source', 'target'))
        stream = Mapping(
            stream,
            _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid],
                        trg_vocab_size=config['trg_vocab_sizes'][did],
                        unk_id=config['unk_id']))
        bs = 100
        if 'log_prob_bs' in config:
            if isinstance(config['log_prob_bs'], dict):
                bs = config['log_prob_bs'][cg]
            else:
                bs = config['log_prob_bs']
        stream = Batch(stream, iteration_scheme=ConstantScheme(bs))

    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream,
        _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                       (2, 0, config['trg_eos_idxs'][did])]))
    return masked_stream
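The optional minimum-length filter above uses a _too_short predicate, the mirror image of the _too_long class shown elsewhere in this file. A hedged sketch assuming the same calling convention:

class _too_short(object):
    def __init__(self, min_seq_len=1):
        self.min_seq_len = min_seq_len

    def __call__(self, sentence_pair):
        # Keep only pairs in which every sentence reaches the minimum length.
        return all([len(sentence) >= self.min_seq_len
                    for sentence in sentence_pair])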
Beispiel #39
0
def main(mode, save_path, steps, num_batches, load_params):
    chars = (list(string.ascii_uppercase) + list(range(10)) +
             [' ', '.', ',', '\'', '"', '!', '?', '<UNK>'])
    char_to_ind = {char: i for i, char in enumerate(chars)}
    ind_to_char = {v: k for k, v in char_to_ind.iteritems()}

    train_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_train'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')
    valid_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_valid'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')

    vocab_size = len(char_to_ind)
    logger.info('Dictionary size: {}'.format(vocab_size))
    if mode == 'continue':
        continue_training(save_path)
        return
    elif mode == "sample":
        main_loop = load(open(save_path, "rb"))
        generator = main_loop.model.get_top_bricks()[-1]

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]
        print("".join([ind_to_char[s] for s in outputs]))

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()

        trans_freqs = numpy.zeros((vocab_size, vocab_size), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        return

    # Experiment configuration
    batch_size = 20
    dim = 650
    feedback_dim = 650

    valid_stream = valid_dataset.get_example_stream()
    valid_stream = Batch(valid_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    valid_stream = Padding(valid_stream)
    valid_stream = Mapping(valid_stream, _transpose)

    # Build the bricks and initialize them

    transition = GatedRecurrent(name="transition", dim=dim,
                                activation=Tanh())
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size, source_names=transition.apply.states,
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(
                    vocab_size, feedback_dim, name='feedback'),
                name="readout"),
        transition,
        weights_init=Uniform(std=0.04), biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
    transition.push_initialization_config()
    generator.initialize()

    # Build the cost computation graph.
    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    cost_matrix = generator.cost_matrix(
        features, mask=features_mask)
    batch_cost = cost_matrix.sum()
    cost = aggregation.mean(
        batch_cost,
        features.shape[1])
    cost.name = "sequence_log_likelihood"
    char_cost = aggregation.mean(
        batch_cost, features_mask.sum())
    char_cost.name = 'character_log_likelihood'
    ppl = 2 ** (cost / numpy.log(2))
    ppl.name = 'ppl'
    bits_per_char = char_cost / tensor.log(2)
    bits_per_char.name = 'bits_per_char'
    length = features.shape[0]
    length.name = 'length'

    model = Model(batch_cost)
    if load_params:
        params = load_parameter_values(save_path)
        model.set_parameter_values(params)

    if mode == "train":
        # Give an idea of what's going on.
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in Selector(generator).get_parameters().items()],
                        width=120))

        train_stream = train_dataset.get_example_stream()
        train_stream = Mapping(train_stream, _truncate)
        train_stream = Batch(train_stream,
                             iteration_scheme=ConstantScheme(batch_size))
        train_stream = Padding(train_stream)
        train_stream = Mapping(train_stream, _transpose)

        parameters = model.get_parameter_dict()
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(parameters.values())
        algorithm = GradientDescent(
            cost=batch_cost,
            parameters=parameters.values(),
            step_rule=CompositeRule([
                StepClipping(1000.),
                AdaDelta(epsilon=1e-8),
                # Restrict(VariableClipping(1.0, axis=0), maxnorm_subjects),
            ]))
        ft = features[:6, 0]
        ft.name = 'feature_example'

        observables = [cost, ppl, char_cost, length, bits_per_char]
        for name, param in parameters.items():
            num_elements = numpy.product(param.get_value().shape)
            norm = param.norm(2) / num_elements ** 0.5
            grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
            step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
            stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
            stats.name = name + '_stats'
            observables.append(stats)
        track_the_best_bpc = TrackTheBest('valid_bits_per_char')
        root_path, extension = os.path.splitext(save_path)

        this_step_monitoring = TrainingDataMonitoring(
            observables + [ft], prefix="this_step", after_batch=True)
        average_monitoring = TrainingDataMonitoring(
            observables + [algorithm.total_step_norm,
                           algorithm.total_gradient_norm], 
            prefix="average",
            every_n_batches=10)
        valid_monitoring = DataStreamMonitoring(
            observables, prefix="valid",
            every_n_batches=1500, before_training=False,
            data_stream=valid_stream)
        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=train_stream,
            model=model,
            extensions=[
                this_step_monitoring,
                average_monitoring,
                valid_monitoring,
                track_the_best_bpc,
                Checkpoint(save_path, ),
                Checkpoint(save_path,
                           every_n_batches=500,
                           save_separately=["model", "log"],
                           use_cpickle=True)
                    .add_condition(
                    ['after_epoch'],
                    OnLogRecord(track_the_best_bpc.notification_name),
                    (root_path + "_best" + extension,)),
                Timing(after_batch=True),
                Printing(every_n_batches=10),
                Plot(root_path,
                     [[average_monitoring.record_name(cost),
                       valid_monitoring.record_name(cost)],
                      [average_monitoring.record_name(algorithm.total_step_norm)],
                      [average_monitoring.record_name(algorithm.total_gradient_norm)],
                      [average_monitoring.record_name(ppl),
                       valid_monitoring.record_name(ppl)],
                      [average_monitoring.record_name(char_cost),
                       valid_monitoring.record_name(char_cost)],
                      [average_monitoring.record_name(bits_per_char),
                       valid_monitoring.record_name(bits_per_char)]],
                     every_n_batches=10)
            ])
        main_loop.run()

    elif mode == 'evaluate':
        with open('/data/lisatmp3/serdyuk/wsj_lms/lms/wsj_trigram_with_initial_eos/lexicon.txt') as f:
            raw_words = [line.split()[1:-1] for line in f.readlines()]
            words = [[char_to_ind[c] if c in char_to_ind else char_to_ind['<UNK>'] for c in w] 
                     for w in raw_words]
        max_word_length = max([len(w) for w in words])
        
        initial_states = tensor.matrix('init_states')
        cost_matrix_step = generator.cost_matrix(features, mask=features_mask,
                                                 states=initial_states)
        cg = ComputationGraph(cost_matrix_step)
        states = cg.auxiliary_variables[-2]
        compute_cost = theano.function([features, features_mask, initial_states], 
                                       [cost_matrix_step.sum(axis=0), states])

        cost_matrix = generator.cost_matrix(features, mask=features_mask)
        initial_cg = ComputationGraph(cost_matrix)
        initial_states = initial_cg.auxiliary_variables[-2]

        total_word_cost = 0
        num_words = 0
        examples = numpy.zeros((max_word_length + 1, len(words)),
                               dtype='int64')
        all_masks = numpy.zeros((max_word_length + 1, len(words)),
                                dtype=floatX)

        for i, word in enumerate(words):
            examples[:len(word), i] = word
            all_masks[:len(word), i] = 1.

        single_space = numpy.array([char_to_ind[' ']])[:, None]

        for batch in valid_stream.get_epoch_iterator():
            for example, mask in equizip(batch[0].T, batch[1].T):
                example = example[:(mask.sum())]
                spc_inds = list(numpy.where(example == char_to_ind[" "])[0])
                state = generator.transition.transition.initial_states_.get_value()[None, :]
                for i, j in equizip([-1] + spc_inds, spc_inds + [-1]):
                    word = example[(i+1):j, None]
                    word_cost, states = compute_cost(
                        word, numpy.ones_like(word, dtype=floatX), state)
                    state = states[-1]

                    costs = numpy.exp(-compute_cost(
                        examples, all_masks, numpy.tile(state, [examples.shape[1], 1]))[0])

                    _, space_states = compute_cost(
                        single_space, numpy.ones_like(single_space, dtype=floatX), state)
                    state = space_states[-1]

                    word_prob = numpy.exp(-word_cost)
                    total_word_cost += word_cost + numpy.log(numpy.sum(costs))
                    num_words += 1
                    print(word_prob)
                    print(numpy.sum(costs))
                    print("Average cost", total_word_cost / num_words)
                    print("PPL", numpy.exp(total_word_cost / num_words))

        print("Word-level perplexity")
        print(total_word_cost / num_words)
    else:
        assert False
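The training branch maps the example stream through a _truncate helper that is not shown; presumably it clips overly long character sequences before batching. A sketch under that assumption (the cutoff below is a placeholder):

def _truncate(data, max_length=200):
    # Clip every source in the example to at most max_length characters.
    return tuple(array[:max_length] for array in data)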
Beispiel #40
0
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = chars.shape[1].copy(name="batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in parameters.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            applications=[generator.readout.readout],
            name_regex="output")(cg.variables)
        (activations,) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = chars.shape[0].copy(name="max_length")
        cost_per_character = aggregation.mean(
            batch_cost, batch_size * max_length).copy(
                name="character_log_likelihood")
        min_energy = energies.min().copy(name="min_energy")
        max_energy = energies.max().copy(name="max_energy")
        mean_activation = abs(activations).mean().copy(
                name="mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, parameter in parameters.items():
            observables.append(parameter.norm(2).copy(name + "_norm"))
            observables.append(algorithm.gradients[parameter].norm(2).copy(
                name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_parameter_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    applications=[reverser.generator.generate], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            try:
                line = input("Enter a sentence\n")
                message = ("Enter the number of samples\n" if mode == "sample"
                        else "Enter the beam size\n")
                batch_size = int(input(message))
            except EOFError:
                break
            except Exception:
                traceback.print_exc()
                continue

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = chars.shape[1].copy(name="batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in parameters.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            applications=[generator.readout.readout],
            name_regex="output")(cg.variables)
        (activations,) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = chars.shape[0].copy(name="max_length")
        cost_per_character = aggregation.mean(
            batch_cost, batch_size * max_length).copy(
                name="character_log_likelihood")
        min_energy = energies.min().copy(name="min_energy")
        max_energy = energies.max().copy(name="max_energy")
        mean_activation = abs(activations).mean().copy(
            name="mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, parameter in parameters.items():
            observables.append(parameter.norm(2).copy(name=name + "_norm"))
            observables.append(algorithm.gradients[parameter].norm(2).copy(
                name=name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()