Example #1
def get_test_stream_withContext_withPosTag_grd(test_ctx_datas=None,
                                               test_posTag_datas=None,
                                               test_set_source=None,
                                               test_set_target=None,
                                               src_vocab=None,
                                               src_vocab_size=30000,
                                               trg_vocab=None,
                                               trg_vocab_size=30000,
                                               unk_id=1,
                                               ctx_num=3,
                                               batch_size=80,
                                               **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    masked_stream = None
    if test_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=src_vocab_size - 1,
                                           unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
            trg_vocab, dict) else cPickle.load(open(trg_vocab, 'rb')),
                                           bos_idx=0,
                                           eos_idx=trg_vocab_size - 1,
                                           unk_idx=unk_id)
        print test_set_source, type(src_vocab)
        # Get text files from both source and target
        ctx_datasets = []
        posTag_datasets = []
        for i in range(ctx_num):
            ctx_datasets.append(TextFile([test_ctx_datas[i]], src_vocab, None))
            posTag_datasets.append(
                TextFile([test_posTag_datas[i]], src_vocab, None))
        posTag_datasets.append(
            TextFile([test_posTag_datas[ctx_num]], src_vocab, None))
        src_dataset = TextFile([test_set_source], src_vocab, None)
        trg_dataset = TextFile([test_set_target], trg_vocab, None)
        # Merge them to get a source, target pair
        dev_stream = Merge([i.get_example_stream() for i in ctx_datasets] +
                           [i.get_example_stream()
                            for i in posTag_datasets] + [
                                src_dataset.get_example_stream(),
                                trg_dataset.get_example_stream()
                            ],
                           tuple('context_' + str(i) for i in range(ctx_num)) +
                           tuple('context_posTag_' + str(i)
                                 for i in range(ctx_num)) +
                           ('source_posTag', 'source', 'target'))

        stream = Mapping(
            dev_stream,
            _oov_to_unk_posTag(ctx_num=ctx_num,
                               src_vocab_size=src_vocab_size,
                               trg_vocab_size=trg_vocab_size,
                               unk_id=unk_id))

        # Build a batched version of stream to read k batches ahead
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(batch_size=batch_size))
        masked_stream = PaddingWithEOSContext(
            stream,
            [src_vocab_size - 1 for i in range(2 * ctx_num + 2)] +
            [trg_vocab_size - 1])

    return masked_stream
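
A minimal usage sketch, assuming three context files, four POS-tag files (one extra for the source side, as the loop above expects), and pickled vocabularies; every path below is a placeholder:

# Placeholder file names -- adjust to your own data layout.
test_stream = get_test_stream_withContext_withPosTag_grd(
    test_ctx_datas=['ctx.0.txt', 'ctx.1.txt', 'ctx.2.txt'],
    test_posTag_datas=['pos.0.txt', 'pos.1.txt', 'pos.2.txt', 'pos.src.txt'],
    test_set_source='test.src',
    test_set_target='test.trg',
    src_vocab='vocab.src.pkl',
    trg_vocab='vocab.trg.pkl',
    ctx_num=3,
    batch_size=80)
for batch in test_stream.get_epoch_iterator():
    pass  # padded context/POS/source/target batches (plus masks, presumably)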
Example #2
    def get_stream(self,
                   part,
                   batches=True,
                   shuffle=True,
                   add_sources=(),
                   num_examples=None,
                   rng=None,
                   seed=None):
        dataset = self.get_dataset(part, add_sources=add_sources)
        iteration_scheme = None
        if self.use_iteration_scheme:
            if num_examples is None:
                num_examples = dataset.num_examples
            if shuffle:
                iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
            else:
                iteration_scheme = SequentialExampleScheme(num_examples)
        stream = DataStream(dataset, iteration_scheme=iteration_scheme)

        # Transformations before rearrangement
        labels_source = self.sources_map['labels']
        if self.add_eos:
            stream = _AddLabel(stream,
                               self.eos_label,
                               which_sources=[labels_source])
        if self.add_bos:
            if self.bos_label is None:
                raise Exception('No bos label given')
            stream = _AddLabel(stream,
                               self.bos_label,
                               append=False,
                               times=self.add_bos,
                               which_sources=[labels_source])
        if self.clip_length:
            stream = _Clip(stream,
                           self.clip_length,
                           force_eos=self.eos_label
                           if self.force_eos_when_clipping else None,
                           which_sources=[labels_source])

        # More efficient packing of examples in batches
        if self.sort_k_batches and batches:
            stream = Batch(stream,
                           iteration_scheme=ConstantScheme(
                               self.batch_size * self.sort_k_batches))
            stream = Mapping(stream, SortMapping(_Length(index=0)))
            stream = Unpack(stream)

        stream = Rearrange(
            stream,
            dict_subset(self.sources_map,
                        self.default_sources + list(add_sources)))

        # Transformations after rearrangement
        if self.corrupt_sources:
            # Can only corrupt sources with the same alphabet
            # as labels
            for source, prob in zip(self.corrupt_sources['names'],
                                    self.corrupt_sources['probs']):
                stream = _Corrupt(stream,
                                  prob,
                                  self.token_map(source),
                                  self.eos_label,
                                  which_sources=[source])
        if self.max_length and part == 'train':
            # Filtering by the maximum length is only done
            # for the training set.
            self.length_filter = _LengthFilter(indices=[
                i for i, source in enumerate(stream.sources)
                if source in self.filter_by
            ],
                                               max_length=self.max_length)
            stream = Filter(stream, self.length_filter)
        stream = ForceFloatX(stream)

        if not batches:
            return stream

        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                            else self.validation_batch_size))
        stream = Padding(stream)
        stream = Mapping(stream, switch_first_two_axes)
        stream = ForceCContiguous(stream)
        return stream
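
A hedged call sketch, assuming `data` is an instance of the dataset wrapper class that defines get_stream() above:

# 'data' stands for an instance of the surrounding wrapper class.
train_stream = data.get_stream('train', batches=True, shuffle=True)
valid_examples = data.get_stream('valid', batches=False, shuffle=False)
for batch in train_stream.get_epoch_iterator(as_dict=True):
    break  # padded batches with per-source masks, time-major after the transpose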
Example #3
 def setUp(self):
     self.stream = Batch(
         DataStream(IterableDataset(range(100))), ConstantScheme(11))
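
For reference, a self-contained sketch of what this fixture yields: nine batches of 11 examples and a final batch of 1, because Batch defaults to strictness=0 and keeps the smaller remainder:

from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import Batch

stream = Batch(DataStream(IterableDataset(range(100))), ConstantScheme(11))
sizes = [len(batch[0]) for batch in stream.get_epoch_iterator()]
print(sizes)  # [11, 11, 11, 11, 11, 11, 11, 11, 11, 1]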
Example #4
def get_tr_stream(src_vocab,
                  trg_vocab,
                  src_data,
                  trg_data,
                  src_vocab_size=30000,
                  trg_vocab_size=30000,
                  unk_id=1,
                  seq_len=50,
                  batch_size=80,
                  sort_k_batches=12,
                  bos_token=None,
                  **kwargs):
    """Prepares the training data stream."""
    if type(bos_token) is str:
        bos_token = bos_token.decode('utf8')

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(src_vocab if isinstance(
        src_vocab, dict) else cPickle.load(open(src_vocab)),
                                       bos_idx=0,
                                       eos_idx=src_vocab_size - 1,
                                       unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
        trg_vocab, dict) else cPickle.load(open(trg_vocab)),
                                       bos_idx=0,
                                       eos_idx=trg_vocab_size - 1,
                                       unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data],
                           src_vocab,
                           bos_token=bos_token,
                           eos_token=u'</S>',
                           unk_token=u'<UNK>',
                           encoding='utf8')
    trg_dataset = TextFile([trg_data],
                           trg_vocab,
                           bos_token=bos_token,
                           eos_token=u'</S>',
                           unk_token=u'<UNK>',
                           encoding='utf8')

    # Merge them to get a source, target pair
    stream = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(
        stream,
        _oov_to_unk(src_vocab_size=src_vocab_size,
                    trg_vocab_size=trg_vocab_size,
                    unk_id=unk_id))

    # Now make a very big batch that we can shuffle
    shuffle_batch_size = kwargs.get('shuffle_batch_size', 1000)
    stream = Batch(stream, iteration_scheme=ConstantScheme(shuffle_batch_size))

    stream = ShuffleBatchTransformer(stream)

    # unpack it again
    stream = Unpack(stream)
    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(stream,
                                   [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream, src_vocab, trg_vocab
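
A usage sketch with placeholder paths; the vocabularies may be passed either as dicts or as paths to pickled dicts, as the isinstance checks above allow:

# Placeholder paths; replace with real vocabulary pickles and parallel corpora.
train_stream, src_vocab, trg_vocab = get_tr_stream(
    src_vocab='vocab.src.pkl',
    trg_vocab='vocab.trg.pkl',
    src_data='train.src',
    trg_data='train.trg',
    src_vocab_size=30000,
    trg_vocab_size=30000,
    seq_len=50,
    batch_size=80,
    sort_k_batches=12,
    bos_token='<S>')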
Example #5
    data_paths = [
        '/home/andrewsm/SEER/external/CoNLL2003/ner/eng.train',
    ]  # 3.3Mb file

## Achieved result: 50 epochs of (GPU) training on eng.train => testb overall scores:
## accuracy:  96.42%; precision:  76.95%; recall:  80.26%; FB1:  78.57

dataset = CoNLLTextFile(data_paths,
                        dictionary=word2code,
                        unknown_token='<UNK>')

data_stream = DataStream(dataset)
data_stream = Filter(data_stream, _filter_long)
#data_stream = Mapping(data_stream, reverse_words, add_sources=("targets",))

data_stream = Batch(data_stream,
                    iteration_scheme=ConstantScheme(mini_batch_size))

#data_stream = Padding(data_stream, mask_sources=('tokens',))           # Adds a mask field to this stream field, type='floatX'
data_stream = Padding(
    data_stream,
)  # Adds mask fields to all of this stream's fields, type='floatX'
data_stream = Mapping(
    data_stream, _transpose
)  # Flips stream so that sentences run down columns, batches along rows (strangely)

if False:  # print sample for debugging Dataset / DataStream component
    #t=0
    max_len = 0
    for i, data in enumerate(data_stream.get_epoch_iterator()):
        #print(i)
        #t=t + data[4].sum() + data[0].shape[1]
Example #6
 def try_strict(strictness):
     return list(
         Batch(stream, ConstantScheme(2),
               strictness=strictness).get_epoch_iterator())
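
For context, Batch's strictness argument controls the fate of a final, incomplete batch: 0 yields it as a smaller batch, 1 drops it silently, and 2 raises a ValueError. A small self-contained sketch of the same check:

from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import Batch

def batches_with(strictness):
    stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))
    return list(Batch(stream, ConstantScheme(2),
                      strictness=strictness).get_epoch_iterator())

print(len(batches_with(0)))  # 3 -- the trailing [5] is kept as a short batch
print(len(batches_with(1)))  # 2 -- the trailing [5] is silently dropped
# batches_with(2) raises ValueError: five examples do not split evenly into pairs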
Example #7
def get_src_trg_stream(cg,
                       config,
                       src_datasets=None,
                       trg_datasets=None,
                       is_training=True,
                       src_vocabs=None,
                       trg_vocabs=None,
                       logprob_datasets=None):
    eid, did = p_(cg)
    if is_training:
        logger.info(' ... src:[{}] - [{}]'.format(eid,
                                                  src_datasets[cg].files[0]))
        logger.info(' ... trg:[{}] - [{}]'.format(did,
                                                  trg_datasets[cg].files[0]))
        stream = Merge([
            src_datasets[cg].get_example_stream(),
            trg_datasets[cg].get_example_stream()
        ], ('source', 'target'))
        stream = Filter(stream, predicate=_too_long(config['seq_len']))

        if 'min_seq_lens' in config and config['min_seq_lens'][cg] > 0:
            stream = Filter(stream,
                            predicate=_too_short(config['min_seq_lens'][cg]))

        stream = Mapping(
            stream,
            _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid],
                        trg_vocab_size=config['trg_vocab_sizes'][did],
                        unk_id=config['unk_id']))
        stream = Batch(
            stream,
            iteration_scheme=ConstantScheme(config['batch_sizes'][cg] *
                                            config['sort_k_batches']))

        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           config['batch_sizes'][cg]))
    else:  # logprob stream
        src_dataset = TextFile([logprob_datasets[cg][0]],
                               src_vocabs[p_(cg)[0]], None)
        trg_dataset = TextFile([logprob_datasets[cg][1]],
                               trg_vocabs[p_(cg)[1]], None)
        stream = Merge([
            src_dataset.get_example_stream(),
            trg_dataset.get_example_stream()
        ], ('source', 'target'))
        stream = Mapping(
            stream,
            _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid],
                        trg_vocab_size=config['trg_vocab_sizes'][did],
                        unk_id=config['unk_id']))
        bs = 100
        if 'log_prob_bs' in config:
            if isinstance(config['log_prob_bs'], dict):
                bs = config['log_prob_bs'][cg]
            else:
                bs = config['log_prob_bs']
        stream = Batch(stream, iteration_scheme=ConstantScheme(bs))

    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream,
        _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                       (2, 0, config['trg_eos_idxs'][did])]))
    return masked_stream
Example #8
def get_tr_stream_with_context_features(src_vocab, trg_vocab, src_data, trg_data, context_features,
                                        src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                                        seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    def _get_np_array(filename):
        return numpy.load(filename)['arr_0']

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict) else
        cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(seq_len=seq_len))


    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))

    # now add the source with the image features
    # create the image datastream (iterate over a file line-by-line)
    train_features = _get_np_array(context_features)
    train_feature_dataset = IterableDataset(train_features)
    train_image_stream = DataStream(train_feature_dataset)

    stream = Merge([stream, train_image_stream], ('source', 'target', 'initial_context'))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size*sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1], mask_sources=('source', 'target'))

    return masked_stream, src_vocab, trg_vocab
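
A hedged call sketch; context_features is expected to be an .npz file whose 'arr_0' entry holds one feature vector per training sentence, and every path below is a placeholder:

# Placeholder paths; the .npz is assumed to come from numpy.savez(features_array).
train_stream, src_vocab, trg_vocab = get_tr_stream_with_context_features(
    src_vocab='vocab.src.pkl',
    trg_vocab='vocab.trg.pkl',
    src_data='train.src',
    trg_data='train.trg',
    context_features='train_features.npz',
    batch_size=80,
    sort_k_batches=12)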
Example #9
def get_stream(source,
               target,
               source_input_dict,
               target_label_dict,
               batch_size,
               buffer_multiplier=100,
               input_token_level='word',
               n_input_tokens=0,
               n_labels=0,
               reverse_labels=False,
               max_input_length=None,
               max_label_length=None,
               pad_labels=True,
               is_sort=True):
    """Returns a stream over sentence pairs.

    Parameters
    ----------
    source : list
        A list of files to read source languages from.
    target : list
        A list of corresponding files in the target language.
    source_input_dict : str
        Path to a tab-delimited text file whose last column contains the
        vocabulary.
    target_label_dict : str
        See `source_input_dict`.
    batch_size : int
        The minibatch size.
    buffer_multiplier : int
        The number of batches to load, concatenate, sort by length of
        source sentence, and split again; this makes batches more uniform
        in their sentence length and hence more computationally efficient.
    n_input_tokens : int
        The number of tokens in the input vocabulary. Pass 0 (default) to
        use the entire vocabulary.
    n_labels : int
        See `n_input_tokens`.

    """
    if len(source) != len(target):
        raise ValueError("number of source and target files don't match")

    # Read the dictionaries
    dicts = [
        load_dict(source_input_dict, dict_size=n_input_tokens),
        load_dict(target_label_dict,
                  dict_size=n_labels,
                  reverse=reverse_labels,
                  include_unk=False)
    ]

    # Open the two sets of files and merge them
    streams = [
        TextFile(source,
                 dicts[0],
                 level=input_token_level,
                 bos_token=None,
                 eos_token=EOS_TOKEN,
                 encoding='utf-8').get_example_stream(),
        TextFile(target,
                 dicts[1],
                 level='word',
                 bos_token=None,
                 unk_token=None,
                 eos_token=EOS_TOKEN,
                 encoding='utf-8').get_example_stream()
    ]
    merged = Merge(streams, ('source_input_tokens', 'target_labels'))
    if reverse_labels:
        merged = SortLabels(merged)

    # Filter sentence lengths
    if max_input_length or max_label_length:

        def filter_pair(pair):
            src_input_tokens, trg_labels = pair
            src_input_ok = (not max_input_length) or \
                len(src_input_tokens) <= (max_input_length + 1)
            trg_label_ok = (not max_label_length) or \
                len(trg_labels) <= (max_label_length + 1)

            return src_input_ok and trg_label_ok

        merged = Filter(merged, filter_pair)

    # Batches of approximately uniform size
    large_batches = Batch(merged,
                          iteration_scheme=ConstantScheme(batch_size *
                                                          buffer_multiplier))
    # sorted_batches = Mapping(large_batches, SortMapping(_source_length))
    # batches = Cache(sorted_batches, ConstantScheme(batch_size))
    # shuffled_batches = Shuffle(batches, buffer_multiplier)
    # masked_batches = Padding(shuffled_batches,
    #                          mask_sources=('source_chars', 'target_labels'))
    if is_sort:
        sorted_batches = Mapping(large_batches, SortMapping(_source_length))
    else:
        sorted_batches = large_batches
    batches = Cache(sorted_batches, ConstantScheme(batch_size))
    mask_sources = ('source_input_tokens', 'target_labels')
    masked_batches = Padding(batches, mask_sources=mask_sources)

    return masked_batches
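
A hedged call sketch; load_dict and EOS_TOKEN come from the surrounding module, and the file names are placeholders:

# Placeholder file names; the dictionaries are the tab-delimited files
# described in the docstring.
batches = get_stream(source=['train.tokens.txt'],
                     target=['train.labels.txt'],
                     source_input_dict='tokens.dict.tsv',
                     target_label_dict='labels.dict.tsv',
                     batch_size=32,
                     max_input_length=100,
                     max_label_length=100)
for batch in batches.get_epoch_iterator(as_dict=True):
    break  # keys: source_input_tokens, target_labels and their *_mask arrays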
Example #10
def get_tr_stream(src_vocab,
                  trg_vocab,
                  src_data,
                  trg_data,
                  dict_data,
                  src_vocab_size=30000,
                  trg_vocab_size=30000,
                  unk_id=1,
                  seq_len=50,
                  batch_size=80,
                  sort_k_batches=12,
                  **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    '''
    actual_src_vocab_num = len(src_vocab)
    actual_trg_vocab_num = len(trg_vocab)
    src_vocab = ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0,
        eos_idx=(actual_src_vocab_num - 1)
        if actual_src_vocab_num - 3 < src_vocab_size
        else (src_vocab_size + 3 - 1),
        unk_idx=unk_id)
    trg_vocab = ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0,
        eos_idx=(actual_trg_vocab_num - 1)
        if actual_trg_vocab_num - 3 < trg_vocab_size
        else (trg_vocab_size + 3 - 1),
        unk_idx=unk_id)
    '''

    src_vocab = ensure_special_tokens(src_vocab if isinstance(src_vocab, dict)
                                      else cPickle.load(open(src_vocab)),
                                      bos_idx=0,
                                      eos_idx=src_vocab_size - 1,
                                      unk_idx=unk_id)
    trg_vocab = ensure_special_tokens(trg_vocab if isinstance(trg_vocab, dict)
                                      else cPickle.load(open(trg_vocab)),
                                      bos_idx=0,
                                      eos_idx=trg_vocab_size - 1,
                                      unk_idx=unk_id)

    # for example:
    # source: 第五 章 罚则
    # target: chapter v penalty regulations
    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab)
    trg_dataset = TextFile([trg_data], trg_vocab)
    dict_dataset = TextFile([dict_data], trg_vocab)
    # for data in DataStream(src_dataset).get_epoch_iterator():
    #    print(data)     # looks like: ([0, 1649, 1764, 7458, 29999],)

    # Merge them to get a source, target pair
    stream = Merge([
        src_dataset.get_example_stream(),
        trg_dataset.get_example_stream(),
        dict_dataset.get_example_stream()
    ], ('source', 'target',
        'dict'))  # data_stream.sources == ('source', 'target', 'dict')
    '''
    print 'init \n'
    num_before_filter = 0
    for data in stream.get_epoch_iterator():
        num_before_filter = num_before_filter + 1
        # print(data)
    '''
    # looks like: ([0, 1649, 1764, 7458, 29999], [0, 2662, 9329, 968, 200, 29999])

    # Filter sequences that are too long.
    # Neither the source nor the target sentence may exceed seq_len; the length
    # includes the start symbol <s> and the end symbol </s>, so the actual
    # sentence length cannot exceed (seq_len - 2).
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))
    '''
    num_after_filter = 0
    # print 'after filter ... \n'
    for data in stream.get_epoch_iterator():
        num_after_filter = num_after_filter + 1
        # print(data)

    logger.info('\tby filtering, sentence-pairs from {} to {}.'.format(num_before_filter, num_after_filter))
    logger.info('\tfilter {} sentence-pairs whose source or target sentence exceeds {} words'.format(
        (num_before_filter - num_after_filter), seq_len))
    '''

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 unk_id=unk_id))  # possibly not needed here
    '''
    print 'after mapping unk ...'
    for data in stream.get_epoch_iterator():
        print(data)
    '''

    # still looks like: ([0, 1649, 1764, 7458, 29999], [0, 2662, 9329, 968, 200, 29999])
    # Build a batched version of the stream to read k batches ahead.
    # We do not sort the whole training set at once: it is first split into
    # blocks of (batch_size * sort_k_batches) sentence pairs, and sorting is
    # done within each block only.
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))
    '''
    print 'after sorted batch ... '
    for data in stream.get_epoch_iterator():
        print(data)
    '''

    # Sort all samples in the read-ahead block by target-sentence length.
    # Sorting within each (batch_size * sort_k_batches) block instead of the
    # whole training set keeps this step fast.
    stream = Mapping(stream, SortMapping(_length))
    '''
    print 'after sort ... '
    for data in stream.get_epoch_iterator():
        print(data)
    '''

    # Convert it into a stream again
    stream = Unpack(stream)
    '''
    print 'after unpack ... '
    for data in stream.get_epoch_iterator():
        print(data)
    '''
    # still looks like: ([0, 1649, 1764, 7458, 29999], [0, 2662, 9329, 968, 200, 29999])

    # Construct batches from the stream with the specified batch size
    # (the final, smaller remainder batch is kept: Batch defaults to strictness=0)
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # after sort, each batch has batch_size sentence pairs
    '''
    print 'after final batch ... '
    i = 0
    for data in stream.get_epoch_iterator():
        i = i + 1
        print(data)
    print 'batchs: ', i
    '''

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream,
        bos_idx=[0, 0, 0],
        eos_idx=[src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1])
    # print 'after padding with mask ...'
    return masked_stream
Example #11
 def test_adds_batch_to_axis_labels(self):
     stream = DataStream(
         IterableDataset({'features': [1, 2, 3, 4, 5]},
                         axis_labels={'features': ('index', )}))
     transformer = Batch(stream, ConstantScheme(2), strictness=0)
     assert_equal(transformer.axis_labels, {'features': ('batch', 'index')})
Example #12
 def get_stream(self,
                part,
                batch_size=None,
                shuffle=False,
                max_length=None,
                raw_text=False,
                q_ids=False,
                seed=None,
                dataset=None):
     if not seed:
         seed = fuel.config.default_seed
     rng = numpy.random.RandomState(seed)
     if not dataset:
         dataset = self.get_dataset(part)
     if shuffle:
         stream = DataStream(dataset,
                             iteration_scheme=ShuffledExampleScheme(
                                 dataset.num_examples, rng=rng))
     else:
         stream = dataset.get_example_stream()
     if not q_ids:
         stream = FilterSources(
             stream,
             [source for source in dataset.sources if source != 'q_ids'])
     else:
         stream = SourcewiseMapping(stream,
                                    _str2vec,
                                     which_sources=('q_ids', ))
     stream = PutTextTransfomer(stream, dataset, raw_text=True)
     # <eos> is added for two purposes: to serve a sentinel for coattention,
     # and also to ensure the answer span ends at a token
     eos = self.vocab.EOS
     stream = SourcewiseMapping(stream,
                                functools.partial(add_eos, eos),
                                which_sources=('contexts', 'questions'))
     stream = Mapping(stream,
                      functools.partial(select_random_answer, rng),
                      mapping_accepts=dict)
     if not batch_size:
         if self._retrieval:
             raise NotImplementedError()
         return stream
     if raw_text:
         stream = Mapping(stream,
                          keep_text,
                          mapping_accepts=dict,
                          add_sources=('contexts_text', 'questions_text'))
     stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
     if self._retrieval:
         stream = Mapping(stream,
                          functools.partial(retrieve_and_pad_squad,
                                            self._retrieval),
                          mapping_accepts=dict,
                          add_sources=('defs', 'def_mask',
                                       'contexts_def_map',
                                       'questions_def_map'))
     stream = SourcewiseMapping(stream,
                                functools.partial(digitize, self.vocab),
                                which_sources=('contexts', 'questions'))
     stream = Padding(stream,
                      mask_sources=['contexts', 'questions'] +
                      (['contexts_text'] if raw_text else []))
     return stream
Example #13
def get_stream(source,
               target,
               source_dict,
               target_dict,
               batch_size,
               buffer_multiplier=100,
               n_words_source=0,
               n_words_target=0,
               max_src_length=None,
               max_trg_length=None):
    """Returns a stream over sentence pairs.

    Parameters
    ----------
    source : list
        A list of files to read source languages from.
    target : list
        A list of corresponding files in the target language.
    source_dict : str
        Path to a tab-delimited text file whose last column contains the
        vocabulary.
    target_dict : str
        See `source_dict`.
    batch_size : int
        The minibatch size.
    buffer_multiplier : int
        The number of batches to load, concatenate, sort by length of
        source sentence, and split again; this makes batches more uniform
        in their sentence length and hence more computationally efficient.
    n_words_source : int
        The number of words in the source vocabulary. Pass 0 (default) to
        use the entire vocabulary.
    n_words_target : int
        See `n_words_source`.

    """
    if len(source) != len(target):
        raise ValueError("number of source and target files don't match")

    # Read the dictionaries
    dicts = [
        load_dict(source_dict, n_words=n_words_source),
        load_dict(target_dict, n_words=n_words_target)
    ]

    # Open the two sets of files and merge them
    streams = [
        TextFile(source, dicts[0], bos_token=None,
                 eos_token=EOS_TOKEN).get_example_stream(),
        TextFile(target, dicts[1], bos_token=None,
                 eos_token=EOS_TOKEN).get_example_stream()
    ]
    merged = Merge(streams, ('source', 'target'))

    # Filter sentence lengths
    if max_src_length or max_trg_length:

        def filter_pair(pair):
            src, trg = pair
            src_ok = (not max_src_length) or len(src) < max_src_length
            trg_ok = (not max_trg_length) or len(trg) < max_trg_length
            return src_ok and trg_ok

        merged = Filter(merged, filter_pair)

    # Batches of approximately uniform size
    large_batches = Batch(merged,
                          iteration_scheme=ConstantScheme(batch_size *
                                                          buffer_multiplier))
    sorted_batches = Mapping(large_batches, SortMapping(_source_length))
    batches = Cache(sorted_batches, ConstantScheme(batch_size))
    shuffled_batches = Shuffle(batches, buffer_multiplier)
    masked_batches = Padding(shuffled_batches)

    return masked_batches
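
A minimal call sketch under assumed file names; the dictionaries are the tab-delimited vocabulary files the docstring describes:

# Placeholder file names.
masked_batches = get_stream(source=['train.en'],
                            target=['train.fr'],
                            source_dict='vocab.en.tsv',
                            target_dict='vocab.fr.tsv',
                            batch_size=64,
                            buffer_multiplier=100,
                            max_src_length=50,
                            max_trg_length=50)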
Example #14
        logger.info('Saving the main loop...')
        dump_manager = MainLoopDumpManager(save_location)
        dump_manager.dump(main_loop)
        logger.info('Saved')


if __name__ == "__main__":
    # Test
    cost = construct_model(50000, 256, 100, Tanh())
    vocabulary = get_vocabulary(50000)
    rare, freq = frequencies(vocabulary, 2000, 100)

    # Build training and validation datasets
    train_stream = Padding(
        Batch(Mapping(get_sentence_stream('training', [1], vocabulary),
                      add_frequency_all,
                      add_sources=("frequency_mask", )),
              iteration_scheme=ConstantScheme(64)))

    valid_stream = Padding(
        Batch(Mapping(get_sentence_stream('heldout', [1], vocabulary),
                      add_frequency_all,
                      add_sources=("frequency_mask", )),
              iteration_scheme=ConstantScheme(256)))

    valid_freq = Padding(
        Batch(Mapping(get_sentence_stream('heldout', [1], vocabulary),
                      add_frequency_mask(freq),
                      add_sources=("frequency_mask", )),
              iteration_scheme=ConstantScheme(256)))

    valid_rare = Padding(
Example #15
def _get_vl_stream(src_vocab,
                   trg_vocab,
                   src_files,
                   trg_files_list,
                   encoding='UTF-8',
                   preprocess=to_lower_case,
                   src_vocab_size=30000,
                   trg_vocab_size=30000,
                   eos='</S>',
                   eos_id=0,
                   unk='<UNK>',
                   unk_id=1,
                   batch_size=80,
                   sort_k_batches=12,
                   **kwargs):
    """Prepares the validation/test data stream."""
    src_dataset = TextFile(src_files,
                           src_vocab,
                           preprocess=preprocess,
                           bos_token=None,
                           eos_token=eos,
                           unk_token=unk,
                           encoding=encoding)

    trg_dataset_list = [
        TextFile(trg_files,
                 trg_vocab,
                 preprocess=preprocess,
                 bos_token=None,
                 eos_token=None,
                 unk_token=unk,
                 encoding=encoding) for trg_files in trg_files_list
    ]

    src_data_stream = DataStream(src_dataset)
    trg_data_stream_list = [
        DataStream(trg_dataset) for trg_dataset in trg_dataset_list
    ]

    # Replace out of vocabulary tokens with unk token
    if src_vocab_size < len(src_vocab):
        src_data_stream = Mapping(
            src_data_stream,
            _oov_to_unk(vocab_size=src_vocab_size, unk_id=unk_id))

    if trg_vocab_size < len(trg_vocab):
        trg_data_stream_list = [
            Mapping(trg_data_stream,
                    _oov_to_unk(vocab_size=trg_vocab_size, unk_id=unk_id))
            for trg_data_stream in trg_data_stream_list
        ]

    # Merge them to get a source, multiple references
    stream = Merge(
        [src_data_stream] + trg_data_stream_list, ('source', ) +
        tuple(['reference_%d' % i for i in range(len(trg_data_stream_list))]))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length(target_source_index=0)))

    # Convert it into a stream again
    stream = Unpack(stream)

    # TODO: create dynamic batches: a larger batch size for shorter sentences and a smaller one for longer sentences

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    stream = _PaddingWithToken(stream, eos_id, mask_sources=('source', ))

    return stream
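
A hedged usage sketch; trg_files_list holds one file list per reference set, so a dev set with two reference translations looks like this (placeholder paths; src_vocab and trg_vocab are dictionaries loaded elsewhere):

# Placeholder paths.
vl_stream = _get_vl_stream(src_vocab,
                           trg_vocab,
                           src_files=['dev.src'],
                           trg_files_list=[['dev.ref0'], ['dev.ref1']],
                           batch_size=80)
# Yields ('source', 'source_mask', 'reference_0', 'reference_1'); only the
# source is padded and masked, per mask_sources=('source',) above.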
Example #16
            [x if x < trg_vocab_size else unk_id for x in sentence_pair[1]])


def too_long(sentence_pair):
    return all([len(sentence) < config['seq_len'] for sentence in sentence_pair])


class CycleTextFile(TextFile):
    """This dataset cycles through the text files, reading a sentence
    from each.
    """
    def open(self):
        return chain.from_iterable(izip(*[chain.from_iterable(
            imap(open, repeat(f))) for f in self.files]))

en_dataset = CycleTextFile(en_files, cPickle.load(open(en_vocab)), None)
fr_dataset = CycleTextFile(fr_files, cPickle.load(open(fr_vocab)), None)

stream = Merge([en_dataset.get_example_stream(),
                fr_dataset.get_example_stream()],
               ('english', 'french'))

dev_dataset = TextFile([dev_file], cPickle.load(open(en_vocab)), None)
dev_stream = DataStream(dev_dataset)

filtered_stream = Filter(stream, predicate=too_long)
filtered_stream = Mapping(filtered_stream, _oov_to_unk)
batched_stream = Batch(filtered_stream,
        iteration_scheme=ConstantScheme(config['batch_size']))
masked_stream = Padding(batched_stream)
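
The open() override in CycleTextFile above interleaves the files line by line, reopening each file once it runs out, so the merged stream cycles forever; a rough standalone illustration of the same pattern:

# Illustration only (not the class above): yield line 1 of every file, then
# line 2 of every file, and so on, restarting a file when it is exhausted.
from itertools import chain

def cycle_lines(path):
    while True:
        with open(path) as f:
            for line in f:
                yield line

def interleaved(paths):
    return chain.from_iterable(zip(*[cycle_lines(p) for p in paths]))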
Example #17
        lang=exp_config['target_lang'],
        meteor_directory=exp_config['meteor_directory'])
# BLEU
else:
    sampling_transformer = MMMTSampleStreamTransformer(
        sampling_func,
        sentence_level_bleu,
        num_samples=exp_config['n_samples'])

training_stream = Mapping(training_stream,
                          sampling_transformer,
                          add_sources=('samples', 'scores'))

# Build a batched version of stream to read k batches ahead
training_stream = Batch(
    training_stream,
    iteration_scheme=ConstantScheme(exp_config['batch_size'] *
                                    exp_config['sort_k_batches']))

# TODO: add read-ahead shuffling Mapping similar to SortMapping
# Sort all samples in the read-ahead batch
training_stream = Mapping(training_stream, SortMapping(_length))

# Convert it into a stream again
training_stream = Unpack(training_stream)

# Construct batches from the stream with specified batch size
training_stream = Batch(training_stream,
                        iteration_scheme=ConstantScheme(
                            exp_config['batch_size']))

# Pad sequences that are short
Example #18
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost,  batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in params.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, params=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            application=generator.readout.readout,
            name="output")(cg.variables)
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(
                param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition("after_batch", _is_nan),
                Plot(os.path.basename(save_path),
                     [[average_monitoring.record_name(cost)],
                      [average_monitoring.record_name(cost_per_character)]],
                     every_n_batches=10),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_param_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(input_.shape[1], samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n" if mode == "sample"
                       else "Enter the beam size\n")
            batch_size = int(input(message))

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
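
For reference, a hedged sketch of how main() above might be driven; the argument values are placeholders:

# Placeholder arguments: train for 1000 batches, checkpointing to the given path.
main(mode="train", save_path="reverser.pkl", num_batches=1000)
# Afterwards, sample interactively from the saved model:
# main(mode="beam_search", save_path="reverser.pkl", num_batches=0)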
Example #19
def get_tr_stream_predict(src_vocab,
                          src_data,
                          trg_data,
                          src_vocab_size=30000,
                          trg_vocab_size=30000,
                          unk_id=1,
                          seq_len=50,
                          batch_size=80,
                          sort_k_batches=12,
                          **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(src_vocab if isinstance(
        src_vocab, dict) else cPickle.load(open(src_vocab)),
                                       bos_idx=0,
                                       eos_idx=2,
                                       unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, preprocess=get_unicode)
    trg_dataset = TextFile([trg_data], src_vocab, preprocess=get_unicode)
    #src_dataset = TextFile([src_data], src_vocab, None)
    #trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))
    # TODO: this was from today
    #print(type(src_dataset.get_example_stream()))
    #print(type(src_dataset))
    #print(list(src_dataset.get_example_stream().get_epoch_iterator()))
    #sys.exit(0)

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(
        stream,
        _oov_to_unk(src_vocab_size=src_vocab_size,
                    trg_vocab_size=src_vocab_size,
                    unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(stream,
                                   [src_vocab_size - 1, src_vocab_size - 1])

    return masked_stream
Example #20
def get_tr_stream(src_vocab,
                  trg_vocab,
                  src_data,
                  trg_data,
                  src_vocab_size=120,
                  trg_vocab_size=120,
                  unk_id=1,
                  bos_token='<S>',
                  seq_char_len=300,
                  seq_word_len=50,
                  batch_size=70,
                  sort_k_batches=12,
                  **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(src_vocab if isinstance(
        src_vocab, dict) else pickle.load(open(src_vocab, 'rb')),
                                       bos_idx=0,
                                       eos_idx=src_vocab_size - 1,
                                       unk_idx=unk_id)

    trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
        trg_vocab, dict) else pickle.load(open(trg_vocab, 'rb')),
                                       bos_idx=0,
                                       eos_idx=trg_vocab_size - 1,
                                       unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFileWithSEOSS([src_data],
                                    src_vocab,
                                    None,
                                    level='character')
    trg_dataset = TextFileWithSEOSS([trg_data],
                                    trg_vocab,
                                    None,
                                    level='character')

    # Merge them to get a source, target pair
    stream = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream,
                    predicate=_too_long(
                        unk_id=unk_id,
                        space_idx=[src_vocab[' '], trg_vocab[' ']],
                        seq_char_len=seq_char_len,
                        seq_word_len=seq_word_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(
        stream,
        _oov_to_unk(src_vocab_size=src_vocab_size,
                    trg_vocab_size=trg_vocab_size,
                    unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)
    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(stream, {
        'source': src_vocab[' '],
        'target': trg_vocab[' ']
    },
                                   trg_vocab[bos_token],
                                   mask_dtype='int8')

    return masked_stream
Example #21
def get_logprob_streams(config):
    if 'log_prob_sets' not in config:
        return None

    cgs = config['cgs']
    enc_ids, dec_ids = get_enc_dec_ids(cgs)
    datasets = config['log_prob_sets']

    # Prepare source vocabs and files, make sure special tokens are there
    src_vocabs = {
        k: cPickle.load(open(v))
        for k, v in config['src_vocabs'].iteritems()
    }
    for k in src_vocabs.keys():
        src_vocabs[k]['<S>'] = 0
        src_vocabs[k]['</S>'] = config['src_eos_idxs'][k]
        src_vocabs[k]['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_vocabs = {
        k: cPickle.load(open(v))
        for k, v in config['trg_vocabs'].iteritems()
    }
    for k in trg_vocabs.keys():
        trg_vocabs[k]['<S>'] = 0
        trg_vocabs[k]['</S>'] = config['trg_eos_idxs'][k]
        trg_vocabs[k]['<UNK>'] = config['unk_id']

    # Build the preprocessing pipeline for individual streams
    ind_streams = {}
    for cg in cgs:
        eid, did = p_(cg)
        if cg not in datasets:
            continue
        logger.info('Building logprob stream for cg:[{}]'.format(cg))
        src_dataset = TextFile([datasets[cg][0]], src_vocabs[p_(cg)[0]], None)
        trg_dataset = TextFile([datasets[cg][1]], trg_vocabs[p_(cg)[1]], None)
        stream = Merge([
            src_dataset.get_example_stream(),
            trg_dataset.get_example_stream()
        ], ('source', 'target'))

        stream = Mapping(
            stream,
            _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid],
                        trg_vocab_size=config['trg_vocab_sizes'][did],
                        unk_id=config['unk_id']))
        bs = 100
        if 'log_prob_bs' in config:
            if isinstance(config['log_prob_bs'], dict):
                bs = config['log_prob_bs'][cg]
            else:
                bs = config['log_prob_bs']

        stream = Batch(stream, iteration_scheme=ConstantScheme(bs))

        masked_stream = Padding(stream)
        masked_stream = Mapping(
            masked_stream,
            _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                           (2, 0, config['trg_eos_idxs'][did])]))
        ind_streams[cg] = masked_stream

    return ind_streams
Example #22
 def test_strictness_2_error(self):
     stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))
     transformer = Batch(stream, ConstantScheme(2), strictness=2)
     assert_raises(ValueError, list, transformer.get_epoch_iterator())
Example #23
def get_tr_stream_with_topicalq(src_vocab,
                                trg_vocab,
                                topical_vocab,
                                src_data,
                                trg_data,
                                topical_data,
                                src_vocab_size=30000,
                                trg_vocab_size=30000,
                                topical_vocab_size=2000,
                                unk_id=1,
                                seq_len=50,
                                batch_size=80,
                                sort_k_batches=12,
                                **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist

    src_vocab = _ensure_special_tokens(src_vocab if isinstance(
        src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
                                       bos_idx=0,
                                       eos_idx=src_vocab_size - 1,
                                       unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
        trg_vocab, dict) else cPickle.load(open(trg_vocab, 'rb')),
                                       bos_idx=0,
                                       eos_idx=trg_vocab_size - 1,
                                       unk_idx=unk_id)
    topical_vocab = cPickle.load(open(topical_vocab, 'rb'))
    # Special tokens are not ensured for the topical vocabulary.

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)
    topical_dataset = TextFile([topical_data], topical_vocab, None, None, '10')

    # Merge them to get a source, target pair
    stream = Merge([
        src_dataset.get_example_stream(),
        trg_dataset.get_example_stream(),
        topical_dataset.get_example_stream()
    ], ('source', 'target', 'source_topical'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # The topical part is not covered by this mapping; check.
    stream = Mapping(
        stream,
        _oov_to_unk(src_vocab_size=src_vocab_size,
                    trg_vocab_size=trg_vocab_size,
                    topical_vocab_size=topical_vocab_size,
                    unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream,
        [src_vocab_size - 1, trg_vocab_size - 1, topical_vocab_size - 1])

    return masked_stream
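`_too_long` and `_length` are again defined outside this snippet. Plausible minimal versions, assuming `_too_long` checks the source and target lengths against `seq_len` and `_length` is the SortMapping key that sorts by target length (sketches, not the project's originals):

class _too_long(object):
    """Sketch: keep an example only if source and target fit within seq_len."""

    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_tuple):
        return all(len(sentence) <= self.seq_len
                   for sentence in sentence_tuple[:2])


def _length(sentence_tuple):
    """Sketch of the SortMapping key: sort the read-ahead batch by target length."""
    return len(sentence_tuple[1])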
Exemple #24
0
 def test_value_error_on_request_none(self):
     stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))
     transformer = Batch(stream, ConstantScheme(2))
     assert_raises(ValueError, transformer.get_data, None)
Exemple #25
0
def setup_model_and_stream(exp_config, source_vocab, target_vocab):

    # TODO: this line is a mess
    sample_model, theano_sampling_source_input, theano_sampling_context_input, train_encoder, \
    train_decoder, generated = \
        get_sampling_model_and_input(exp_config)

    trg_vocab = target_vocab
    trg_vocab_size = exp_config['trg_vocab_size']
    src_vocab = source_vocab
    src_vocab_size = exp_config['src_vocab_size']

    theano_sample_func = sample_model.get_theano_function()
    sampling_func = SampleFunc(theano_sample_func, trg_vocab)

    # TODO: move stream creation to nn_imt.stream
    # def get_textfile_stream(source_file=None, src_vocab=None, src_vocab_size=30000,
    #                         unk_id=1, bos_token=None):
    src_stream = get_textfile_stream(
        source_file=exp_config['src_data'],
        src_vocab=exp_config['src_vocab'],
        src_vocab_size=exp_config['src_vocab_size'],
        unk_id=exp_config['unk_id'],
        bos_token='<S>')

    trg_stream = get_textfile_stream(
        source_file=exp_config['trg_data'],
        src_vocab=exp_config['trg_vocab'],
        src_vocab_size=exp_config['trg_vocab_size'],
        unk_id=exp_config['unk_id'],
        bos_token='<S>')

    # text file stream
    training_stream = Merge([src_stream, trg_stream], ('source', 'target'))

    # Filter sequences that are too long (Note this may break)
    training_stream = Filter(
        training_stream, predicate=_too_long(seq_len=exp_config['seq_len']))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    training_stream = Mapping(
        training_stream,
        _oov_to_unk(src_vocab_size=exp_config['src_vocab_size'],
                    trg_vocab_size=exp_config['trg_vocab_size'],
                    unk_id=exp_config['unk_id']))

    # add in the prefix and suffix seqs
    # working: add the sample ratio
    logger.info('Sample ratio is: {}'.format(exp_config.get(
        'sample_ratio', 1.)))
    training_stream = Mapping(
        training_stream,
        PrefixSuffixStreamTransformer(
            sample_ratio=exp_config.get('sample_ratio', 1.)),
        add_sources=('target_prefix', 'target_suffix'))

    training_stream = Mapping(
        training_stream, CopySourceAndTargetToMatchPrefixes(training_stream))

    # changing stream.produces_examples is a little hack which lets us use Unpack to flatten
    training_stream.produces_examples = False

    # flatten the stream back out into (source, target, target_prefix, target_suffix)
    training_stream = Unpack(training_stream)

    # METEOR
    trg_ivocab = {v: k for k, v in trg_vocab.items()}

    # TODO: Implement smoothed BLEU
    # TODO: Implement first-word accuracy (bilingual language model)

    min_risk_score_func = exp_config.get('min_risk_score_func', 'bleu')

    if min_risk_score_func == 'meteor':
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_meteor,
            num_samples=exp_config['n_samples'],
            trg_ivocab=trg_ivocab,
            lang=exp_config['target_lang'],
            meteor_directory=exp_config['meteor_directory'])
    elif min_risk_score_func == 'imt_f1':
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_imt_f1,
            num_samples=exp_config['n_samples'])
    # BLEU is default
    else:
        sampling_transformer = IMTSampleStreamTransformer(
            sampling_func,
            sentence_level_bleu,
            num_samples=exp_config['n_samples'])

    training_stream = Mapping(training_stream,
                              sampling_transformer,
                              add_sources=('samples', 'seq_probs', 'scores'))

    # now filter out segments whose samples are too good or too bad
    training_stream = Filter(training_stream, predicate=filter_by_sample_score)

    # Now make a very big batch that we can shuffle
    # Build a batched version of stream to read k batches ahead
    shuffle_batch_size = exp_config['shuffle_batch_size']
    training_stream = Batch(
        training_stream, iteration_scheme=ConstantScheme(shuffle_batch_size))

    training_stream = ShuffleBatchTransformer(training_stream)

    # unpack it again
    training_stream = Unpack(training_stream)

    # Build a batched version of stream to read k batches ahead
    batch_size = exp_config['batch_size']
    sort_k_batches = exp_config['sort_k_batches']
    training_stream = Batch(training_stream,
                            iteration_scheme=ConstantScheme(batch_size *
                                                            sort_k_batches))

    # Sort all samples in the read-ahead batch
    training_stream = Mapping(training_stream, SortMapping(_length))

    # Convert it into a stream again
    training_stream = Unpack(training_stream)

    # Construct batches from the stream with specified batch size
    training_stream = Batch(training_stream,
                            iteration_scheme=ConstantScheme(batch_size))

    # IDEA: add a transformer which flattens the target samples before we add the mask
    flat_sample_stream = FlattenSamples(training_stream)

    expanded_source_stream = CopySourceAndPrefixNTimes(
        flat_sample_stream, n_samples=exp_config['n_samples'])

    # Pad sequences that are short
    # TODO: is it correct to blindly pad the target_prefix and the target_suffix?
    # Note: we shouldn't need to pad the seq_probs because there is only one per sequence
    # TODO: DEVELOPMENT HACK
    exp_config['suffix_length'] = 1
    exp_config['truncate_sources'] = ['target_suffix']
    configurable_padding_args = {
        'suffix_length': exp_config.get('suffix_length', None),
        'truncate_sources': exp_config.get('truncate_sources', [])
    }
    # import ipdb
    # ipdb.set_trace()
    masked_stream = PaddingWithEOS(expanded_source_stream, [
        src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1,
        trg_vocab_size - 1, trg_vocab_size - 1
    ],
                                   mask_sources=('source', 'target',
                                                 'target_prefix',
                                                 'target_suffix', 'samples'),
                                   **configurable_padding_args)

    return train_encoder, train_decoder, theano_sampling_source_input, theano_sampling_context_input, generated, masked_stream
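`filter_by_sample_score` is not shown here. A hypothetical stand-in consistent with the comment above (drop segments whose sampled hypotheses score uniformly too high or too low to carry a useful learning signal; the thresholds are made up for illustration):

def filter_by_sample_score(data, low=0.05, high=0.95):
    """Hypothetical predicate: 'scores' is the last source added above."""
    scores = data[-1]
    mean_score = float(sum(scores)) / max(len(scores), 1)
    return low < mean_score < high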
Exemple #26
0
 def test_2d_sequences_error_on_unequal_shapes(self):
     stream = Batch(
         DataStream(
             IterableDataset([numpy.ones((3, 4)), 2 * numpy.ones((2, 3))])),
         ConstantScheme(2))
     assert_raises(ValueError, next, Padding(stream).get_epoch_iterator())
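Padding does handle higher-rank sequences when the trailing dimensions agree; only the leading (time) axis may differ between examples, e.g.:

import numpy
from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import Batch, Padding

stream = Batch(
    DataStream(IterableDataset([numpy.ones((3, 4)), 2 * numpy.ones((2, 4))])),
    ConstantScheme(2))
data, mask = next(Padding(stream).get_epoch_iterator())
print(data.shape)  # (2, 3, 4): the shorter example is zero-padded along time
print(mask)        # [[1. 1. 1.] [1. 1. 0.]]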
Exemple #27
0
m = MorphGen(100, len(Globals.char2code))

dataset_options = dict(dictionary=Globals.char2code,
                       level="word",
                       preprocess=_tokenise)

dataset = TextFile([f_train], **dataset_options)

data_stream = dataset.get_example_stream()

# Read examples and look up the right surface form
data_stream = Mapping(data_stream, morph_lookup, add_sources=("targets", ))

# Read in 10 samples at a time
data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))

# Pad the examples
data_stream = Padding(data_stream)
data_stream = Mapping(data_stream, _transpose)

# Initialisation settings

m.weights_init = IsotropicGaussian(0.1)
m.biases_init = Constant(0.0)
m.push_initialization_config()
m.encoder.weights_init = Orthogonal()
m.generator.transition.weights_init = Orthogonal()

# Build the cost computation graph
chars = tensor.lmatrix("features")
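`_transpose` (like the other helpers above) comes from the surrounding project. A typical definition, assuming it swaps Padding's (batch, time) layout into the (time, batch) layout that Blocks recurrent bricks expect (a sketch, not necessarily the original):

def _transpose(data):
    """Sketch: swap the batch and time axes of every array in the batch tuple."""
    return tuple(array.T for array in data)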
Exemple #28
0
def get_tr_stream(src_vocab,
                  trg_vocab,
                  src_files,
                  trg_files,
                  encoding='UTF-8',
                  preprocess=to_lower_case,
                  src_vocab_size=30000,
                  trg_vocab_size=30000,
                  eos='</S>',
                  eos_id=0,
                  unk='<UNK>',
                  unk_id=1,
                  seq_len=50,
                  batch_size=80,
                  sort_k_batches=12,
                  **kwargs):
    """Prepares the training data stream."""

    src_dataset = TextFile(src_files,
                           src_vocab,
                           preprocess=preprocess,
                           bos_token=None,
                           eos_token=eos,
                           unk_token=unk,
                           encoding=encoding)
    trg_dataset = TextFile(trg_files,
                           trg_vocab,
                           preprocess=preprocess,
                           bos_token=None,
                           eos_token=eos,
                           unk_token=unk,
                           encoding=encoding)

    src_data_stream = DataStream(src_dataset)
    trg_data_stream = DataStream(trg_dataset)

    # Replace out of vocabulary tokens with unk token
    if src_vocab_size < len(src_vocab):
        src_data_stream = Mapping(
            src_data_stream,
            _oov_to_unk(vocab_size=src_vocab_size, unk_id=unk_id))

    if trg_vocab_size < len(trg_vocab):
        trg_data_stream = Mapping(
            trg_data_stream,
            _oov_to_unk(vocab_size=trg_vocab_size, unk_id=unk_id))

    # Merge them to get a source, target pair
    stream = Merge([src_data_stream, trg_data_stream], ('source', 'target'))

    # Filter sequences that are too long (either source or target)
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length(target_source_index=1)))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    stream = _PaddingWithToken(stream, eos_id)

    # Attach one-hot ground truth data stream
    stream = Mapping(stream,
                     _to_one_hot(target_source_index=2,
                                 vacabuary_size=trg_vocab_size),
                     add_sources=("one_hot_ground_truth", ))

    return stream
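This example uses a single-vocabulary variant of `_oov_to_unk`, and `_to_one_hot` expands the padded target ids into a one-hot tensor. Plausible sketches of both, assuming the usual "ids at or above vocab_size become unk_id" semantics and a (batch, time, vocab) one-hot layout; the second deliberately mirrors the `vacabuary_size` spelling used in the call above:

import numpy


class _oov_to_unk(object):
    """Sketch: single-stream variant that remaps ids >= vocab_size to unk_id."""

    def __init__(self, vocab_size=30000, unk_id=1):
        self.vocab_size = vocab_size
        self.unk_id = unk_id

    def __call__(self, sentence_tuple):
        return tuple([w if w < self.vocab_size else self.unk_id for w in sentence]
                     for sentence in sentence_tuple)


class _to_one_hot(object):
    """Sketch: turn the padded target id matrix at target_source_index into a
    (batch, time, vocab) one-hot tensor, returned as the new source."""

    def __init__(self, target_source_index, vacabuary_size):
        self.target_source_index = target_source_index
        self.vocab_size = vacabuary_size

    def __call__(self, data):
        targets = numpy.asarray(data[self.target_source_index], dtype='int64')
        one_hot = numpy.zeros(targets.shape + (self.vocab_size,), dtype='float32')
        batch_idx, time_idx = numpy.indices(targets.shape)
        one_hot[batch_idx, time_idx, targets] = 1.0
        return (one_hot,)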
Exemple #29
0
 def test_strictness_1(self):
     stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))
     transformer = Batch(stream, ConstantScheme(2), strictness=1)
     assert_equal(list(transformer.get_epoch_iterator()),
                  [(numpy.array([1, 2]),), (numpy.array([3, 4]),)])
Exemple #30
0
def get_tr_stream_withContext_withPosTag(src_vocab,
                                         trg_vocab,
                                         ctx_datas,
                                         posTag_datas,
                                         ctx_num,
                                         src_data,
                                         trg_data,
                                         src_vocab_size=30000,
                                         trg_vocab_size=30000,
                                         unk_id=1,
                                         seq_len=50,
                                         batch_size=80,
                                         sort_k_batches=12,
                                         **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(src_vocab if isinstance(
        src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
                                       bos_idx=0,
                                       eos_idx=src_vocab_size - 1,
                                       unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
        trg_vocab, dict) else cPickle.load(open(trg_vocab, 'rb')),
                                       bos_idx=0,
                                       eos_idx=trg_vocab_size - 1,
                                       unk_idx=unk_id)

    # Get text files from both source and target
    ctx_datasets = []
    posTag_datasets = []
    for i in range(ctx_num):
        ctx_datasets.append(TextFile([ctx_datas[i]], src_vocab, None))
        posTag_datasets.append(TextFile([posTag_datas[i]], src_vocab, None))
    posTag_datasets.append(TextFile([posTag_datas[ctx_num]], src_vocab, None))
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge(
        [i.get_example_stream() for i in ctx_datasets] +
        [i.get_example_stream() for i in posTag_datasets] +
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()],
        tuple('context_' + str(i) for i in range(ctx_num)) +
        tuple('context_posTag_' + str(i)
              for i in range(ctx_num)) + ('source_posTag', 'source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(
        stream,
        _oov_to_unk_posTag(ctx_num=ctx_num,
                           src_vocab_size=src_vocab_size,
                           trg_vocab_size=trg_vocab_size,
                           unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOSContext(
        stream, [src_vocab_size - 1
                 for i in range(2 * ctx_num + 2)] + [trg_vocab_size - 1])

    return masked_stream
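`_oov_to_unk_posTag` is the context-aware analogue of `_oov_to_unk`. A minimal sketch, assuming the first 2 * ctx_num + 2 sources (contexts, posTags and source) share the source vocabulary and the final source is the target (a reconstruction, not the original):

class _oov_to_unk_posTag(object):
    """Sketch: remap ids >= the relevant vocabulary size to unk_id for all
    context, posTag and source streams (source vocab) and the target stream
    (target vocab)."""

    def __init__(self, ctx_num, src_vocab_size=30000, trg_vocab_size=30000,
                 unk_id=1):
        self.ctx_num = ctx_num
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.unk_id = unk_id

    def __call__(self, sentence_tuple):
        remapped = []
        last = len(sentence_tuple) - 1
        for i, sentence in enumerate(sentence_tuple):
            vocab_size = self.trg_vocab_size if i == last else self.src_vocab_size
            remapped.append([w if w < vocab_size else self.unk_id
                             for w in sentence])
        return tuple(remapped)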