def get_test_stream_withContext_withPosTag_grd(test_ctx_datas=None,
                                               test_posTag_datas=None,
                                               test_set_source=None,
                                               test_set_target=None,
                                               src_vocab=None,
                                               src_vocab_size=30000,
                                               trg_vocab=None,
                                               trg_vocab_size=30000,
                                               unk_id=1,
                                               ctx_num=3,
                                               batch_size=80,
                                               **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if test_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab, 'rb')),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
        print test_set_source, type(src_vocab)

        # Get text files from both source and target
        ctx_datasets = []
        posTag_datasets = []
        for i in range(ctx_num):
            ctx_datasets.append(TextFile([test_ctx_datas[i]], src_vocab, None))
            posTag_datasets.append(
                TextFile([test_posTag_datas[i]], src_vocab, None))
        posTag_datasets.append(
            TextFile([test_posTag_datas[ctx_num]], src_vocab, None))
        src_dataset = TextFile([test_set_source], src_vocab, None)
        trg_dataset = TextFile([test_set_target], trg_vocab, None)

        # Merge them to get a source, target pair
        dev_stream = Merge(
            [i.get_example_stream() for i in ctx_datasets] +
            [i.get_example_stream() for i in posTag_datasets] +
            [src_dataset.get_example_stream(),
             trg_dataset.get_example_stream()],
            tuple('context_' + str(i) for i in range(ctx_num)) +
            tuple('context_posTag_' + str(i) for i in range(ctx_num)) +
            ('source_posTag', 'source', 'target'))

        # Replace out of vocabulary tokens with unk token
        stream = Mapping(
            dev_stream,
            _oov_to_unk_posTag(ctx_num=ctx_num,
                               src_vocab_size=src_vocab_size,
                               trg_vocab_size=trg_vocab_size,
                               unk_id=unk_id))

        # Build a batched version of stream to read k batches ahead
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(batch_size=batch_size))

        # Pad sequences that are short
        masked_stream = PaddingWithEOSContext(
            stream,
            [src_vocab_size - 1 for i in range(2 * ctx_num + 2)] +
            [trg_vocab_size - 1])
        return masked_stream
def get_stream(self, part, batches=True, shuffle=True, add_sources=(), num_examples=None, rng=None, seed=None): dataset = self.get_dataset(part, add_sources=add_sources) iteration_scheme = None if self.use_iteration_scheme: if num_examples is None: num_examples = dataset.num_examples if shuffle: iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng) else: iteration_scheme = SequentialExampleScheme(num_examples) stream = DataStream(dataset, iteration_scheme=iteration_scheme) # Transformations before rearrangement labels_source = self.sources_map['labels'] if self.add_eos: stream = _AddLabel(stream, self.eos_label, which_sources=[labels_source]) if self.add_bos: if self.bos_label is None: raise Exception('No bos label given') stream = _AddLabel(stream, self.bos_label, append=False, times=self.add_bos, which_sources=[labels_source]) if self.clip_length: stream = _Clip(stream, self.clip_length, force_eos=self.eos_label if self.force_eos_when_clipping else None, which_sources=[labels_source]) # More efficient packing of examples in batches if self.sort_k_batches and batches: stream = Batch(stream, iteration_scheme=ConstantScheme( self.batch_size * self.sort_k_batches)) stream = Mapping(stream, SortMapping(_Length(index=0))) stream = Unpack(stream) stream = Rearrange( stream, dict_subset(self.sources_map, self.default_sources + list(add_sources))) # Tranformations after rearrangement if self.corrupt_sources: # Can only corrupt sources with the same alphabet # as labels for source, prob in zip(self.corrupt_sources['names'], self.corrupt_sources['probs']): stream = _Corrupt(stream, prob, self.token_map(source), self.eos_label, which_sources=[source]) if self.max_length and part == 'train': # Filtering by the maximum length is only done # for the training set. self.length_filter = _LengthFilter(indices=[ i for i, source in enumerate(stream.sources) if source in self.filter_by ], max_length=self.max_length) stream = Filter(stream, self.length_filter) stream = ForceFloatX(stream) if not batches: return stream stream = Batch( stream, iteration_scheme=ConstantScheme(self.batch_size if part == 'train' else self.validation_batch_size)) stream = Padding(stream) stream = Mapping(stream, switch_first_two_axes) stream = ForceCContiguous(stream) return stream
def setUp(self):
    self.stream = Batch(
        DataStream(IterableDataset(range(100))), ConstantScheme(11))
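# A minimal sketch of what the fixture above yields, assuming standard Fuel
# behaviour (the default strictness=0 keeps a smaller final batch): 100 examples
# grouped by ConstantScheme(11) give nine full batches of 11 plus one of 1.
from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import Batch

stream = Batch(DataStream(IterableDataset(range(100))), ConstantScheme(11))
batches = list(stream.get_epoch_iterator())
assert len(batches) == 10
assert len(batches[0][0]) == 11 and len(batches[-1][0]) == 1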
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data, src_vocab_size=30000, trg_vocab_size=30000, unk_id=1, seq_len=50, batch_size=80, sort_k_batches=12, bos_token=None, **kwargs): """Prepares the training data stream.""" if type(bos_token) is str: bos_token = bos_token.decode('utf8') # Load dictionaries and ensure special tokens exist src_vocab = _ensure_special_tokens(src_vocab if isinstance( src_vocab, dict) else cPickle.load(open(src_vocab)), bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id) trg_vocab = _ensure_special_tokens(trg_vocab if isinstance( trg_vocab, dict) else cPickle.load(open(trg_vocab)), bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id) # Get text files from both source and target src_dataset = TextFile([src_data], src_vocab, bos_token=bos_token, eos_token=u'</S>', unk_token=u'<UNK>', encoding='utf8') trg_dataset = TextFile([trg_data], trg_vocab, bos_token=bos_token, eos_token=u'</S>', unk_token=u'<UNK>', encoding='utf8') # Merge them to get a source, target pair stream = Merge( [src_dataset.get_example_stream(), trg_dataset.get_example_stream()], ('source', 'target')) # Filter sequences that are too long stream = Filter(stream, predicate=_too_long(seq_len=seq_len)) # Replace out of vocabulary tokens with unk token # TODO: doesn't the TextFile stream do this anyway? stream = Mapping( stream, _oov_to_unk(src_vocab_size=src_vocab_size, trg_vocab_size=trg_vocab_size, unk_id=unk_id)) # Now make a very big batch that we can shuffle shuffle_batch_size = kwargs.get('shuffle_batch_size', 1000) stream = Batch(stream, iteration_scheme=ConstantScheme(shuffle_batch_size)) stream = ShuffleBatchTransformer(stream) # unpack it again stream = Unpack(stream) # Build a batched version of stream to read k batches ahead stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size * sort_k_batches)) # Sort all samples in the read-ahead batch stream = Mapping(stream, SortMapping(_length)) # Convert it into a stream again stream = Unpack(stream) # Construct batches from the stream with specified batch size stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size)) # Pad sequences that are short masked_stream = PaddingWithEOS(stream, [src_vocab_size - 1, trg_vocab_size - 1]) return masked_stream, src_vocab, trg_vocab
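# The helpers used above (_too_long, _oov_to_unk, _length) are defined elsewhere
# in the module; the definitions below are a sketch of the usual Blocks/Fuel
# machine-translation versions, not necessarily the exact originals.
class _too_long(object):
    """Predicate for Filter: keep a pair only if both sides fit within seq_len."""
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        return all(len(sentence) <= self.seq_len for sentence in sentence_pair)


class _oov_to_unk(object):
    """Mapping: replace out-of-vocabulary token ids with unk_id on both sides."""
    def __init__(self, src_vocab_size=30000, trg_vocab_size=30000, unk_id=1):
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.unk_id = unk_id

    def __call__(self, sentence_pair):
        return ([x if x < self.src_vocab_size else self.unk_id
                 for x in sentence_pair[0]],
                [x if x < self.trg_vocab_size else self.unk_id
                 for x in sentence_pair[1]])


def _length(sentence_pair):
    """Sort key for SortMapping: length of the target sentence."""
    return len(sentence_pair[-1])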
data_paths = [
    '/home/andrewsm/SEER/external/CoNLL2003/ner/eng.train',
]  # 3.3Mb file
## Achieved result: 50 epochs of (GPU) training on eng.train => testb overall scores:
##   accuracy: 96.42%; precision: 76.95%; recall: 80.26%; FB1: 78.57

dataset = CoNLLTextFile(data_paths, dictionary=word2code, unknown_token='<UNK>')

data_stream = DataStream(dataset)
data_stream = Filter(data_stream, _filter_long)
# data_stream = Mapping(data_stream, reverse_words, add_sources=("targets",))

data_stream = Batch(data_stream,
                    iteration_scheme=ConstantScheme(mini_batch_size))
# data_stream = Padding(data_stream, mask_sources=('tokens',))  # Adds a mask field to just this source, type='floatX'
data_stream = Padding(data_stream)  # Adds a mask field to all of this stream's sources, type='floatX'
data_stream = Mapping(data_stream, _transpose)  # Flips stream so that sentences run down columns, batches along rows (strangely)

if False:  # print sample for debugging Dataset / DataStream component
    # t = 0
    max_len = 0
    for i, data in enumerate(data_stream.get_epoch_iterator()):
        # print(i)
        # t = t + data[4].sum() + data[0].shape[1]
def try_strict(strictness):
    return list(
        Batch(stream, ConstantScheme(2),
              strictness=strictness).get_epoch_iterator())
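# The three strictness levels are exercised by the surrounding tests; assuming
# the standard Fuel semantics (0 keeps a smaller final batch, 1 silently drops
# the leftovers, 2 raises ValueError), a short illustration:
from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import Batch

def batches_with(strictness):
    stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))
    return list(Batch(stream, ConstantScheme(2),
                      strictness=strictness).get_epoch_iterator())

print(batches_with(0))  # three batches; the last one holds only the leftover 5
print(batches_with(1))  # two full batches; the leftover example is discarded
# batches_with(2) raises ValueError, as test_strictness_2_error below verifies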
def get_src_trg_stream(cg, config, src_datasets=None, trg_datasets=None, is_training=True, src_vocabs=None, trg_vocabs=None, logprob_datasets=None): eid, did = p_(cg) if is_training: logger.info(' ... src:[{}] - [{}]'.format(eid, src_datasets[cg].files[0])) logger.info(' ... trg:[{}] - [{}]'.format(did, trg_datasets[cg].files[0])) stream = Merge([ src_datasets[cg].get_example_stream(), trg_datasets[cg].get_example_stream() ], ('source', 'target')) stream = Filter(stream, predicate=_too_long(config['seq_len'])) if 'min_seq_lens' in config and config['min_seq_lens'][cg] > 0: stream = Filter(stream, predicate=_too_short(config['min_seq_lens'][cg])) stream = Mapping( stream, _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid], trg_vocab_size=config['trg_vocab_sizes'][did], unk_id=config['unk_id'])) stream = Batch( stream, iteration_scheme=ConstantScheme(config['batch_sizes'][cg] * config['sort_k_batches'])) stream = Mapping(stream, SortMapping(_length)) stream = Unpack(stream) stream = Batch(stream, iteration_scheme=ConstantScheme( config['batch_sizes'][cg])) else: # logprob stream src_dataset = TextFile([logprob_datasets[cg][0]], src_vocabs[p_(cg)[0]], None) trg_dataset = TextFile([logprob_datasets[cg][1]], trg_vocabs[p_(cg)[1]], None) stream = Merge([ src_dataset.get_example_stream(), trg_dataset.get_example_stream() ], ('source', 'target')) stream = Mapping( stream, _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid], trg_vocab_size=config['trg_vocab_sizes'][did], unk_id=config['unk_id'])) bs = 100 if 'log_prob_bs' in config: if isinstance(config['log_prob_bs'], dict): bs = config['log_prob_bs'][cg] else: bs = config['log_prob_bs'] stream = Batch(stream, iteration_scheme=ConstantScheme(bs)) masked_stream = Padding(stream) masked_stream = Mapping( masked_stream, _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]), (2, 0, config['trg_eos_idxs'][did])])) return masked_stream
def get_tr_stream_with_context_features(src_vocab, trg_vocab, src_data, trg_data, context_features, src_vocab_size=30000, trg_vocab_size=30000, unk_id=1, seq_len=50, batch_size=80, sort_k_batches=12, **kwargs): """Prepares the training data stream.""" def _get_np_array(filename): return numpy.load(filename)['arr_0'] # Load dictionaries and ensure special tokens exist src_vocab = _ensure_special_tokens( src_vocab if isinstance(src_vocab, dict) else cPickle.load(open(src_vocab)), bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id) trg_vocab = _ensure_special_tokens( trg_vocab if isinstance(trg_vocab, dict) else cPickle.load(open(trg_vocab)), bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id) # Get text files from both source and target src_dataset = TextFile([src_data], src_vocab, None) trg_dataset = TextFile([trg_data], trg_vocab, None) # Merge them to get a source, target pair stream = Merge([src_dataset.get_example_stream(), trg_dataset.get_example_stream()], ('source', 'target')) # Filter sequences that are too long stream = Filter(stream, predicate=_too_long(seq_len=seq_len)) # Replace out of vocabulary tokens with unk token # TODO: doesn't the TextFile stream do this anyway? stream = Mapping(stream, _oov_to_unk(src_vocab_size=src_vocab_size, trg_vocab_size=trg_vocab_size, unk_id=unk_id)) # now add the source with the image features # create the image datastream (iterate over a file line-by-line) train_features = _get_np_array(context_features) train_feature_dataset = IterableDataset(train_features) train_image_stream = DataStream(train_feature_dataset) stream = Merge([stream, train_image_stream], ('source', 'target', 'initial_context')) # Build a batched version of stream to read k batches ahead stream = Batch(stream, iteration_scheme=ConstantScheme( batch_size*sort_k_batches)) # Sort all samples in the read-ahead batch stream = Mapping(stream, SortMapping(_length)) # Convert it into a stream again stream = Unpack(stream) # Construct batches from the stream with specified batch size stream = Batch( stream, iteration_scheme=ConstantScheme(batch_size)) # Pad sequences that are short masked_stream = PaddingWithEOS( stream, [src_vocab_size - 1, trg_vocab_size - 1], mask_sources=('source', 'target')) return masked_stream, src_vocab, trg_vocab
def get_stream(source, target, source_input_dict, target_label_dict, batch_size, buffer_multiplier=100, input_token_level='word', n_input_tokens=0, n_labels=0, reverse_labels=False, max_input_length=None, max_label_length=None, pad_labels=True, is_sort=True): """Returns a stream over sentence pairs. Parameters ---------- source : list A list of files to read source languages from. target : list A list of corresponding files in the target language. source_word_dict : str Path to a tab-delimited text file whose last column contains the vocabulary. target_label_dict : str See `source_char_dict`. batch_size : int The minibatch size. buffer_multiplier : int The number of batches to load, concatenate, sort by length of source sentence, and split again; this makes batches more uniform in their sentence length and hence more computationally efficient. n_source_words : int The number of words in the source vocabulary. Pass 0 (default) to use the entire vocabulary. n_target_labels : int See `n_chars_source`. """ if len(source) != len(target): raise ValueError("number of source and target files don't match") # Read the dictionaries dicts = [ load_dict(source_input_dict, dict_size=n_input_tokens), load_dict(target_label_dict, dict_size=n_labels, reverse=reverse_labels, include_unk=False) ] # Open the two sets of files and merge them streams = [ TextFile(source, dicts[0], level=input_token_level, bos_token=None, eos_token=EOS_TOKEN, encoding='utf-8').get_example_stream(), TextFile(target, dicts[1], level='word', bos_token=None, unk_token=None, eos_token=EOS_TOKEN, encoding='utf-8').get_example_stream() ] merged = Merge(streams, ('source_input_tokens', 'target_labels')) if reverse_labels: merged = SortLabels(merged) # Filter sentence lengths if max_input_length or max_label_length: def filter_pair(pair): src_input_tokens, trg_labels = pair src_input_ok = (not max_input_length) or \ len(src_input_tokens) <= (max_input_length + 1) trg_label_ok = (not max_label_length) or \ len(trg_labels) <= (max_label_length + 1) return src_input_ok and trg_label_ok merged = Filter(merged, filter_pair) # Batches of approximately uniform size large_batches = Batch(merged, iteration_scheme=ConstantScheme(batch_size * buffer_multiplier)) # sorted_batches = Mapping(large_batches, SortMapping(_source_length)) # batches = Cache(sorted_batches, ConstantScheme(batch_size)) # shuffled_batches = Shuffle(batches, buffer_multiplier) # masked_batches = Padding(shuffled_batches, # mask_sources=('source_chars', 'target_labels')) if is_sort: sorted_batches = Mapping(large_batches, SortMapping(_source_length)) else: sorted_batches = large_batches batches = Cache(sorted_batches, ConstantScheme(batch_size)) mask_sources = ('source_input_tokens', 'target_labels') masked_batches = Padding(batches, mask_sources=mask_sources) return masked_batches
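# _source_length (used by SortMapping above and in the second get_stream variant
# further down) lives elsewhere in the module; a plausible sketch, assuming it
# sorts read-ahead buffers by the length of the source sequence:
def _source_length(sentence_pair):
    """Sort key: number of tokens in the source sequence of the pair."""
    return len(sentence_pair[0])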
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data, dict_data, src_vocab_size=30000, trg_vocab_size=30000, unk_id=1, seq_len=50, batch_size=80, sort_k_batches=12, **kwargs): """Prepares the training data stream.""" # Load dictionaries and ensure special tokens exist ''' actual_src_vocab_num = len(src_vocab) actual_trg_vocab_num = len(trg_vocab) src_vocab = ensure_special_tokens( src_vocab if isinstance(src_vocab, dict) else cPickle.load(open(src_vocab)), bos_idx=0, eos_idx=(actual_src_vocab_num - 1) if actual_src_vocab_num - 3 < src_vocab_size else (src_vocab_size + 3 - 1), unk_idx=unk_id) trg_vocab = ensure_special_tokens( trg_vocab if isinstance(trg_vocab, dict) else cPickle.load(open(trg_vocab)), bos_idx=0, eos_idx=(actual_trg_vocab_num - 1) if actual_trg_vocab_num - 3 < trg_vocab_size else (trg_vocab_size + 3 - 1), unk_idx=unk_id) ''' src_vocab = ensure_special_tokens(src_vocab if isinstance(src_vocab, dict) else cPickle.load(open(src_vocab)), bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id) trg_vocab = ensure_special_tokens(trg_vocab if isinstance(trg_vocab, dict) else cPickle.load(open(trg_vocab)), bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id) # for example: # source: 第五 章 罚则 # target: chapter v penalty regulations # Get text files from both source and target src_dataset = TextFile([src_data], src_vocab) trg_dataset = TextFile([trg_data], trg_vocab) dict_dataset = TextFile([dict_data], trg_vocab) # for data in DataStream(src_dataset).get_epoch_iterator(): # print(data) # looks like: ([0, 1649, 1764, 7458, 29999],) # Merge them to get a source, target pair stream = Merge([ src_dataset.get_example_stream(), trg_dataset.get_example_stream(), dict_dataset.get_example_stream() ], ('source', 'target', 'dict')) # data_stream.sources = 'source' or 'target' ''' print 'init \n' num_before_filter = 0 for data in stream.get_epoch_iterator(): num_before_filter = num_before_filter + 1 # print(data) ''' # looks like: ([0, 1649, 1764, 7458, 29999], [0, 2662, 9329, 968, 200, 29999]) # Filter sequences that are too long # Neither source sentence or target sentence can beyond the length seq_len # the lenght include the start symbol <s> and the end symbol </s>, so the actual sentence # length can not beyond (seq_len - 2) stream = Filter(stream, predicate=_too_long(seq_len=seq_len)) ''' num_after_filter = 0 # print 'after filter ... \n' for data in stream.get_epoch_iterator(): num_after_filter = num_after_filter + 1 # print(data) logger.info('\tby filtering, sentence-pairs from {} to {}.'.format(num_before_filter, num_after_filter)) logger.info('\tfilter {} sentence-pairs whose source or target sentence exceeds {} words'.format( (num_before_filter - num_after_filter), seq_len)) ''' # Replace out of vocabulary tokens with unk token stream = Mapping(stream, _oov_to_unk(src_vocab_size=src_vocab_size, trg_vocab_size=trg_vocab_size, unk_id=unk_id)) # do not need ''' print 'after mapping unk ...' for data in stream.get_epoch_iterator(): print(data) ''' # still looks like: ([0, 1649, 1764, 7458, 29999], [0, 2662, 9329, 968, 200, 29999]) # Build a batched version of stream to read k batches ahead # do not sort on the whole training data, first split the training data into several blocks, # each block contain (batch_size*sort_k_batches) sentence-pairs, we juse sort in each block, # finally, i understand !!!!!!! # remainder stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size * sort_k_batches)) ''' print 'after sorted batch ... 
' for data in stream.get_epoch_iterator(): print(data) ''' # Sort all samples in the read-ahead batch # sort by the length of target sentence in (batch_size*sort_k_batches) # list for all training data, speed up stream = Mapping(stream, SortMapping(_length)) ''' print 'after sort ... ' for data in stream.get_epoch_iterator(): print(data) ''' # Convert it into a stream again stream = Unpack(stream) ''' print 'after unpack ... ' for data in stream.get_epoch_iterator(): print(data) ''' # still looks like: ([0, 1649, 1764, 7458, 29999], [0, 2662, 9329, 968, 200, 29999]) # remove the remainder ? # Construct batches from the stream with specified batch size stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size)) # after sort, each batch has batch_size sentence pairs ''' print 'after final batch ... ' i = 0 for data in stream.get_epoch_iterator(): i = i + 1 print(data) print 'batchs: ', i ''' # Pad sequences that are short masked_stream = PaddingWithEOS( stream, bos_idx=[0, 0, 0], eos_idx=[src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1]) # print 'after padding with mask ...' return masked_stream
def test_adds_batch_to_axis_labels(self):
    stream = DataStream(
        IterableDataset({'features': [1, 2, 3, 4, 5]},
                        axis_labels={'features': ('index',)}))
    transformer = Batch(stream, ConstantScheme(2), strictness=0)
    assert_equal(transformer.axis_labels, {'features': ('batch', 'index')})
def get_stream(self, part, batch_size=None, shuffle=False, max_length=None, raw_text=False, q_ids=False, seed=None, dataset=None): if not seed: seed = fuel.config.default_seed rng = numpy.random.RandomState(seed) if not dataset: dataset = self.get_dataset(part) if shuffle: stream = DataStream(dataset, iteration_scheme=ShuffledExampleScheme( dataset.num_examples, rng=rng)) else: stream = dataset.get_example_stream() if not q_ids: stream = FilterSources( stream, [source for source in dataset.sources if source != 'q_ids']) else: stream = SourcewiseMapping(stream, _str2vec, which_sources=('q_ids')) stream = PutTextTransfomer(stream, dataset, raw_text=True) # <eos> is added for two purposes: to serve a sentinel for coattention, # and also to ensure the answer span ends at a token eos = self.vocab.EOS stream = SourcewiseMapping(stream, functools.partial(add_eos, eos), which_sources=('contexts', 'questions')) stream = Mapping(stream, functools.partial(select_random_answer, rng), mapping_accepts=dict) if not batch_size: if self._retrieval: raise NotImplementedError() return stream if raw_text: stream = Mapping(stream, keep_text, mapping_accepts=dict, add_sources=('contexts_text', 'questions_text')) stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size)) if self._retrieval: stream = Mapping(stream, functools.partial(retrieve_and_pad_squad, self._retrieval), mapping_accepts=dict, add_sources=('defs', 'def_mask', 'contexts_def_map', 'questions_def_map')) stream = SourcewiseMapping(stream, functools.partial(digitize, self.vocab), which_sources=('contexts', 'questions')) stream = Padding(stream, mask_sources=['contexts', 'questions'] + (['contexts_text'] if raw_text else [])) return stream
def get_stream(source, target, source_dict, target_dict, batch_size, buffer_multiplier=100, n_words_source=0, n_words_target=0, max_src_length=None, max_trg_length=None): """Returns a stream over sentence pairs. Parameters ---------- source : list A list of files to read source languages from. target : list A list of corresponding files in the target language. source_dict : str Path to a tab-delimited text file whose last column contains the vocabulary. target_dict : str See `source_dict`. batch_size : int The minibatch size. buffer_multiplier : int The number of batches to load, concatenate, sort by length of source sentence, and split again; this makes batches more uniform in their sentence length and hence more computationally efficient. n_words_source : int The number of words in the source vocabulary. Pass 0 (default) to use the entire vocabulary. n_words_target : int See `n_words_source`. """ if len(source) != len(target): raise ValueError("number of source and target files don't match") # Read the dictionaries dicts = [ load_dict(source_dict, n_words=n_words_source), load_dict(target_dict, n_words=n_words_target) ] # Open the two sets of files and merge them streams = [ TextFile(source, dicts[0], bos_token=None, eos_token=EOS_TOKEN).get_example_stream(), TextFile(target, dicts[1], bos_token=None, eos_token=EOS_TOKEN).get_example_stream() ] merged = Merge(streams, ('source', 'target')) # Filter sentence lengths if max_src_length or max_trg_length: def filter_pair(pair): src, trg = pair src_ok = (not max_src_length) or len(src) < max_src_length trg_ok = (not max_trg_length) or len(trg) < max_trg_length return src_ok and trg_ok merged = Filter(merged, filter_pair) # Batches of approximately uniform size large_batches = Batch(merged, iteration_scheme=ConstantScheme(batch_size * buffer_multiplier)) sorted_batches = Mapping(large_batches, SortMapping(_source_length)) batches = Cache(sorted_batches, ConstantScheme(batch_size)) shuffled_batches = Shuffle(batches, buffer_multiplier) masked_batches = Padding(shuffled_batches) return masked_batches
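# A hypothetical invocation of get_stream; the file paths and sizes below are
# placeholders, not taken from the original project.
train_stream = get_stream(source=['data/train.src'],
                          target=['data/train.trg'],
                          source_dict='data/vocab.src.txt',
                          target_dict='data/vocab.trg.txt',
                          batch_size=80,
                          n_words_source=30000,
                          n_words_target=30000,
                          max_src_length=50,
                          max_trg_length=50)
for batch in train_stream.get_epoch_iterator(as_dict=True):
    # Padding adds 'source_mask' and 'target_mask' alongside 'source' and 'target'
    break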
logger.info('Saving the main loop...')
dump_manager = MainLoopDumpManager(save_location)
dump_manager.dump(main_loop)
logger.info('Saved')


if __name__ == "__main__":
    # Test
    cost = construct_model(50000, 256, 100, Tanh())
    vocabulary = get_vocabulary(50000)
    rare, freq = frequencies(vocabulary, 2000, 100)

    # Build training and validation datasets
    train_stream = Padding(
        Batch(Mapping(get_sentence_stream('training', [1], vocabulary),
                      add_frequency_all,
                      add_sources=("frequency_mask",)),
              iteration_scheme=ConstantScheme(64)))
    valid_stream = Padding(
        Batch(Mapping(get_sentence_stream('heldout', [1], vocabulary),
                      add_frequency_all,
                      add_sources=("frequency_mask",)),
              iteration_scheme=ConstantScheme(256)))
    valid_freq = Padding(
        Batch(Mapping(get_sentence_stream('heldout', [1], vocabulary),
                      add_frequency_mask(freq),
                      add_sources=("frequency_mask",)),
              iteration_scheme=ConstantScheme(256)))
    valid_rare = Padding(
def _get_vl_stream(src_vocab, trg_vocab, src_files, trg_files_list, encoding='UTF-8', preprocess=to_lower_case, src_vocab_size=30000, trg_vocab_size=30000, eos='</S>', eos_id=0, unk='<UNK>', unk_id=1, batch_size=80, sort_k_batches=12, **kwargs): """Prepares the validation/test data stream.""" src_dataset = TextFile(src_files, src_vocab, preprocess=preprocess, bos_token=None, eos_token=eos, unk_token=unk, encoding=encoding) trg_dataset_list = [ TextFile(trg_files, trg_vocab, preprocess=preprocess, bos_token=None, eos_token=None, unk_token=unk, encoding=encoding) for trg_files in trg_files_list ] src_data_stream = DataStream(src_dataset) trg_data_stream_list = [ DataStream(trg_dataset) for trg_dataset in trg_dataset_list ] # Replace out of vocabulary tokens with unk token if src_vocab_size < len(src_vocab): src_data_stream = Mapping( src_data_stream, _oov_to_unk(vocab_size=src_vocab_size, unk_id=unk_id)) if trg_vocab_size < len(trg_vocab): trg_data_stream_list = [ Mapping(trg_data_stream, _oov_to_unk(vocab_size=trg_vocab_size, unk_id=unk_id)) for trg_data_stream in trg_data_stream_list ] # Merge them to get a source, multiple references stream = Merge( [src_data_stream] + trg_data_stream_list, ('source', ) + tuple(['reference_%d' % i for i in range(len(trg_data_stream_list))])) # Build a batched version of stream to read k batches ahead stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size * sort_k_batches)) # Sort all samples in the read-ahead batch stream = Mapping(stream, SortMapping(_length(target_source_index=0))) # Convert it into a stream again stream = Unpack(stream) # TODO: create dynamic batches, larger batch size for shorter sentences, while smaller for longer sentence # Construct batches from the stream with specified batch size stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size)) # Pad sequences that are short stream = _PaddingWithToken(stream, eos_id, mask_sources=('source', )) return stream
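# The parameterised _length used with SortMapping here (and again in the
# get_tr_stream variant further down) is defined elsewhere in the module; a
# sketch, assuming it sorts by the length of the sequence at target_source_index:
class _length(object):
    def __init__(self, target_source_index=1):
        self.target_source_index = target_source_index

    def __call__(self, sentences):
        return len(sentences[self.target_source_index])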
        [x if x < trg_vocab_size else unk_id for x in sentence_pair[1]])


def too_long(sentence_pair):
    return all([len(sentence) < config['seq_len']
                for sentence in sentence_pair])


class CycleTextFile(TextFile):
    """This dataset cycles through the text files, reading a sentence from each."""
    def open(self):
        return chain.from_iterable(izip(*[chain.from_iterable(
            imap(open, repeat(f))) for f in self.files]))


en_dataset = CycleTextFile(en_files, cPickle.load(open(en_vocab)), None)
fr_dataset = CycleTextFile(fr_files, cPickle.load(open(fr_vocab)), None)
stream = Merge([en_dataset.get_example_stream(),
                fr_dataset.get_example_stream()],
               ('english', 'french'))

dev_dataset = TextFile([dev_file], cPickle.load(open(en_vocab)), None)
dev_stream = DataStream(dev_dataset)

filtered_stream = Filter(stream, predicate=too_long)
filtered_stream = Mapping(filtered_stream, _oov_to_unk)
batched_stream = Batch(filtered_stream,
                       iteration_scheme=ConstantScheme(config['batch_size']))
masked_stream = Padding(batched_stream)
lang=exp_config['target_lang'], meteor_directory=exp_config['meteor_directory']) # BLEU else: sampling_transformer = MMMTSampleStreamTransformer( sampling_func, sentence_level_bleu, num_samples=exp_config['n_samples']) training_stream = Mapping(training_stream, sampling_transformer, add_sources=('samples', 'scores')) # Build a batched version of stream to read k batches ahead training_stream = Batch( training_stream, iteration_scheme=ConstantScheme(exp_config['batch_size'] * exp_config['sort_k_batches'])) # TODO: add read-ahead shuffling Mapping similar to SortMapping # Sort all samples in the read-ahead batch training_stream = Mapping(training_stream, SortMapping(_length)) # Convert it into a stream again training_stream = Unpack(training_stream) # Construct batches from the stream with specified batch size training_stream = Batch(training_stream, iteration_scheme=ConstantScheme( exp_config['batch_size'])) # Pad sequences that are short
def main(mode, save_path, num_batches, data_path=None): reverser = WordReverser(100, len(char2code), name="reverser") if mode == "train": # Data processing pipeline dataset_options = dict(dictionary=char2code, level="character", preprocess=_lower) if data_path: dataset = TextFile(data_path, **dataset_options) else: dataset = OneBillionWord("training", [99], **dataset_options) data_stream = dataset.get_example_stream() data_stream = Filter(data_stream, _filter_long) data_stream = Mapping(data_stream, reverse_words, add_sources=("targets",)) data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10)) data_stream = Padding(data_stream) data_stream = Mapping(data_stream, _transpose) # Initialization settings reverser.weights_init = IsotropicGaussian(0.1) reverser.biases_init = Constant(0.0) reverser.push_initialization_config() reverser.encoder.weghts_init = Orthogonal() reverser.generator.transition.weights_init = Orthogonal() # Build the cost computation graph chars = tensor.lmatrix("features") chars_mask = tensor.matrix("features_mask") targets = tensor.lmatrix("targets") targets_mask = tensor.matrix("targets_mask") batch_cost = reverser.cost( chars, chars_mask, targets, targets_mask).sum() batch_size = named_copy(chars.shape[1], "batch_size") cost = aggregation.mean(batch_cost, batch_size) cost.name = "sequence_log_likelihood" logger.info("Cost graph is built") # Give an idea of what's going on model = Model(cost) params = model.get_params() logger.info("Parameters:\n" + pprint.pformat( [(key, value.get_value().shape) for key, value in params.items()], width=120)) # Initialize parameters for brick in model.get_top_bricks(): brick.initialize() # Define the training algorithm. cg = ComputationGraph(cost) algorithm = GradientDescent( cost=cost, params=cg.parameters, step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)])) # Fetch variables useful for debugging generator = reverser.generator (energies,) = VariableFilter( application=generator.readout.readout, name="output")(cg.variables) (activations,) = VariableFilter( application=generator.transition.apply, name=generator.transition.apply.states[0])(cg.variables) max_length = named_copy(chars.shape[0], "max_length") cost_per_character = named_copy( aggregation.mean(batch_cost, batch_size * max_length), "character_log_likelihood") min_energy = named_copy(energies.min(), "min_energy") max_energy = named_copy(energies.max(), "max_energy") mean_activation = named_copy(abs(activations).mean(), "mean_activation") observables = [ cost, min_energy, max_energy, mean_activation, batch_size, max_length, cost_per_character, algorithm.total_step_norm, algorithm.total_gradient_norm] for name, param in params.items(): observables.append(named_copy( param.norm(2), name + "_norm")) observables.append(named_copy( algorithm.gradients[param].norm(2), name + "_grad_norm")) # Construct the main loop and start training! average_monitoring = TrainingDataMonitoring( observables, prefix="average", every_n_batches=10) main_loop = MainLoop( model=model, data_stream=data_stream, algorithm=algorithm, extensions=[ Timing(), TrainingDataMonitoring(observables, after_batch=True), average_monitoring, FinishAfter(after_n_batches=num_batches) # This shows a way to handle NaN emerging during # training: simply finish it. 
.add_condition("after_batch", _is_nan), Plot(os.path.basename(save_path), [[average_monitoring.record_name(cost)], [average_monitoring.record_name(cost_per_character)]], every_n_batches=10), # Saving the model and the log separately is convenient, # because loading the whole pickle takes quite some time. Checkpoint(save_path, every_n_batches=500, save_separately=["model", "log"]), Printing(every_n_batches=1)]) main_loop.run() elif mode == "sample" or mode == "beam_search": chars = tensor.lmatrix("input") generated = reverser.generate(chars) model = Model(generated) logger.info("Loading the model..") model.set_param_values(load_parameter_values(save_path)) def generate(input_): """Generate output sequences for an input sequence. Incapsulates most of the difference between sampling and beam search. Returns ------- outputs : list of lists Trimmed output sequences. costs : list The negative log-likelihood of generating the respective sequences. """ if mode == "beam_search": samples, = VariableFilter( bricks=[reverser.generator], name="outputs")( ComputationGraph(generated[1])) # NOTE: this will recompile beam search functions # every time user presses Enter. Do not create # a new `BeamSearch` object every time if # speed is important for you. beam_search = BeamSearch(input_.shape[1], samples) outputs, costs = beam_search.search( {chars: input_}, char2code['</S>'], 3 * input_.shape[0]) else: _1, outputs, _2, _3, costs = ( model.get_theano_function()(input_)) outputs = list(outputs.T) costs = list(costs.T) for i in range(len(outputs)): outputs[i] = list(outputs[i]) try: true_length = outputs[i].index(char2code['</S>']) + 1 except ValueError: true_length = len(outputs[i]) outputs[i] = outputs[i][:true_length] costs[i] = costs[i][:true_length].sum() return outputs, costs while True: line = input("Enter a sentence\n") message = ("Enter the number of samples\n" if mode == "sample" else "Enter the beam size\n") batch_size = int(input(message)) encoded_input = [char2code.get(char, char2code["<UNK>"]) for char in line.lower().strip()] encoded_input = ([char2code['<S>']] + encoded_input + [char2code['</S>']]) print("Encoder input:", encoded_input) target = reverse_words((encoded_input,))[0] print("Target: ", target) samples, costs = generate( numpy.repeat(numpy.array(encoded_input)[:, None], batch_size, axis=1)) messages = [] for sample, cost in equizip(samples, costs): message = "({})".format(cost) message += "".join(code2char[code] for code in sample) if sample == target: message += " CORRECT!" messages.append((cost, message)) messages.sort(key=operator.itemgetter(0), reverse=True) for _, message in messages: print(message)
def get_tr_stream_predict(src_vocab, src_data, trg_data, src_vocab_size=30000, trg_vocab_size=30000, unk_id=1, seq_len=50, batch_size=80, sort_k_batches=12, **kwargs): """Prepares the training data stream.""" # Load dictionaries and ensure special tokens exist src_vocab = _ensure_special_tokens(src_vocab if isinstance( src_vocab, dict) else cPickle.load(open(src_vocab)), bos_idx=0, eos_idx=2, unk_idx=unk_id) # Get text files from both source and target src_dataset = TextFile([src_data], src_vocab, preprocess=get_unicode) trg_dataset = TextFile([trg_data], src_vocab, preprocess=get_unicode) #src_dataset = TextFile([src_data], src_vocab, None) #trg_dataset = TextFile([trg_data], trg_vocab, None) # Merge them to get a source, target pair stream = Merge( [src_dataset.get_example_stream(), trg_dataset.get_example_stream()], ('source', 'target')) # TODO thius was from today< #print(type(src_dataset.get_example_stream())) #print(type(src_dataset)) #print(list(src_dataset.get_example_stream().get_epoch_iterator())) #sys.exit(0) # Filter sequences that are too long stream = Filter(stream, predicate=_too_long(seq_len=seq_len)) # Replace out of vocabulary tokens with unk token stream = Mapping( stream, _oov_to_unk(src_vocab_size=src_vocab_size, trg_vocab_size=src_vocab_size, unk_id=unk_id)) # Build a batched version of stream to read k batches ahead stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size * sort_k_batches)) # Sort all samples in the read-ahead batch stream = Mapping(stream, SortMapping(_length)) # Convert it into a stream again stream = Unpack(stream) # Construct batches from the stream with specified batch size stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size)) # Pad sequences that are short masked_stream = PaddingWithEOS(stream, [src_vocab_size - 1, src_vocab_size - 1]) return masked_stream
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data, src_vocab_size=120, trg_vocab_size=120, unk_id=1, bos_token='<S>', seq_char_len=300, seq_word_len=50, batch_size=70, sort_k_batches=12, **kwargs): """Prepares the training data stream.""" # Load dictionaries and ensure special tokens exist src_vocab = _ensure_special_tokens(src_vocab if isinstance( src_vocab, dict) else pickle.load(open(src_vocab, 'rb')), bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id) trg_vocab = _ensure_special_tokens(trg_vocab if isinstance( trg_vocab, dict) else pickle.load(open(trg_vocab, 'rb')), bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id) # Get text files from both source and target src_dataset = TextFileWithSEOSS([src_data], src_vocab, None, level='character') trg_dataset = TextFileWithSEOSS([trg_data], trg_vocab, None, level='character') # Merge them to get a source, target pair stream = Merge( [src_dataset.get_example_stream(), trg_dataset.get_example_stream()], ('source', 'target')) # Filter sequences that are too long stream = Filter(stream, predicate=_too_long( unk_id=unk_id, space_idx=[src_vocab[' '], trg_vocab[' ']], seq_char_len=seq_char_len, seq_word_len=seq_word_len)) # Replace out of vocabulary tokens with unk token stream = Mapping( stream, _oov_to_unk(src_vocab_size=src_vocab_size, trg_vocab_size=trg_vocab_size, unk_id=unk_id)) # Build a batched version of stream to read k batches ahead stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size * sort_k_batches)) # Sort all samples in the read-ahead batch stream = Mapping(stream, SortMapping(_length)) # Convert it into a stream again stream = Unpack(stream) # Construct batches from the stream with specified batch size stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size)) # Pad sequences that are short masked_stream = PaddingWithEOS(stream, { 'source': src_vocab[' '], 'target': trg_vocab[' '] }, trg_vocab[bos_token], mask_dtype='int8') return masked_stream
def get_logprob_streams(config): if 'log_prob_sets' not in config: return None cgs = config['cgs'] enc_ids, dec_ids = get_enc_dec_ids(cgs) datasets = config['log_prob_sets'] # Prepare source vocabs and files, make sure special tokens are there src_vocabs = { k: cPickle.load(open(v)) for k, v in config['src_vocabs'].iteritems() } for k in src_vocabs.keys(): src_vocabs[k]['<S>'] = 0 src_vocabs[k]['</S>'] = config['src_eos_idxs'][k] src_vocabs[k]['<UNK>'] = config['unk_id'] # Prepare target vocabs and files, make sure special tokens are there trg_vocabs = { k: cPickle.load(open(v)) for k, v in config['trg_vocabs'].iteritems() } for k in trg_vocabs.keys(): trg_vocabs[k]['<S>'] = 0 trg_vocabs[k]['</S>'] = config['trg_eos_idxs'][k] trg_vocabs[k]['<UNK>'] = config['unk_id'] # Build the preprocessing pipeline for individual streams ind_streams = {} for cg in cgs: eid, did = p_(cg) if cg not in datasets: continue logger.info('Building logprob stream for cg:[{}]'.format(cg)) src_dataset = TextFile([datasets[cg][0]], src_vocabs[p_(cg)[0]], None) trg_dataset = TextFile([datasets[cg][1]], trg_vocabs[p_(cg)[1]], None) stream = Merge([ src_dataset.get_example_stream(), trg_dataset.get_example_stream() ], ('source', 'target')) stream = Mapping( stream, _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid], trg_vocab_size=config['trg_vocab_sizes'][did], unk_id=config['unk_id'])) bs = 100 if 'log_prob_bs' in config: if isinstance(config['log_prob_bs'], dict): bs = config['log_prob_bs'][cg] else: bs = config['log_prob_bs'] stream = Batch(stream, iteration_scheme=ConstantScheme(bs)) masked_stream = Padding(stream) masked_stream = Mapping( masked_stream, _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]), (2, 0, config['trg_eos_idxs'][did])])) ind_streams[cg] = masked_stream return ind_streams
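# _remapWordIdx (also used by the per-CG training stream builder above) is
# defined elsewhere; a sketch, assuming each (source_index, old_id, new_id)
# triple rewrites token ids in the corresponding padded array of the batch:
class _remapWordIdx(object):
    def __init__(self, rules):
        self.rules = rules

    def __call__(self, batch):
        batch = list(batch)
        for source_index, old_id, new_id in self.rules:
            batch[source_index][batch[source_index] == old_id] = new_id
        return tuple(batch)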
def test_strictness_2_error(self):
    stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))
    transformer = Batch(stream, ConstantScheme(2), strictness=2)
    assert_raises(ValueError, list, transformer.get_epoch_iterator())
def get_tr_stream_with_topicalq(src_vocab, trg_vocab, topical_vocab, src_data, trg_data, topical_data, src_vocab_size=30000, trg_vocab_size=30000, topical_vocab_size=2000, unk_id=1, seq_len=50, batch_size=80, sort_k_batches=12, **kwargs): """Prepares the training data stream.""" # Load dictionaries and ensure special tokens exist src_vocab = _ensure_special_tokens(src_vocab if isinstance( src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')), bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id) trg_vocab = _ensure_special_tokens(trg_vocab if isinstance( trg_vocab, dict) else cPickle.load(open(trg_vocab, 'rb')), bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id) topical_vocab = cPickle.load(open(topical_vocab, 'rb')) #not ensure special token. # Get text files from both source and target src_dataset = TextFile([src_data], src_vocab, None) trg_dataset = TextFile([trg_data], trg_vocab, None) topical_dataset = TextFile([topical_data], topical_vocab, None, None, '10') # Merge them to get a source, target pair stream = Merge([ src_dataset.get_example_stream(), trg_dataset.get_example_stream(), topical_dataset.get_example_stream() ], ('source', 'target', 'source_topical')) # Filter sequences that are too long stream = Filter(stream, predicate=_too_long(seq_len=seq_len)) # Replace out of vocabulary tokens with unk token # The topical part are not contained of it, check~ stream = Mapping( stream, _oov_to_unk(src_vocab_size=src_vocab_size, trg_vocab_size=trg_vocab_size, topical_vocab_size=topical_vocab_size, unk_id=unk_id)) # Build a batched version of stream to read k batches ahead stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size * sort_k_batches)) # Sort all samples in the read-ahead batch stream = Mapping(stream, SortMapping(_length)) # Convert it into a stream again stream = Unpack(stream) # Construct batches from the stream with specified batch size stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size)) # Pad sequences that are short masked_stream = PaddingWithEOS( stream, [src_vocab_size - 1, trg_vocab_size - 1, topical_vocab_size - 1]) return masked_stream
def test_value_error_on_request_none(self):
    stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))
    transformer = Batch(stream, ConstantScheme(2))
    assert_raises(ValueError, transformer.get_data, None)
def setup_model_and_stream(exp_config, source_vocab, target_vocab): # TODO: this line is a mess sample_model, theano_sampling_source_input, theano_sampling_context_input, train_encoder, \ train_decoder, generated = \ get_sampling_model_and_input(exp_config) trg_vocab = target_vocab trg_vocab_size = exp_config['trg_vocab_size'] src_vocab = source_vocab src_vocab_size = exp_config['src_vocab_size'] theano_sample_func = sample_model.get_theano_function() sampling_func = SampleFunc(theano_sample_func, trg_vocab) # TODO: move stream creation to nn_imt.stream # def get_textfile_stream(source_file=None, src_vocab=None, src_vocab_size=30000, # unk_id=1, bos_token=None): src_stream = get_textfile_stream( source_file=exp_config['src_data'], src_vocab=exp_config['src_vocab'], src_vocab_size=exp_config['src_vocab_size'], unk_id=exp_config['unk_id'], bos_token='<S>') trg_stream = get_textfile_stream( source_file=exp_config['trg_data'], src_vocab=exp_config['trg_vocab'], src_vocab_size=exp_config['trg_vocab_size'], unk_id=exp_config['unk_id'], bos_token='<S>') # text file stream training_stream = Merge([src_stream, trg_stream], ('source', 'target')) # Filter sequences that are too long (Note this may break) training_stream = Filter( training_stream, predicate=_too_long(seq_len=exp_config['seq_len'])) # Replace out of vocabulary tokens with unk token # TODO: doesn't the TextFile stream do this anyway? training_stream = Mapping( training_stream, _oov_to_unk(src_vocab_size=exp_config['src_vocab_size'], trg_vocab_size=exp_config['trg_vocab_size'], unk_id=exp_config['unk_id'])) # add in the prefix and suffix seqs # working: add the sample ratio logger.info('Sample ratio is: {}'.format(exp_config.get( 'sample_ratio', 1.))) training_stream = Mapping( training_stream, PrefixSuffixStreamTransformer( sample_ratio=exp_config.get('sample_ratio', 1.)), add_sources=('target_prefix', 'target_suffix')) training_stream = Mapping( training_stream, CopySourceAndTargetToMatchPrefixes(training_stream)) # changing stream.produces_examples is a little hack which lets us use Unpack to flatten training_stream.produces_examples = False # flatten the stream back out into (source, target, target_prefix, target_suffix) training_stream = Unpack(training_stream) # METEOR trg_ivocab = {v: k for k, v in trg_vocab.items()} # TODO: Implement smoothed BLEU # TODO: Implement first-word accuracy (bilingual language model) min_risk_score_func = exp_config.get('min_risk_score_func', 'bleu') if min_risk_score_func == 'meteor': sampling_transformer = IMTSampleStreamTransformer( sampling_func, sentence_level_meteor, num_samples=exp_config['n_samples'], trg_ivocab=trg_ivocab, lang=exp_config['target_lang'], meteor_directory=exp_config['meteor_directory']) elif min_risk_score_func == 'imt_f1': sampling_transformer = IMTSampleStreamTransformer( sampling_func, sentence_level_imt_f1, num_samples=exp_config['n_samples']) # BLEU is default else: sampling_transformer = IMTSampleStreamTransformer( sampling_func, sentence_level_bleu, num_samples=exp_config['n_samples']) training_stream = Mapping(training_stream, sampling_transformer, add_sources=('samples', 'seq_probs', 'scores')) # now filter out segments whose samples are too good or too bad training_stream = Filter(training_stream, predicate=filter_by_sample_score) # Now make a very big batch that we can shuffle # Build a batched version of stream to read k batches ahead shuffle_batch_size = exp_config['shuffle_batch_size'] training_stream = Batch( training_stream, 
iteration_scheme=ConstantScheme(shuffle_batch_size)) training_stream = ShuffleBatchTransformer(training_stream) # unpack it again training_stream = Unpack(training_stream) # Build a batched version of stream to read k batches ahead batch_size = exp_config['batch_size'] sort_k_batches = exp_config['sort_k_batches'] training_stream = Batch(training_stream, iteration_scheme=ConstantScheme(batch_size * sort_k_batches)) # Sort all samples in the read-ahead batch training_stream = Mapping(training_stream, SortMapping(_length)) # Convert it into a stream again training_stream = Unpack(training_stream) # Construct batches from the stream with specified batch size training_stream = Batch(training_stream, iteration_scheme=ConstantScheme(batch_size)) # IDEA: add a transformer which flattens the target samples before we add the mask flat_sample_stream = FlattenSamples(training_stream) expanded_source_stream = CopySourceAndPrefixNTimes( flat_sample_stream, n_samples=exp_config['n_samples']) # Pad sequences that are short # TODO: is it correct to blindly pad the target_prefix and the target_suffix? # Note: we shouldn't need to pad the seq_probs because there is only one per sequence # TODO: DEVELOPMENT HACK exp_config['suffix_length'] = 1 exp_config['truncate_sources'] = ['target_suffix'] configurable_padding_args = { 'suffix_length': exp_config.get('suffix_length', None), 'truncate_sources': exp_config.get('truncate_sources', []) } import ipdb ipdb.set_trace() masked_stream = PaddingWithEOS(expanded_source_stream, [ src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1 ], mask_sources=('source', 'target', 'target_prefix', 'target_suffix', 'samples'), **configurable_padding_args) return train_encoder, train_decoder, theano_sampling_source_input, theano_sampling_context_input, generated, masked_stream
def test_2d_sequences_error_on_unequal_shapes(self):
    stream = Batch(
        DataStream(
            IterableDataset([numpy.ones((3, 4)), 2 * numpy.ones((2, 3))])),
        ConstantScheme(2))
    assert_raises(ValueError, next, Padding(stream).get_epoch_iterator())
m = MorphGen(100, len(Globals.char2code))

dataset_options = dict(dictionary=Globals.char2code, level="word",
                       preprocess=_tokenise)
dataset = TextFile([f_train], **dataset_options)
data_stream = dataset.get_example_stream()

# Read examples and look up the right surface form
data_stream = Mapping(data_stream, morph_lookup, add_sources=("targets",))

# Read in 10 samples at a time
data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))

# Pad the examples
data_stream = Padding(data_stream)
data_stream = Mapping(data_stream, _transpose)

# Initialisation settings
m.weights_init = IsotropicGaussian(0.1)
m.biases_init = Constant(0.0)
m.push_initialization_config()
m.encoder.weights_init = Orthogonal()
m.generator.transition.weights_init = Orthogonal()

# Build the cost computation graph
chars = tensor.lmatrix("features")
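# _transpose is shared with the CoNLL and word-reversal snippets above; a
# minimal sketch, assuming it simply swaps the batch and time axes of every
# padded array so that sequences run down the columns, as the recurrent bricks
# expect:
def _transpose(data):
    return tuple(array.T for array in data)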
def get_tr_stream(src_vocab, trg_vocab, src_files, trg_files, encoding='UTF-8', preprocess=to_lower_case, src_vocab_size=30000, trg_vocab_size=30000, eos='</S>', eos_id=0, unk='<UNK>', unk_id=1, seq_len=50, batch_size=80, sort_k_batches=12, **kwargs): """Prepares the training data stream.""" src_dataset = TextFile(src_files, src_vocab, preprocess=preprocess, bos_token=None, eos_token=eos, unk_token=unk, encoding=encoding) trg_dataset = TextFile(trg_files, trg_vocab, preprocess=preprocess, bos_token=None, eos_token=eos, unk_token=unk, encoding=encoding) src_data_stream = DataStream(src_dataset) trg_data_stream = DataStream(trg_dataset) # Replace out of vocabulary tokens with unk token if src_vocab_size < len(src_vocab): src_data_stream = Mapping( src_data_stream, _oov_to_unk(vocab_size=src_vocab_size, unk_id=unk_id)) if trg_vocab_size < len(trg_vocab): trg_data_stream = Mapping( trg_data_stream, _oov_to_unk(vocab_size=trg_vocab_size, unk_id=unk_id)) # Merge them to get a source, target pair stream = Merge([src_data_stream, trg_data_stream], ('source', 'target')) # Filter sequences that are too long (either source or target) stream = Filter(stream, predicate=_too_long(seq_len=seq_len)) # Build a batched version of stream to read k batches ahead stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size * sort_k_batches)) # Sort all samples in the read-ahead batch stream = Mapping(stream, SortMapping(_length(target_source_index=1))) # Convert it into a stream again stream = Unpack(stream) # Construct batches from the stream with specified batch size stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size)) # Pad sequences that are short stream = _PaddingWithToken(stream, eos_id) # Attach one-hot ground truth data stream stream = Mapping(stream, _to_one_hot(target_source_index=2, vacabuary_size=trg_vocab_size), add_sources=("one_hot_ground_truth", )) return stream
def test_strictness_1(self):
    stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))
    transformer = Batch(stream, ConstantScheme(2), strictness=1)
    assert_equal(list(transformer.get_epoch_iterator()),
                 [(numpy.array([1, 2]),), (numpy.array([3, 4]),)])
def get_tr_stream_withContext_withPosTag(src_vocab, trg_vocab, ctx_datas, posTag_datas, ctx_num, src_data, trg_data, src_vocab_size=30000, trg_vocab_size=30000, unk_id=1, seq_len=50, batch_size=80, sort_k_batches=12, **kwargs): """Prepares the training data stream.""" # Load dictionaries and ensure special tokens exist src_vocab = _ensure_special_tokens(src_vocab if isinstance( src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')), bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id) trg_vocab = _ensure_special_tokens(trg_vocab if isinstance( trg_vocab, dict) else cPickle.load(open(trg_vocab, 'rb')), bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id) # Get text files from both source and target ctx_datasets = [] posTag_datasets = [] for i in range(ctx_num): ctx_datasets.append(TextFile([ctx_datas[i]], src_vocab, None)) posTag_datasets.append(TextFile([posTag_datas[i]], src_vocab, None)) posTag_datasets.append(TextFile([posTag_datas[ctx_num]], src_vocab, None)) src_dataset = TextFile([src_data], src_vocab, None) trg_dataset = TextFile([trg_data], trg_vocab, None) # Merge them to get a source, target pair stream = Merge( [i.get_example_stream() for i in ctx_datasets] + [i.get_example_stream() for i in posTag_datasets] + [src_dataset.get_example_stream(), trg_dataset.get_example_stream()], tuple('context_' + str(i) for i in range(ctx_num)) + tuple('context_posTag_' + str(i) for i in range(ctx_num)) + ('source_posTag', 'source', 'target')) # Filter sequences that are too long stream = Filter(stream, predicate=_too_long(seq_len=seq_len)) #Replace out of vocabulary tokens with unk token stream = Mapping( stream, _oov_to_unk_posTag(ctx_num=ctx_num, src_vocab_size=src_vocab_size, trg_vocab_size=trg_vocab_size, unk_id=unk_id)) # Build a batched version of stream to read k batches ahead stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size * sort_k_batches)) # Sort all samples in the read-ahead batch stream = Mapping(stream, SortMapping(_length)) # Convert it into a stream again stream = Unpack(stream) # Construct batches from the stream with specified batch size stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size)) # Pad sequences that are short masked_stream = PaddingWithEOSContext( stream, [src_vocab_size - 1 for i in range(2 * ctx_num + 2)] + [trg_vocab_size - 1]) return masked_stream
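# _oov_to_unk_posTag (also used by the grd test-stream variant above) is defined
# elsewhere; a sketch under the assumption that it applies the source-vocabulary
# cutoff to the 2 * ctx_num + 2 source-side sequences and the target-vocabulary
# cutoff to the final target sequence:
class _oov_to_unk_posTag(object):
    def __init__(self, ctx_num=3, src_vocab_size=30000,
                 trg_vocab_size=30000, unk_id=1):
        self.ctx_num = ctx_num
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.unk_id = unk_id

    def __call__(self, sentences):
        src_part = tuple(
            [x if x < self.src_vocab_size else self.unk_id for x in sentence]
            for sentence in sentences[:-1])
        trg_part = ([x if x < self.trg_vocab_size else self.unk_id
                     for x in sentences[-1]],)
        return src_part + trg_part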