def _get_align_stream(src_data, trg_data, src_vocab_size, trg_vocab_size,
                      seq_len, **kwargs):
    """Creates the stream which is used for the main loop.

    Args:
        src_data (string): Path to the source sentences
        trg_data (string): Path to the target sentences
        src_vocab_size (int): Size of the source vocabulary in the NMT model
        trg_vocab_size (int): Size of the target vocabulary in the NMT model
        seq_len (int): Maximum length of any source or target sentence

    Returns:
        ExplicitNext. Alignment data stream which can be iterated explicitly
    """
    # TextFile requires a dictionary, so fabricate identity vocabularies.
    dummy_src_vocab = _add_special_ids(
        {str(idx): idx for idx in xrange(src_vocab_size)})
    dummy_trg_vocab = _add_special_ids(
        {str(idx): idx for idx in xrange(trg_vocab_size)})
    src_text = TextFile([src_data], dummy_src_vocab, None)
    trg_text = TextFile([trg_data], dummy_trg_vocab, None)
    # Pair up source and target sentences, drop over-long pairs.
    paired = Merge([src_text.get_example_stream(),
                    trg_text.get_example_stream()],
                   ('source', 'target'))
    paired = Filter(paired, predicate=stream._too_long(seq_len=seq_len))
    # One sentence pair per batch, padded with EOS.
    paired = Batch(paired, iteration_scheme=ConstantScheme(1))
    padded = stream.PaddingWithEOS(paired, [utils.EOS_ID, utils.EOS_ID])
    return ExplicitNext(padded)
def get_dev_stream_with_context_features(val_context_features=None,
                                         val_set=None, src_vocab=None,
                                         src_vocab_size=30000, unk_id=1,
                                         **kwargs):
    """Setup development set stream if necessary.

    Pairs each validation sentence with a precomputed context-feature row.
    """

    def _load_features(path):
        # Features are stored as the first array of an .npz archive.
        return numpy.load(path)['arr_0']

    dev_stream = None
    if val_set is not None and src_vocab is not None:
        vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        text_dataset = TextFile([val_set], vocab, None)
        # Iterate the context features row-by-row alongside the text.
        features = _load_features(val_context_features)
        feature_stream = DataStream(IterableDataset(features))
        dev_stream = Merge([text_dataset.get_example_stream(),
                            feature_stream],
                           ('source', 'initial_context'))
    return dev_stream
def get_test_stream(sfiles, svocab_dict):
    """Build a batched, zero-padded stream over the source test files."""
    source = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                      unk_token='<unk>', level='word', preprocess=None,
                      encoding='utf8')
    # Single-source merge so the stream is named ('source',).
    test_stream = Merge([source.get_example_stream()], ('source',))
    # Batches of 10 sentences, padded to equal length.
    test_stream = Batch(test_stream, iteration_scheme=ConstantScheme(10))
    return Padding(test_stream)
def get_logprob_streams(config):
    """Build one padded log-probability stream per computation graph (cg).

    Returns None when config has no 'log_prob_sets'; otherwise a dict
    mapping each cg name to its masked (padded) stream.
    """
    if 'log_prob_sets' not in config:
        return None
    cgs = config['cgs']
    enc_ids, dec_ids = get_enc_dec_ids(cgs)
    datasets = config['log_prob_sets']

    # Prepare source vocabs and files, make sure special tokens are there
    src_vocabs = {k: cPickle.load(open(v))
                  for k, v in config['src_vocabs'].iteritems()}
    for k in src_vocabs.keys():
        src_vocabs[k]['<S>'] = 0
        src_vocabs[k]['</S>'] = config['src_eos_idxs'][k]
        src_vocabs[k]['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_vocabs = {k: cPickle.load(open(v))
                  for k, v in config['trg_vocabs'].iteritems()}
    for k in trg_vocabs.keys():
        trg_vocabs[k]['<S>'] = 0
        trg_vocabs[k]['</S>'] = config['trg_eos_idxs'][k]
        trg_vocabs[k]['<UNK>'] = config['unk_id']

    # Build the preprocessing pipeline for individual streams
    ind_streams = {}
    for cg in cgs:
        eid, did = p_(cg)
        # Skip cgs that have no log-prob dataset configured.
        if cg not in datasets:
            continue
        logger.info('Building logprob stream for cg:[{}]'.format(cg))
        src_dataset = TextFile([datasets[cg][0]], src_vocabs[p_(cg)[0]], None)
        trg_dataset = TextFile([datasets[cg][1]], trg_vocabs[p_(cg)[1]], None)
        stream = Merge([src_dataset.get_example_stream(),
                        trg_dataset.get_example_stream()],
                       ('source', 'target'))
        stream = Mapping(stream, _oov_to_unk(
            src_vocab_size=config['src_vocab_sizes'][eid],
            trg_vocab_size=config['trg_vocab_sizes'][did],
            unk_id=config['unk_id']))
        # Batch size defaults to 100; may be overridden globally or per-cg.
        bs = 100
        if 'log_prob_bs' in config:
            if isinstance(config['log_prob_bs'], dict):
                bs = config['log_prob_bs'][cg]
            else:
                bs = config['log_prob_bs']
        stream = Batch(stream, iteration_scheme=ConstantScheme(bs))
        masked_stream = Padding(stream)
        # Remap the padded zeros of source/target to their EOS indices.
        masked_stream = Mapping(
            masked_stream,
            _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                           (2, 0, config['trg_eos_idxs'][did])]))
        ind_streams[cg] = masked_stream
    return ind_streams
def get_src_trg_stream(cg, config, src_datasets=None, trg_datasets=None,
                       is_training=True, src_vocabs=None, trg_vocabs=None,
                       logprob_datasets=None):
    """Build the (source, target) stream for one computation graph.

    In training mode the pre-built datasets are filtered by length,
    OOV-mapped, length-sorted and batched; otherwise a log-probability
    stream is built from ``logprob_datasets``.  Both paths end with
    padding and EOS remapping.
    """
    eid, did = p_(cg)
    if is_training:
        logger.info(' ... src:[{}] - [{}]'.format(
            eid, src_datasets[cg].files[0]))
        logger.info(' ... trg:[{}] - [{}]'.format(
            did, trg_datasets[cg].files[0]))
        stream = Merge([src_datasets[cg].get_example_stream(),
                        trg_datasets[cg].get_example_stream()],
                       ('source', 'target'))
        # Length filtering: drop too-long pairs, optionally too-short ones.
        stream = Filter(stream,
                        predicate=_too_long(config['src_seq_len'],
                                            config['tgt_seq_len']))
        if 'min_seq_lens' in config and config['min_seq_lens'][cg] > 0:
            stream = Filter(stream,
                            predicate=_too_short(config['min_seq_lens'][cg]))
        stream = Mapping(stream, _oov_to_unk(
            src_vocab_size=config['src_vocab_sizes'][eid],
            trg_vocab_size=config['trg_vocab_sizes'][did],
            unk_id=config['unk_id']))
        # Read k batches ahead, sort by length, then re-batch.
        stream = Batch(
            stream, iteration_scheme=ConstantScheme(
                config['batch_sizes'][cg]*config['sort_k_batches']))
        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)
        stream = Batch(stream, iteration_scheme=ConstantScheme(
            config['batch_sizes'][cg]))
    else:  # logprob stream
        src_dataset = TextFile([logprob_datasets[cg][0]],
                               src_vocabs[p_(cg)[0]], None)
        trg_dataset = TextFile([logprob_datasets[cg][1]],
                               trg_vocabs[p_(cg)[1]], None)
        stream = Merge([src_dataset.get_example_stream(),
                        trg_dataset.get_example_stream()],
                       ('source', 'target'))
        stream = Mapping(stream, _oov_to_unk(
            src_vocab_size=config['src_vocab_sizes'][eid],
            trg_vocab_size=config['trg_vocab_sizes'][did],
            unk_id=config['unk_id']))
        # Batch size defaults to 100 unless overridden (globally or per-cg).
        bs = 100
        if 'log_prob_bs' in config:
            if isinstance(config['log_prob_bs'], dict):
                bs = config['log_prob_bs'][cg]
            else:
                bs = config['log_prob_bs']
        stream = Batch(stream, iteration_scheme=ConstantScheme(bs))
    # Pad both branches, then remap padded zeros to the per-language EOS.
    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream, _remapWordIdx(
            [(0, 0, config['src_eos_idxs'][eid]),
             (2, 0, config['trg_eos_idxs'][did])]))
    return masked_stream
def get_devtest_stream(data_type='valid', input_file=None, **kwards):
    """Build the dev/test source stream plus its grouped history streams.

    Returns a ``(dev_stream, dev_stream_hist_split)`` pair, where the
    second element is a tuple grouping the ``hist_len`` history
    sentences that belong to each source sentence.
    """
    if data_type == 'valid':
        data_file = kwards.pop('valid_src')
        data_file_hist = kwards.pop('valid_src_hist')
    elif data_type == 'test':
        if input_file is None:
            data_file = kwards.pop('test_src')
        else:
            data_file = input_file
        # added by Longyue
        data_file_hist = kwards.pop('test_src_hist')
    else:
        # BUGFIX: previously only logged, then crashed with NameError on
        # data_file below.  Fail fast with an explicit error instead.
        logger.error('wrong datatype, which must be one of valid or test')
        raise ValueError('wrong datatype, which must be one of valid or test')

    unk_token = kwards.pop('unk_token')
    eos_token = kwards.pop('eos_token')
    vocab_src = kwards.pop('vocab_src')
    dataset = TextFile(files=[data_file],
                       dictionary=pkl.load(open(vocab_src, 'rb')),
                       level='word', unk_token=unk_token,
                       bos_token=None, eos_token=eos_token)
    dev_stream = DataStream(dataset)

    # added by Longyue: history files are named <prefix>0, <prefix>1, ...
    hist_len = 3
    dev_stream_hist = []
    for idx in range(hist_len):
        dataset_hist = TextFile(files=[data_file_hist + str(idx)],
                                dictionary=pkl.load(open(vocab_src, 'rb')),
                                level='word', unk_token=unk_token,
                                bos_token=None, eos_token=eos_token)
        dev_stream_hist.append(DataStream(dataset_hist))

    # Flatten all history streams into one list of examples.
    dev_stream_hist_combine = []
    for d_s in dev_stream_hist:
        for item in d_s.get_epoch_iterator():
            dev_stream_hist_combine.append(item)

    # Regroup so example i gets its hist_len history sentences.
    # BUGFIX: use floor division explicitly so the arithmetic also works
    # under Python 3 (true-division) semantics.
    item_len = len(dev_stream_hist_combine)
    per_hist = item_len // hist_len
    dev_stream_hist_split = []
    for i in range(per_hist):
        tmp = []
        for j in range(hist_len):
            tmp.append(dev_stream_hist_combine[i + per_hist * j])
        dev_stream_hist_split.append(tmp)
    dev_stream_hist_split = tuple(dev_stream_hist_split)

    return dev_stream, dev_stream_hist_split
def get_stream(vocab, data, vocab_size, unk_id, eos_id, bos_id, noise=0):
    """Return a noise-injected example stream over one text file.

    Words are mapped to vocabulary indices (OOV words become the <UNK>
    index) and a </S> index is appended at the end; no <S> token is
    prepended (TODO: bos_id parameter useless).
    """
    vocab = get_vocab(vocab, vocab_size, unk_id, eos_id, bos_id)
    dataset = TextFile([data], vocab, None)
    noisy_stream = Mapping(dataset.get_example_stream(), _add_noise(noise))
    # Keep a handle to the dataset on the stream for backward-compatibility.
    noisy_stream.dataset = dataset
    return noisy_stream
def get_tr_stream(config):
    """Build one training stream per cg and wrap them in a MultiEncStream.

    NOTE(review): another, unrelated ``get_tr_stream`` appears later in
    this file; if both live in the same module the later definition wins.
    """
    cgs = config['cgs']
    enc_ids, dec_ids = get_enc_dec_ids(cgs)

    # Prepare source vocabs and files, make sure special tokens are there
    src_files = config['src_datas']
    src_vocabs = {
        k: cPickle.load(open(v))
        for k, v in config['src_vocabs'].iteritems()
    }
    for k in src_vocabs.keys():
        src_vocabs[k]['<S>'] = 0
        src_vocabs[k]['</S>'] = config['src_eos_idxs'][k]
        src_vocabs[k]['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_files = config['trg_datas']
    trg_vocabs = {
        k: cPickle.load(open(v))
        for k, v in config['trg_vocabs'].iteritems()
    }
    for k in trg_vocabs.keys():
        trg_vocabs[k]['<S>'] = 0
        trg_vocabs[k]['</S>'] = config['trg_eos_idxs'][k]
        trg_vocabs[k]['<UNK>'] = config['unk_id']

    # Create individual source streams
    src_datasets = {
        cg: TextFile([src_files[cg]], src_vocabs[p_(cg)[0]], None)
        for cg in cgs
    }

    # Create individial target streams
    trg_datasets = {
        cg: TextFile([trg_files[cg]], trg_vocabs[p_(cg)[1]], None)
        for cg in cgs
    }

    # Build the preprocessing pipeline for individual streams
    ind_streams = {}
    for cg in cgs:
        logger.info('Building training stream for cg:[{}]'.format(cg))
        masked_stream = get_src_trg_stream(cg, config, src_datasets,
                                           trg_datasets)
        ind_streams[cg] = masked_stream

    # Scheduler and meta-controller
    multi_enc_stream = MultiEncStream(ind_streams,
                                      schedule=config['schedule'],
                                      batch_sizes=config['batch_sizes'],
                                      transpose=True,
                                      start_after=config.get(
                                          'start_after', None))
    return multi_enc_stream
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""
    # Load dictionaries (given as a dict or a pickle path) and make sure
    # the special tokens exist.
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    src_text = TextFile([src_data], src_vocab, None)
    trg_text = TextFile([trg_data], trg_vocab, None)

    # Pair up source and target sentences.
    pipeline = Merge([src_text.get_example_stream(),
                      trg_text.get_example_stream()],
                     ('source', 'target'))
    # Drop over-long pairs and map out-of-vocabulary tokens to <UNK>.
    pipeline = Filter(pipeline, predicate=_too_long(seq_len=seq_len))
    pipeline = Mapping(pipeline,
                       _oov_to_unk(src_vocab_size=src_vocab_size,
                                   trg_vocab_size=trg_vocab_size,
                                   unk_id=unk_id))
    # Read k batches ahead, sort those examples by length, then re-batch.
    pipeline = Batch(pipeline,
                     iteration_scheme=ConstantScheme(
                         batch_size * sort_k_batches))
    pipeline = Mapping(pipeline, SortMapping(_length))
    pipeline = Unpack(pipeline)
    pipeline = Batch(pipeline, iteration_scheme=ConstantScheme(batch_size))
    # Pad short sequences with the EOS index.
    return PaddingWithEOS(pipeline,
                          [src_vocab_size - 1, trg_vocab_size - 1])
def get_tr_stream_single_score(src_vocab, src_data, trg_data,
                               src_vocab_size=30000, unk_id=1, seq_len=50,
                               batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream (single-vocabulary variant).

    Only a source vocabulary is supplied, so the target file is indexed
    with the same vocabulary.
    """
    # Load dictionary and ensure special tokens exist
    src_vocab = _ensure_special_tokens(src_vocab if isinstance(
        src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    # BUGFIX: trg_dataset was commented out but still referenced below,
    # which raised a NameError.  No target vocabulary is available in this
    # variant, so reuse the source vocabulary for the target file
    # (TODO(review): confirm against the caller's data).
    trg_dataset = TextFile([trg_data], src_vocab, None)
    # Merge them to get a source, target pair
    stream = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()], ('source', 'target'))
    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))
    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 unk_id=unk_id))
    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))
    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))
    # Convert it into a stream again
    stream = Unpack(stream)
    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    # Pad sequences that are short.  NOTE(review): sibling functions pass a
    # list of EOS indices here; verify PaddingWithEOS accepts a scalar.
    masked_stream = PaddingWithEOS(stream, src_vocab_size - 1)
    return masked_stream
def get_dev_stream(val_set=None, src_vocab=None, src_vocab_size=30000,
                   unk_id=1, bos_token=None, **kwargs):
    """Setup development set stream if necessary."""
    # Python 2: normalise the BOS token to unicode like the other tokens.
    if type(bos_token) is str:
        bos_token = bos_token.decode('utf8')
    dev_stream = None
    if val_set is not None and src_vocab is not None:
        vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        dev_dataset = TextFile([val_set], vocab,
                               bos_token=bos_token,
                               eos_token=u'</S>',
                               unk_token=u'<UNK>',
                               encoding='utf8')
        dev_stream = DataStream(dev_dataset)
    return dev_stream
def get_devtest_stream(data_type='valid', input_file=None, **kwards):
    """Build a plain, lower-cased DataStream over the valid/test source.

    ``data_type`` must be 'valid' or 'test'; for 'test', an explicit
    ``input_file`` overrides the configured test source.
    """
    if data_type == 'valid':
        data_file = kwards.pop('valid_src')
    elif data_type == 'test':
        if input_file is None:
            data_file = kwards.pop('test_src')
        else:
            data_file = input_file
    else:
        # BUGFIX: previously only logged, then crashed with NameError on
        # data_file below.  Fail fast with an explicit error instead.
        logger.error('wrong datatype, which must be one of valid or test')
        raise ValueError('wrong datatype, which must be one of valid or test')

    unk_token = kwards.pop('unk_token')
    eos_token = kwards.pop('eos_token')
    vocab_src = kwards.pop('vocab_src')
    dataset = TextFile(files=[data_file],
                       encoding='UTF-8',
                       preprocess=to_lower_case,
                       dictionary=pkl.load(open(vocab_src, 'rb')),
                       level='word', unk_token=unk_token,
                       bos_token=None, eos_token=eos_token)
    dev_stream = DataStream(dataset)

    return dev_stream
def get_log_prob_stream(cg, config):
    """Build the padded log-probability stream for one computation graph."""
    eid, did = p_(cg)
    dataset = config['log_prob_sets'][cg]

    # Prepare source vocabs and files, make sure special tokens are there
    src_vocab = cPickle.load(open(config['src_vocabs'][eid]))
    src_vocab['<S>'] = 0
    src_vocab['</S>'] = config['src_eos_idxs'][eid]
    src_vocab['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_vocab = cPickle.load(open(config['trg_vocabs'][did]))
    trg_vocab['<S>'] = 0
    trg_vocab['</S>'] = config['trg_eos_idxs'][did]
    trg_vocab['<UNK>'] = config['unk_id']

    # Build the preprocessing pipeline for individual streams
    logger.info('Building logprob stream for cg:[{}]'.format(cg))
    src_dataset = TextFile([dataset[0]], src_vocab, None)
    trg_dataset = TextFile([dataset[1]], trg_vocab, None)
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))
    stream = Mapping(stream, _oov_to_unk(
        src_vocab_size=config['src_vocab_sizes'][eid],
        trg_vocab_size=config['trg_vocab_sizes'][did],
        unk_id=config['unk_id']))
    # Batch size defaults to 100 unless overridden (globally or per-cg).
    bs = 100
    if 'log_prob_bs' in config:
        if isinstance(config['log_prob_bs'], dict):
            bs = config['log_prob_bs'][cg]
        else:
            bs = config['log_prob_bs']
    # num_examples pins the epoch length to the number of source lines.
    stream = Batch(
        stream, iteration_scheme=ConstantScheme(
            bs, num_examples=get_num_lines(dataset[0])))
    masked_stream = Padding(stream)
    # Remap the padded zeros of source/target to their EOS indices.
    masked_stream = Mapping(
        masked_stream, _remapWordIdx(
            [(0, 0, config['src_eos_idxs'][eid]),
             (2, 0, config['trg_eos_idxs'][did])]))
    return masked_stream
def get_sgnmt_tr_stream(src_data, trg_data, src_vocab_size=30000,
                        trg_vocab_size=30000, unk_id=1, seq_len=50,
                        batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the unshuffled training data stream. This corresponds to
    ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""
    # TextFile needs a dictionary, so fabricate identity vocabularies.
    dummy_src_vocab = add_special_ids(
        {str(i): i for i in xrange(src_vocab_size)})
    dummy_trg_vocab = add_special_ids(
        {str(i): i for i in xrange(trg_vocab_size)})
    src_text = TextFile([src_data], dummy_src_vocab, None)
    trg_text = TextFile([trg_data], dummy_trg_vocab, None)
    # Pair source with target sentences.
    pipeline = Merge([src_text.get_example_stream(),
                      trg_text.get_example_stream()],
                     ('source', 'target'))
    # Drop over-long pairs, then map out-of-range ids to UNK.
    pipeline = Filter(pipeline, predicate=stream._too_long(seq_len=seq_len))
    pipeline = Mapping(pipeline,
                       stream._oov_to_unk(src_vocab_size=src_vocab_size,
                                          trg_vocab_size=trg_vocab_size,
                                          unk_id=utils.UNK_ID))
    # Read k batches ahead, sort by length, then re-batch.
    pipeline = Batch(pipeline,
                     iteration_scheme=ConstantScheme(
                         batch_size * sort_k_batches))
    pipeline = Mapping(pipeline, SortMapping(stream._length))
    pipeline = Unpack(pipeline)
    pipeline = Batch(pipeline, iteration_scheme=ConstantScheme(batch_size))
    # Pad short sequences with EOS.
    return stream.PaddingWithEOS(pipeline, [utils.EOS_ID, utils.EOS_ID])
def get_dev_stream(val_set=None, src_vocab=None, src_vocab_size=30000,
                   unk_id=1, **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if val_set is not None and src_vocab is not None:
        # Load the dictionary (dict or pickle path) with special tokens.
        vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        dev_stream = DataStream(TextFile([val_set], vocab, None))
    return dev_stream
def get_train_stream(configuration, sfiles, tfiles, svocab_dict, tvocab_dict):
    """Build the sorted, batched, zero-padded training stream."""
    src_text = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                        unk_token='<unk>', level='word', preprocess=None,
                        encoding='utf8')
    trg_text = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,
                        unk_token='<unk>', level='word', preprocess=None,
                        encoding='utf8')
    # Merge: pair source and target sentences.
    train_stream = Merge([src_text.get_example_stream(),
                          trg_text.get_example_stream()],
                         ('source', 'target'))
    # Filter -- TODO: drop pairs exceeding the configured length.
    train_stream = Filter(
        train_stream,
        predicate=_too_long(seq_len=configuration['seq_len']))
    # Map - no need
    # Batch - Sort: read several batches ahead, sort by length, re-batch.
    train_stream = Batch(
        train_stream,
        iteration_scheme=ConstantScheme(
            configuration['batch_size'] * configuration['sort_k_batches']))
    train_stream = Mapping(train_stream, SortMapping(_length))
    train_stream = Unpack(train_stream)
    train_stream = Batch(
        train_stream,
        iteration_scheme=ConstantScheme(configuration['batch_size']))
    # Pad.  Note that </s>=0; Fuel only allows padding 0 by default.
    return Padding(train_stream)
def get_dev_stream(val_set=None, valid_sent_dict=None, src_vocab=None,
                   trg_vocab=None, src_vocab_size=30000,
                   trg_vocab_size=30000, unk_id=1, **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if val_set is not None and src_vocab is not None:
        # Load dictionaries (dict or pickle path) and pin special tokens.
        src_dict = ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_dict = ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
        source_text = TextFile([val_set], src_dict, None)
        dict_text = TextFile([valid_sent_dict], trg_dict, None)
        # Pair each source sentence with its dictionary sentence.
        dev_stream = Merge([source_text.get_example_stream(),
                            dict_text.get_example_stream()],
                           ('source', 'valid_sent_trg_dict'))
    return dev_stream
def _get_align_stream(src_data, trg_data, src_vocab_size, trg_vocab_size,
                      seq_len, **kwargs):
    """Creates the stream which is used for the main loop.

    Args:
        src_data (string): Path to the source sentences
        trg_data (string): Path to the target sentences
        src_vocab_size (int): Size of the source vocabulary in the NMT model
        trg_vocab_size (int): Size of the target vocabulary in the NMT model
        seq_len (int): Maximum length of any source or target sentence

    Returns:
        ExplicitNext. Alignment data stream which can be iterated explicitly
    """
    # Identity vocabularies keep TextFile happy with integer-coded text.
    src_ids = _add_special_ids({str(w): w for w in xrange(src_vocab_size)})
    trg_ids = _add_special_ids({str(w): w for w in xrange(trg_vocab_size)})
    src_text = TextFile([src_data], src_ids, None)
    trg_text = TextFile([trg_data], trg_ids, None)
    # Source/target pairs, length-filtered, one pair per padded batch.
    align_stream = Merge([src_text.get_example_stream(),
                          trg_text.get_example_stream()],
                         ('source', 'target'))
    align_stream = Filter(align_stream,
                          predicate=stream._too_long(seq_len=seq_len))
    align_stream = Batch(align_stream, iteration_scheme=ConstantScheme(1))
    padded = stream.PaddingWithEOS(align_stream,
                                   [utils.EOS_ID, utils.EOS_ID])
    return ExplicitNext(padded)
def get_dev_stream_with_topicalq(test_set=None, src_vocab=None, src_vocab_size=30000, topical_test_set=None, topical_vocab=None, topical_vocab_size=2000, unk_id=1, **kwargs): """Setup development set stream if necessary.""" dev_stream = None if test_set is not None and src_vocab is not None: src_vocab = _ensure_special_tokens(src_vocab if isinstance( src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')), bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id) print test_set, type(src_vocab) topical_vocab = cPickle.load(open(topical_vocab, 'rb')) #not ensure special token. topical_dataset = TextFile([topical_test_set], topical_vocab, None, None, '10') dev_dataset = TextFile([test_set], src_vocab, None) #dev_stream = DataStream(dev_dataset) # Merge them to get a source, target pair dev_stream = Merge([ dev_dataset.get_example_stream(), topical_dataset.get_example_stream() ], ('source', 'source_topical')) return dev_stream
def test_text():
    """Exercise TextFile at word and character level, incl. pickling."""
    # Test word level and epochs.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        sentences1 = f.name
        f.write("This is a sentence\n")
        f.write("This another one")
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        sentences2 = f.name
        f.write("More sentences\n")
        f.write("The last one")
    dictionary = {'<UNK>': 0, '</S>': 1, 'this': 2, 'a': 3, 'one': 4}
    text_data = TextFile(files=[sentences1, sentences2],
                         dictionary=dictionary, bos_token=None,
                         preprocess=lower)
    stream = DataStream(text_data)
    epoch = stream.get_epoch_iterator()
    # Two files with two sentences each -> four examples per epoch.
    assert len(list(epoch)) == 4
    epoch = stream.get_epoch_iterator()
    # Advance the iterator three examples before pickling it.
    for sentence in zip(range(3), epoch):
        pass
    f = BytesIO()
    cPickle.dump(epoch, f)
    sentence = next(epoch)
    f.seek(0)
    # A restored iterator must resume from where it was pickled.
    epoch = cPickle.load(f)
    assert next(epoch) == sentence
    assert_raises(StopIteration, next, epoch)

    # Test character level.
    dictionary = dict([(chr(ord('a') + i), i) for i in range(26)] +
                      [(' ', 26)] + [('<S>', 27)] +
                      [('</S>', 28)] + [('<UNK>', 29)])
    text_data = TextFile(files=[sentences1, sentences2],
                         dictionary=dictionary, preprocess=lower,
                         level="character")
    sentence = next(DataStream(text_data).get_epoch_iterator())[0]
    # "<S> t h ..." at the start and "... n e </S>" at the end.
    assert sentence[:3] == [27, 19, 7]
    assert sentence[-3:] == [2, 4, 28]
def get_textfile_stream(source_file=None, src_vocab=None,
                        src_vocab_size=30000, unk_id=1, bos_token=None):
    """Create a TextFile dataset from a single text file, and return a
    stream."""
    # Python 2: normalise the BOS token to unicode.
    if type(bos_token) is str:
        bos_token = bos_token.decode('utf8')
    vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    dataset = TextFile([source_file], vocab,
                       bos_token=bos_token,
                       eos_token=u'</S>',
                       unk_token=u'<UNK>',
                       encoding='utf8')
    return dataset.get_example_stream()
def get_sgnmt_dev_stream(val_set=None, src_vocab=None, src_vocab_size=30000,
                         unk_id=1, **kwargs):
    """Setup development set stream if necessary."""
    if val_set is None:
        return None
    # Identity vocabulary keeps TextFile happy with integer-coded input.
    identity_vocab = add_special_ids(
        {str(i): i for i in xrange(src_vocab_size)})
    return DataStream(TextFile([val_set], identity_vocab, None))
def get_stream(input_file, vocab_file, **kwards):
    """Return a word-level DataStream over ``input_file``."""
    unk_token = kwards.pop('unk_token')
    eos_token = kwards.pop('eos_token')
    vocabulary = pkl.load(open(vocab_file, 'rb'))
    dataset = TextFile(files=[input_file],
                       dictionary=vocabulary,
                       level='word',
                       unk_token=unk_token,
                       bos_token=None,
                       eos_token=eos_token)
    return DataStream(dataset)
def get_dev_stream_withContext(val_ctx_datas=None, val_set_source=None,
                               src_vocab=None, src_vocab_size=30000,
                               unk_id=1, ctx_num=3, **kwargs):
    """Setup development set stream if necessary.

    Merges ``ctx_num`` context sentence streams with the source stream,
    maps OOVs to UNK, and pads each single-example batch with EOS.
    """
    dev_stream = None
    if val_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        # Debug trace (Python 2 print statement).
        print val_set_source, type(src_vocab)
        # Get text files from both source and target
        ctx_datasets = []
        for i in range(ctx_num):
            ctx_datasets.append(TextFile([val_ctx_datas[i]], src_vocab, None))
        dev_dataset = TextFile([val_set_source], src_vocab, None)
        dev_stream = Merge([i.get_example_stream() for i in ctx_datasets] +
                           [dev_dataset.get_example_stream()],
                           tuple('context_' + str(i)
                                 for i in range(ctx_num)) + ('source',))
        stream = Mapping(
            dev_stream,
            _oov_to_unk_dev(ctx_num=ctx_num,
                            src_vocab_size=src_vocab_size,
                            unk_id=unk_id))
        # Build a batched version of stream to read k batches ahead
        stream = Batch(stream, iteration_scheme=ConstantScheme(1))
        masked_stream = PaddingWithEOSContext(
            stream, [src_vocab_size - 1 for i in range(ctx_num + 1)])
    # NOTE(review): masked_stream is undefined when the inputs are None;
    # the dev_stream = None guard suggests returning dev_stream was the
    # original intent — confirm before relying on the None path.
    return masked_stream
def get_test_stream(src_vocab, trg_vocab, src_data, trg_data=None,
                    src_vocab_size=30000, unk_id=1, seq_len=50,
                    batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the test data stream (=no batches or gold labels)."""
    print('streaming...')
    # Load dictionaries (dict or pickle path); EOS index is fixed at 2.
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=2, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=2, unk_idx=unk_id)
    # Read both files, normalising the text to unicode.
    src_dataset = TextFile([src_data], src_vocab, preprocess=get_unicode)
    print(src_data)
    trg_dataset = TextFile([trg_data], trg_vocab, preprocess=get_unicode)
    # Pair the two unbatched streams.
    return Merge([DataStream(src_dataset), DataStream(trg_dataset)],
                 ('source', 'target'))
def _get_sgnmt_dev_stream(val_set=None, src_vocab=None,
                          src_vocab_size=30000, **kwargs):
    """Setup development set stream if necessary.

    The arguments to this method are given by the configuration dict.
    """
    if val_set is None:
        return None
    # Dummy identity vocabulary to make TextFile happy.
    identity_vocab = _add_special_ids(
        {str(i): i for i in xrange(src_vocab_size)})
    return DataStream(TextFile([val_set], identity_vocab, None))
def get_tst_stream(val_set=None, src_vocab=None, src_vocab_size=30000,
                   unk_id=1, **kwargs):
    """Set up the test stream when a test set and vocabulary are given."""
    tst_stream = None
    if val_set is not None and src_vocab is not None:
        # Load dictionary (dict or pickle path) and pin special tokens.
        vocab = ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        tst_stream = DataStream(TextFile([val_set], vocab, None))
    return tst_stream
def get_tr_stream_unsorted(src_vocab, trg_vocab, src_data, trg_data,
                           src_vocab_size=30000, trg_vocab_size=30000,
                           unk_id=1, seq_len=50, batch_size=80,
                           sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""
    # Load dictionaries (dict or pickle path) and pin the special tokens.
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
    src_text = TextFile([src_data], src_vocab, None)
    trg_text = TextFile([trg_data], trg_vocab, None)
    # Pair source with target, drop over-long pairs, map OOVs to UNK.
    pipeline = Merge([src_text.get_example_stream(),
                      trg_text.get_example_stream()],
                     ('source', 'target'))
    pipeline = Filter(pipeline, predicate=_too_long(seq_len=seq_len))
    pipeline = Mapping(pipeline,
                       _oov_to_unk(src_vocab_size=src_vocab_size,
                                   trg_vocab_size=trg_vocab_size,
                                   unk_id=unk_id))
    # One sentence pair per batch; no length-sorting in this variant.
    pipeline = Batch(pipeline, iteration_scheme=ConstantScheme(1))
    # Pad short sequences with the EOS index.
    return PaddingWithEOS(pipeline,
                          [src_vocab_size - 1, trg_vocab_size - 1])
def get_sgnmt_tr_stream(src_data, trg_data, src_vocab_size=30000, trg_vocab_size=30000, unk_id=1, seq_len=50, batch_size=80, sort_k_batches=12, **kwargs): """Prepares the unshuffled training data stream. This corresponds to ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the blocks examples.""" # Build dummy vocabulary to make TextFile happy src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)}) trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)}) # Get text files from both source and target src_dataset = TextFile([src_data], src_vocab, None) trg_dataset = TextFile([trg_data], trg_vocab, None) # Merge them to get a source, target pair s = Merge( [src_dataset.get_example_stream(), trg_dataset.get_example_stream()], ('source', 'target')) # Filter sequences that are too long s = Filter(s, predicate=stream._too_long(seq_len=seq_len)) # Replace out of vocabulary tokens with unk token s = Mapping( s, stream._oov_to_unk(src_vocab_size=src_vocab_size, trg_vocab_size=trg_vocab_size, unk_id=utils.UNK_ID)) # Build a batched version of stream to read k batches ahead s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches)) # Sort all samples in the read-ahead batch s = Mapping(s, SortMapping(stream._length)) # Convert it into a stream again s = Unpack(s) # Construct batches from the stream with specified batch size s = Batch(s, iteration_scheme=ConstantScheme(batch_size)) # Pad sequences that are short masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID]) return masked_stream
def get_dev_streams(config):
    """Setup development set stream if necessary."""
    dev_streams = {}
    for cg in config['cgs']:
        # Only cgs with a configured validation set get a stream.
        if 'val_sets' not in config or cg not in config['val_sets']:
            continue
        logger.info('Building development stream for cg:[{}]'.format(cg))
        eid = p_(cg)[0]
        dev_file = config['val_sets'][cg]
        # Load the source dictionary and pin the special token indices.
        dictionary = cPickle.load(open(config['src_vocabs'][eid]))
        dictionary['<S>'] = 0
        dictionary['<UNK>'] = config['unk_id']
        dictionary['</S>'] = config['src_eos_idxs'][eid]
        dev_streams[cg] = DataStream(TextFile([dev_file], dictionary, None))
    return dev_streams
def get_dev_stream_with_grdTruth(val_set_source=None, val_set_target=None,
                                 src_vocab=None, src_vocab_size=30000,
                                 trg_vocab=None, trg_vocab_size=30000,
                                 batch_size=128, unk_id=1, seq_len=50,
                                 **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if val_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(src_vocab if isinstance(
            src_vocab, dict) else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(trg_vocab if isinstance(
            trg_vocab, dict) else cPickle.load(open(trg_vocab, 'rb')),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
        # Debug trace (Python 2 print statement).
        print val_set_source, type(src_vocab)
        dev_dataset = TextFile([val_set_source], src_vocab, None)
        trg_dataset = TextFile([val_set_target], trg_vocab, None)
        # Merge them to get a source, target pair
        dev_stream = Merge([dev_dataset.get_example_stream(),
                            trg_dataset.get_example_stream()],
                           ('dev_source', 'dev_target'))
        # Filter sequences that are too long
        stream = Filter(dev_stream, predicate=_too_long(seq_len=seq_len))
        # Replace out of vocabulary tokens with unk token
        stream = Mapping(stream,
                         _oov_to_unk(src_vocab_size=src_vocab_size,
                                     trg_vocab_size=trg_vocab_size,
                                     unk_id=unk_id))
        # Build a batched version of stream to read k batches ahead
        stream = Batch(stream, iteration_scheme=ConstantScheme(1))
        # Pad sequences that are short
        masked_stream = PaddingWithEOS(
            stream, [src_vocab_size - 1, trg_vocab_size - 1])
    # NOTE(review): masked_stream is undefined when the inputs are None;
    # the dev_stream = None guard suggests returning dev_stream was the
    # original intent — confirm before relying on the None path.
    return masked_stream
def get_log_prob_stream(cg, config):
    """Build the padded, batched stream used to score log-probabilities.

    Loads the source/target vocabularies for the given cg, merges the two
    text files named in config['log_prob_sets'][cg], maps OOV ids to the
    unk id, batches, pads, and remaps the padding word ids to the
    configured EOS indices.
    """
    eid, did = p_(cg)
    files = config['log_prob_sets'][cg]

    def _load_vocab(path, eos_idx):
        # Unpickle a vocabulary and pin the special-token ids.
        voc = cPickle.load(open(path))
        voc['<S>'] = 0
        voc['</S>'] = eos_idx
        voc['<UNK>'] = config['unk_id']
        return voc

    src_vocab = _load_vocab(config['src_vocabs'][eid],
                            config['src_eos_idxs'][eid])
    trg_vocab = _load_vocab(config['trg_vocabs'][did],
                            config['trg_eos_idxs'][did])

    # Build the preprocessing pipeline for individual streams
    logger.info('Building logprob stream for cg:[{}]'.format(cg))
    src_ds = TextFile([files[0]], src_vocab, None)
    trg_ds = TextFile([files[1]], trg_vocab, None)
    merged = Merge(
        [src_ds.get_example_stream(), trg_ds.get_example_stream()],
        ('source', 'target'))
    merged = Mapping(
        merged,
        _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid],
                    trg_vocab_size=config['trg_vocab_sizes'][did],
                    unk_id=config['unk_id']))

    # Batch size: a per-cg dict, a single global value, or the default.
    bs = config.get('log_prob_bs', 100)
    if isinstance(bs, dict):
        bs = bs[cg]

    batched = Batch(merged,
                    iteration_scheme=ConstantScheme(
                        bs, num_examples=get_num_lines(files[0])))
    padded = Padding(batched)
    # Replace the zero padding ids with the proper EOS ids on both sides.
    return Mapping(
        padded,
        _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                       (2, 0, config['trg_eos_idxs'][did])]))
def get_dev_stream_with_prefixes(val_set=None, val_set_grndtruth=None,
                                 src_vocab=None, src_vocab_size=30000,
                                 trg_vocab=None, trg_vocab_size=30000,
                                 unk_id=1, return_vocab=False, **kwargs):
    """Setup development set stream if necessary.

    Produces a flat (source, target, target_prefix, target_suffix)
    example stream for validation; optionally also returns the processed
    source and target vocabularies.
    """
    dev_stream = None
    if val_set is not None and val_set_grndtruth is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        # Both sides share the same special-token spelling.
        token_kwargs = dict(bos_token='<S>', eos_token='</S>',
                            unk_token='<UNK>')
        source_ds = TextFile([val_set], src_vocab, **token_kwargs)
        target_ds = TextFile([val_set_grndtruth], trg_vocab, **token_kwargs)

        dev_stream = Merge([source_ds.get_example_stream(),
                            target_ds.get_example_stream()],
                           ('source', 'target'))

        # Attach sampled prefix/suffix views of the target sequence.
        dev_stream = Mapping(
            dev_stream,
            PrefixSuffixStreamTransformer(
                sample_ratio=kwargs.get('dev_sample_ratio', 1.)),
            add_sources=('target_prefix', 'target_suffix'))
        dev_stream = Mapping(dev_stream,
                             CopySourceAndTargetToMatchPrefixes(dev_stream))

        # Hack: marking the stream as batch-like lets Unpack flatten it
        # back into single (source, target, prefix, suffix) examples.
        dev_stream.produces_examples = False
        dev_stream = Unpack(dev_stream)

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    return dev_stream
def get_dev_stream(sfiles, tfiles, svocab_dict, tvocab_dict):
    """Build a padded (source, target) development stream.

    Word-level text files on both sides, no BOS/EOS tokens, unknown
    words mapped to '<unk>'.  Batches of 1006 examples are zero-padded
    (</s> is id 0, which is what Fuel pads with by default).
    """
    def _word_dataset(files, vocab):
        # Shared TextFile options for both language sides.
        return TextFile(files, vocab, bos_token=None, eos_token=None,
                        unk_token='<unk>', level='word',
                        preprocess=None, encoding='utf8')

    s_dataset = _word_dataset(sfiles, svocab_dict)
    t_dataset = _word_dataset(tfiles, tvocab_dict)

    # Merge the two example streams into (source, target) pairs.
    paired = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))

    # Batch, then pad: </s>=0, so Fuel's default zero padding is the EOS.
    batched = Batch(paired, iteration_scheme=ConstantScheme(1006))
    return Padding(batched)
def get_test_stream(test_set=None, src_vocab=None, trg_vocab=None,
                    src_vocab_size=200000, trg_vocab_size=6540, unk_id=1,
                    sort_k_batches=12, trg_test_set='./data/test.zh'):
    """Prepares the testing data stream.

    Args:
        test_set: path to the source-side test file.
        src_vocab / trg_vocab: vocabulary dicts or pickle paths.
        src_vocab_size / trg_vocab_size: sizes used to pin the EOS id.
        unk_id: id of the unknown token.
        sort_k_batches: number of batches to read ahead before unpacking.
        trg_test_set: path to the target-side test file.  Previously this
            was hard-coded to './data/test.zh'; it is now a parameter
            with the same default, so existing callers are unaffected.

    Returns:
        An EOS-padded stream of (source, target) batches of size 1.
    """
    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([test_set], src_vocab, None)
    trg_dataset = TextFile([trg_test_set], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge(
        [src_dataset.get_example_stream(),
         trg_dataset.get_example_stream()],
        ('source', 'target'))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream, _oov_to_unk())

    # Read k batches ahead, then unpack back into single examples
    stream = Batch(stream, iteration_scheme=ConstantScheme(sort_k_batches))
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(1))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1])
    return masked_stream
def _get_text_stream(src_data, trg_data, src_vocab_size=30000,
                     trg_vocab_size=30000, **kwargs):
    """Creates a parallel data stream from two text files without random
    access.

    This stream cannot be used with reshuffling. The arguments to this
    method are given by the configuration dict.
    """
    example_streams = []
    # TextFile needs a vocabulary; the corpora are already integer-encoded,
    # so a dummy identity mapping (plus the special ids) is enough.
    for path, vocab_size in ((src_data, src_vocab_size),
                             (trg_data, trg_vocab_size)):
        dummy_vocab = _add_special_ids(
            dict((str(i), i) for i in xrange(vocab_size)))
        dataset = TextFile([path], dummy_vocab, None)
        example_streams.append(dataset.get_example_stream())

    # Merge them to get a source, target pair
    return Merge(example_streams, ('source', 'target'))
class _too_long(object):
    """Filter predicate: keep a sentence pair only if both sides are
    short enough (length <= seq_len)."""

    def __init__(self, seq_len=50):
        # Maximum allowed sentence length in tokens, inclusive.
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        # True when every sentence in the pair fits within seq_len.
        return all([len(sentence) <= self.seq_len
                    for sentence in sentence_pair])


# NOTE(review): the statements below reference a `config` that is not
# defined at this level; they look like a fragment of a training-stream
# builder whose enclosing scope is not visible here — confirm against
# the full file.
fi_vocab = config['src_vocab']
en_vocab = config['trg_vocab']
fi_file = config['src_data']
en_file = config['trg_data']

# Source/target text files with their pickled vocabularies.
fi_dataset = TextFile([fi_file], cPickle.load(open(fi_vocab)), None)
en_dataset = TextFile([en_file], cPickle.load(open(en_vocab)), None)

stream = Merge([fi_dataset.get_example_stream(),
                en_dataset.get_example_stream()],
               ('source', 'target'))

# Drop pairs where either side exceeds seq_len.
stream = Filter(stream, predicate=_too_long(config['seq_len']))

# Map out-of-vocabulary ids to the unk id on both sides.
stream = Mapping(stream, _oov_to_unk(
    src_vocab_size=config['src_vocab_size'],
    trg_vocab_size=config['trg_vocab_size'],
    unk_id=config['unk_id']))

# Read k batches at once so examples can be length-sorted downstream.
stream = Batch(stream, iteration_scheme=ConstantScheme(
    config['batch_size']*config['sort_k_batches']))
def main(config, tr_stream, dev_stream, use_bokeh=False):
    """Build and train the RNN encoder-decoder NMT model, then greedily
    decode the sentences in "inputstringfile.cs".

    Args:
        config (dict): experiment configuration (vocab sizes, dims,
            regularization, monitoring frequencies, data paths, ...).
        tr_stream: training data stream fed to the main loop / sampler.
        dev_stream: development stream used by the BLEU validator.
        use_bokeh (bool): enable live cost plotting if bokeh is available.
    """
    print("~def main")
    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')
    print("~sampling_input = tensor.lmatrix")

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2)
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)
    print("~source_sentence_mask, target_sentence, target_sentence_mask")

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)
    print("~ComputationGraph")

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    # Recurrent transitions get orthogonal initialization.
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()
    print("~decoder.initialize()")

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])
        print("~cg = apply_dropout")

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(decoder.state_init).get_params().values()
        cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff'])
        print("~cg = apply_noise")

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info(' {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))
    print("~logger.info")

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info(' {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)
    print("~training_model")

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]
    print("~every_n_batches=config")

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input,
                                     sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

        sample = Sampler(model=search_model, data_stream=tr_stream,
                         hook_samples=config['hook_samples'],
                         every_n_batches=config['sampling_freq'],
                         src_vocab_size=config['src_vocab_size'])

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(sample)

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot('Cs-En', channels=[['decoder_cost_cost']],
                 after_batch=True))

    # NOTE(review): `search_model` is only bound inside the
    # hook_samples/bleu_script branch above — this line raises NameError
    # when neither is enabled; confirm config always enables one of them.
    sampling_fn = search_model.get_theano_function()
    print(" - - - - - - - - - - - - - - " )

    # Decoding setup: rebuild the vocabularies and an input pipeline for
    # a fixed file of sentences to translate.
    sort_k_batches = 12
    batch_size = 80
    seq_len = 50
    trg_ivocab = None
    src_vocab_size = config['src_vocab_size']
    trg_vocab_size = config['trg_vocab_size']
    unk_id = config['unk_id']
    src_vocab = config['src_vocab']
    trg_vocab = config['trg_vocab']
    src_vocab = ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
    if not trg_ivocab:
        # Inverse target vocabulary: id -> token, for printing samples.
        trg_ivocab = {v: k for k, v in trg_vocab.items()}
    src_data = config['src_data']
    trg_data = config['trg_data']
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    inputstringfile = "inputstringfile.cs"
    input_dataset = TextFile([inputstringfile], src_vocab, None)

    # NOTE(review): stream2..stream7 are built but never consumed below;
    # only `input_stream` is actually iterated.
    stream = Merge([input_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))
    stream2 = Filter(stream, predicate=_too_long(seq_len=seq_len))
    stream3 = Mapping(stream2,
                      _oov_to_unk(src_vocab_size=src_vocab_size,
                                  trg_vocab_size=trg_vocab_size,
                                  unk_id=unk_id))
    stream4 = Batch(stream3,
                    iteration_scheme=ConstantScheme(
                        batch_size*sort_k_batches))
    stream5 = Mapping(stream4, SortMapping(_length))
    stream6 = Unpack(stream5)
    stream7 = Batch(stream6,
                    iteration_scheme=ConstantScheme(batch_size))

    input_stream = DataStream(input_dataset)
    print("dev_stream : ", type(dev_stream))
    print("input_stream : ", type(input_stream))
    epochone = input_stream.get_epoch_iterator()
    # NOTE(review): vocab/unk_sym/eos_sym are read but unused afterwards.
    vocab = input_stream.dataset.dictionary
    unk_sym = input_stream.dataset.unk_token
    eos_sym = input_stream.dataset.eos_token

    # Decode each input sentence with the sampling function and print
    # the resulting token sequence.
    for i, line in enumerate(epochone):
        seq = oov_to_unk(line[0], config['src_vocab_size'], unk_id)
        input_ = numpy.tile(seq, (1, 1))
        print("seq : ", type(seq), seq)
        print("input_ : ", type(input_), input_,
              inspect.getmembers(input_))
        _1, outputs, _2, _3, costs = (sampling_fn(input_))
        outputs = outputs.flatten()
        costs = costs.T
        print(" outputs : ", outputs, type(outputs))
        print("idx_to_word: ", idx_to_word(outputs, trg_ivocab))
        print(" - - - - - - - - - - - - - - " )
def main(mode, save_path, steps, num_batches, load_params):
    """Character-level language model on WSJ text.

    Modes:
        'continue' - resume a previous run from its checkpoint.
        'sample'   - load a trained model and print a generated string.
        'train'    - build the GatedRecurrent generator and train it.
        'evaluate' - word-level perplexity over a lexicon.
    """
    # NOTE(review): list(range(10)) puts the *ints* 0..9 in the alphabet,
    # not the digit characters '0'..'9' — character-level lookups of
    # digits would miss; confirm this is intended.
    chars = (list(string.ascii_uppercase) + list(range(10)) +
             [' ', '.', ',', '\'', '"', '!', '?', '<UNK>'])
    char_to_ind = {char: i for i, char in enumerate(chars)}
    ind_to_char = {v: k for k, v in char_to_ind.iteritems()}

    train_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_train'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')
    valid_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_valid'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')

    vocab_size = len(char_to_ind)
    logger.info('Dictionary size: {}'.format(vocab_size))

    if mode == 'continue':
        continue_training(save_path)
        return
    elif mode == "sample":
        # Load a trained main loop and sample one sequence of `steps`
        # characters from its generator brick.
        main_loop = load(open(save_path, "rb"))
        generator = main_loop.model.get_top_bricks()[-1]

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]
        print("".join([ind_to_char[s] for s in outputs]))

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        # Unigram frequencies of the sampled characters.
        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()

        # Bigram transition frequencies, row-normalized.
        trans_freqs = numpy.zeros((vocab_size, vocab_size), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        return

    # Experiment configuration
    batch_size = 20
    dim = 650
    feedback_dim = 650

    valid_stream = valid_dataset.get_example_stream()
    valid_stream = Batch(valid_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    valid_stream = Padding(valid_stream)
    valid_stream = Mapping(valid_stream, _transpose)

    # Build the bricks and initialize them
    transition = GatedRecurrent(name="transition", dim=dim,
                                activation=Tanh())
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size,
                source_names=transition.apply.states,
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(
                    vocab_size, feedback_dim, name='feedback'),
                name="readout"),
        transition,
        weights_init=Uniform(std=0.04), biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
    transition.push_initialization_config()
    generator.initialize()

    # Build the cost computation graph.
    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    cost_matrix = generator.cost_matrix(features, mask=features_mask)
    batch_cost = cost_matrix.sum()
    # Per-sequence negative log-likelihood.
    cost = aggregation.mean(batch_cost, features.shape[1])
    cost.name = "sequence_log_likelihood"
    # Per-character negative log-likelihood.
    char_cost = aggregation.mean(batch_cost, features_mask.sum())
    char_cost.name = 'character_log_likelihood'
    ppl = 2 ** (cost / numpy.log(2))
    ppl.name = 'ppl'
    bits_per_char = char_cost / tensor.log(2)
    bits_per_char.name = 'bits_per_char'
    length = features.shape[0]
    length.name = 'length'

    model = Model(batch_cost)
    if load_params:
        params = load_parameter_values(save_path)
        model.set_parameter_values(params)

    if mode == "train":
        # Give an idea of what's going on.
        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape) for key, value
             in Selector(generator).get_parameters().items()],
            width=120))

        train_stream = train_dataset.get_example_stream()
        train_stream = Mapping(train_stream, _truncate)
        train_stream = Batch(train_stream,
                             iteration_scheme=ConstantScheme(batch_size))
        train_stream = Padding(train_stream)
        train_stream = Mapping(train_stream, _transpose)

        parameters = model.get_parameter_dict()
        # NOTE(review): maxnorm_subjects is computed but unused (the
        # Restrict step rule below is commented out).
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(
            parameters.values())
        algorithm = GradientDescent(
            cost=batch_cost, parameters=parameters.values(),
            step_rule=CompositeRule([StepClipping(1000.),
                                     AdaDelta(epsilon=1e-8)
                                     #, Restrict(VariableClipping(1.0, axis=0), maxnorm_subjects)
                                     ]))
        # A small slice of the input, monitored as a sanity check.
        ft = features[:6, 0]
        ft.name = 'feature_example'

        observables = [cost, ppl, char_cost, length, bits_per_char]
        # Per-parameter normalized weight/gradient/step norms.
        for name, param in parameters.items():
            num_elements = numpy.product(param.get_value().shape)
            norm = param.norm(2) / num_elements ** 0.5
            grad_norm = algorithm.gradients[param].norm(2) \
                / num_elements ** 0.5
            step_norm = algorithm.steps[param].norm(2) \
                / num_elements ** 0.5
            stats = tensor.stack(norm, grad_norm, step_norm,
                                 step_norm / grad_norm)
            stats.name = name + '_stats'
            observables.append(stats)

        track_the_best_bpc = TrackTheBest('valid_bits_per_char')
        root_path, extension = os.path.splitext(save_path)

        this_step_monitoring = TrainingDataMonitoring(
            observables + [ft], prefix="this_step", after_batch=True)
        average_monitoring = TrainingDataMonitoring(
            observables + [algorithm.total_step_norm,
                           algorithm.total_gradient_norm],
            prefix="average", every_n_batches=10)
        valid_monitoring = DataStreamMonitoring(
            observables, prefix="valid", every_n_batches=1500,
            before_training=False, data_stream=valid_stream)

        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=train_stream,
            model=model,
            extensions=[
                this_step_monitoring,
                average_monitoring,
                valid_monitoring,
                track_the_best_bpc,
                # NOTE(review): two Checkpoint extensions on the same
                # path — the first saves every batch trigger's defaults;
                # confirm both are intended.
                Checkpoint(save_path, ),
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"],
                           use_cpickle=True)
                    .add_condition(
                        ['after_epoch'],
                        OnLogRecord(track_the_best_bpc.notification_name),
                        (root_path + "_best" + extension,)),
                Timing(after_batch=True),
                Printing(every_n_batches=10),
                Plot(root_path,
                     [[average_monitoring.record_name(cost),
                       valid_monitoring.record_name(cost)],
                      [average_monitoring.record_name(
                          algorithm.total_step_norm)],
                      [average_monitoring.record_name(
                          algorithm.total_gradient_norm)],
                      [average_monitoring.record_name(ppl),
                       valid_monitoring.record_name(ppl)],
                      [average_monitoring.record_name(char_cost),
                       valid_monitoring.record_name(char_cost)],
                      [average_monitoring.record_name(bits_per_char),
                       valid_monitoring.record_name(bits_per_char)]],
                     every_n_batches=10)
            ])
        main_loop.run()

    elif mode == 'evaluate':
        # Word-level evaluation against a fixed lexicon: score each word
        # conditioned on the running state of the character LM.
        with open('/data/lisatmp3/serdyuk/wsj_lms/lms/wsj_trigram_with_initial_eos/lexicon.txt') as f:
            raw_words = [line.split()[1:-1] for line in f.readlines()]
        words = [[char_to_ind[c] if c in char_to_ind
                  else char_to_ind['<UNK>'] for c in w]
                 for w in raw_words]
        max_word_length = max([len(w) for w in words])

        # Cost graph that can be seeded with externally supplied states.
        initial_states = tensor.matrix('init_states')
        cost_matrix_step = generator.cost_matrix(
            features, mask=features_mask, states=initial_states)
        cg = ComputationGraph(cost_matrix_step)
        # NOTE(review): relies on auxiliary_variables[-2] being the
        # recurrent states — fragile against graph changes; verify.
        states = cg.auxiliary_variables[-2]
        compute_cost = theano.function(
            [features, features_mask, initial_states],
            [cost_matrix_step.sum(axis=0), states])

        cost_matrix = generator.cost_matrix(features, mask=features_mask)
        initial_cg = ComputationGraph(cost_matrix)
        initial_states = initial_cg.auxiliary_variables[-2]

        total_word_cost = 0
        num_words = 0
        # One column per lexicon word, zero-padded to the longest word.
        examples = numpy.zeros((max_word_length + 1, len(words)),
                               dtype='int64')
        all_masks = numpy.zeros((max_word_length + 1, len(words)),
                                dtype=floatX)

        for i, word in enumerate(words):
            examples[:len(word), i] = word
            all_masks[:len(word), i] = 1.

        single_space = numpy.array([char_to_ind[' ']])[:, None]
        for batch in valid_stream.get_epoch_iterator():
            for example, mask in equizip(batch[0].T, batch[1].T):
                example = example[:(mask.sum())]
                spc_inds = list(numpy.where(
                    example == char_to_ind[" "])[0])
                state = generator.transition.transition \
                    .initial_states_.get_value()[None, :]
                # Walk the space-separated words of the example, carrying
                # the recurrent state across word and space boundaries.
                for i, j in equizip([-1] + spc_inds, spc_inds + [-1]):
                    word = example[(i+1):j, None]
                    word_cost, states = compute_cost(
                        word, numpy.ones_like(word, dtype=floatX), state)
                    state = states[-1]

                    # Normalizer: cost of every lexicon word from this
                    # same state.
                    costs = numpy.exp(-compute_cost(
                        examples, all_masks,
                        numpy.tile(state, [examples.shape[1], 1]))[0])

                    _, space_states = compute_cost(
                        single_space,
                        numpy.ones_like(single_space, dtype=floatX),
                        state)
                    state = space_states[-1]

                    word_prob = numpy.exp(-word_cost)
                    total_word_cost += word_cost + numpy.log(
                        numpy.sum(costs))
                    num_words += 1
                    print(word_prob)
                    print(numpy.sum(costs))
            print("Average cost", total_word_cost / num_words)
            print("PPL", numpy.exp(total_word_cost / num_words))

        print("Word-level perplexity")
        print(total_word_cost / num_words)
    else:
        assert False
def main(mode, save_path, num_batches, data_path=None):
    """Train or query a word-reversal sequence-to-sequence model.

    Modes:
        'train'       - train the WordReverser on a text corpus (the
                        given data_path, or One Billion Word).
        'sample' /
        'beam_search' - load trained parameters and interactively decode
                        sentences typed on stdin.
    """
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        # Targets are the word-reversed copies of the inputs.
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream,
                            iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = chars.shape[1].copy(name="batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in parameters.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            applications=[generator.readout.readout],
            name_regex="output")(cg.variables)
        (activations,) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = chars.shape[0].copy(name="max_length")
        cost_per_character = aggregation.mean(
            batch_cost, batch_size * max_length).copy(
                name="character_log_likelihood")
        min_energy = energies.min().copy(name="min_energy")
        max_energy = energies.max().copy(name="max_energy")
        mean_activation = abs(activations).mean().copy(
            name="mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        # Per-parameter weight and gradient norms.
        for name, parameter in parameters.items():
            observables.append(parameter.norm(2).copy(name + "_norm"))
            observables.append(algorithm.gradients[parameter].norm(2).copy(
                name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_parameter_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and
            beam search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.
            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    applications=[reverser.generator.generate],
                    name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                # Trim each sample at its first </S> and sum the
                # per-step costs up to that point.
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(
                            char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            try:
                line = input("Enter a sentence\n")
                message = ("Enter the number of samples\n"
                           if mode == "sample"
                           else "Enter the beam size\n")
                batch_size = int(input(message))
            except EOFError:
                break
            except Exception:
                traceback.print_exc()
                continue

            # Encode the typed line, bracketed by <S> ... </S>.
            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            # Duplicate the input batch_size times (one per sample/beam).
            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            # Best (lowest-cost) candidates printed last.
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
def main(mode, save_path, num_batches, data_path=None): reverser = WordReverser(100, len(char2code), name="reverser") if mode == "train": # Data processing pipeline dataset_options = dict(dictionary=char2code, level="character", preprocess=_lower) if data_path: dataset = TextFile(data_path, **dataset_options) else: dataset = OneBillionWord("training", [99], **dataset_options) data_stream = dataset.get_example_stream() data_stream = Filter(data_stream, _filter_long) data_stream = Mapping(data_stream, reverse_words, add_sources=("targets",)) data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10)) data_stream = Padding(data_stream) data_stream = Mapping(data_stream, _transpose) # Initialization settings reverser.weights_init = IsotropicGaussian(0.1) reverser.biases_init = Constant(0.0) reverser.push_initialization_config() reverser.encoder.weights_init = Orthogonal() reverser.generator.transition.weights_init = Orthogonal() # Build the cost computation graph chars = tensor.lmatrix("features") chars_mask = tensor.matrix("features_mask") targets = tensor.lmatrix("targets") targets_mask = tensor.matrix("targets_mask") batch_cost = reverser.cost( chars, chars_mask, targets, targets_mask).sum() batch_size = chars.shape[1].copy(name="batch_size") cost = aggregation.mean(batch_cost, batch_size) cost.name = "sequence_log_likelihood" logger.info("Cost graph is built") # Give an idea of what's going on model = Model(cost) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat( [(key, value.get_value().shape) for key, value in parameters.items()], width=120)) # Initialize parameters for brick in model.get_top_bricks(): brick.initialize() # Define the training algorithm. 
cg = ComputationGraph(cost) algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)])) # Fetch variables useful for debugging generator = reverser.generator (energies,) = VariableFilter( applications=[generator.readout.readout], name_regex="output")(cg.variables) (activations,) = VariableFilter( applications=[generator.transition.apply], name=generator.transition.apply.states[0])(cg.variables) max_length = chars.shape[0].copy(name="max_length") cost_per_character = aggregation.mean(batch_cost, batch_size * max_length).copy( name="character_log_likelihood") min_energy = energies.min().copy(name="min_energy") max_energy = energies.max().copy(name="max_energy") mean_activation = abs(activations).mean() .copy(name="mean_activation") observables = [ cost, min_energy, max_energy, mean_activation, batch_size, max_length, cost_per_character, algorithm.total_step_norm, algorithm.total_gradient_norm] for name, parameter in parameters.items(): observables.append( parameter.norm(2) .copy(name=name + "_norm")) observables.append( algorithm.gradients[parameter].norm(2) .copy(name=name + "_grad_norm")) # Construct the main loop and start training! average_monitoring = TrainingDataMonitoring( observables, prefix="average", every_n_batches=10) main_loop = MainLoop( model=model, data_stream=data_stream, algorithm=algorithm, extensions=[ Timing(), TrainingDataMonitoring(observables, after_batch=True), average_monitoring, FinishAfter(after_n_batches=num_batches) # This shows a way to handle NaN emerging during # training: simply finish it. .add_condition(["after_batch"], _is_nan), # Saving the model and the log separately is convenient, # because loading the whole pickle takes quite some time. Checkpoint(save_path, every_n_batches=500, save_separately=["model", "log"]), Printing(every_n_batches=1)]) main_loop.run()