def get_train_stream(configuration, sfiles, tfiles, svocab_dict, tvocab_dict):
    s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')
    t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')

    # Merge the source and target streams into (source, target) pairs
    stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter out sequences that are too long (TODO)
    stream = Filter(stream,
                    predicate=_too_long(seq_len=configuration['seq_len']))

    # No OOV-to-unk mapping is needed here.
    # Batch and sort: read k batches ahead, sort them by length,
    # then re-batch with the real batch size
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       configuration['batch_size'] *
                       configuration['sort_k_batches']))
    stream = Mapping(stream, SortMapping(_length))
    stream = Unpack(stream)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(configuration['batch_size']))

    # Pad short sequences. Note that </s>=0; Fuel only pads with 0 by default.
    masked_stream = Padding(stream)
    return masked_stream
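
# `_length` is the sort key used with SortMapping above; it is not defined in
# this section (`_too_long` is defined near the end of the section). A minimal
# sketch of `_length`, consistent with the blocks-examples helper and given
# here as an assumption:
def _length(sentence_pair):
    # Sort by the length of the target sentence, assumed to be the last
    # element of the example tuple.
    return len(sentence_pair[-1])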
def _get_align_stream(src_data, trg_data, src_vocab_size, trg_vocab_size,
                      seq_len, **kwargs):
    """Creates the stream which is used for the main loop.

    Args:
        src_data (string): Path to the source sentences
        trg_data (string): Path to the target sentences
        src_vocab_size (int): Size of the source vocabulary in the NMT model
        trg_vocab_size (int): Size of the target vocabulary in the NMT model
        seq_len (int): Maximum length of any source or target sentence

    Returns:
        ExplicitNext. Alignment data stream which can be iterated explicitly
    """
    # Build dummy vocabulary to make TextFile happy
    src_vocab = _add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = _add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge([src_dataset.get_example_stream(),
               trg_dataset.get_example_stream()],
              ('source', 'target'))
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))
    s = Batch(s, iteration_scheme=ConstantScheme(1))
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])
    return ExplicitNext(masked_stream)
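
# `_add_special_ids` builds the dummy word maps used above; it is referenced
# but not defined here. A plausible sketch following the SGNMT convention of
# reserved ids in `utils` (the exact id names are assumptions):
def _add_special_ids(vocab):
    # Add fuel/blocks style special-token entries to a word map.
    vocab['<S>'] = utils.GO_ID      # assumed begin-of-sentence id
    vocab['</S>'] = utils.EOS_ID    # assumed end-of-sentence id
    vocab['<UNK>'] = utils.UNK_ID   # assumed unknown-word id
    return vocab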
def get_dev_stream_with_topicalq(test_set=None, src_vocab=None,
                                 src_vocab_size=30000, topical_test_set=None,
                                 topical_vocab=None, topical_vocab_size=2000,
                                 unk_id=1, **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if test_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        print test_set, type(src_vocab)
        # Special tokens are not ensured for the topical vocabulary
        topical_vocab = cPickle.load(open(topical_vocab, 'rb'))
        topical_dataset = TextFile([topical_test_set], topical_vocab,
                                   None, None, '10')
        dev_dataset = TextFile([test_set], src_vocab, None)
        # dev_stream = DataStream(dev_dataset)

        # Merge them to get a (source, source_topical) pair
        dev_stream = Merge([dev_dataset.get_example_stream(),
                            topical_dataset.get_example_stream()],
                           ('source', 'source_topical'))
    return dev_stream
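
# `_ensure_special_tokens` is used by most loaders in this section but is not
# defined here. A sketch consistent with the blocks-examples helper of the
# same name (treat the details as an assumption):
def _ensure_special_tokens(vocab, bos_idx=0, eos_idx=0, unk_idx=1):
    """Ensures special tokens exist in the dictionary at the given indices."""
    # Drop tokens that would collide with the special indices ...
    for token in [k for k, idx in vocab.items()
                  if idx in [bos_idx, eos_idx, unk_idx]]:
        del vocab[token]
    # ... and pin the special tokens to them
    vocab['<S>'] = bos_idx
    vocab['</S>'] = eos_idx
    vocab['<UNK>'] = unk_idx
    return vocab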
def get_dev_stream(val_set=None, valid_sent_dict=None, src_vocab=None,
                   trg_vocab=None, src_vocab_size=30000, trg_vocab_size=30000,
                   unk_id=1, **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if val_set is not None and src_vocab is not None:
        # Load dictionaries and ensure special tokens exist
        src_vocab = ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        dev_dataset = TextFile([val_set], src_vocab, None)
        dev_dictset = TextFile([valid_sent_dict], trg_vocab, None)
        # dev_stream = DataStream(dev_dataset)

        # Merge them to get a (source, dictionary) pair
        dev_stream = Merge([dev_dataset.get_example_stream(),
                            dev_dictset.get_example_stream()],
                           ('source', 'valid_sent_trg_dict'))
    return dev_stream
def get_logprob_streams(config):
    if 'log_prob_sets' not in config:
        return None

    cgs = config['cgs']
    enc_ids, dec_ids = get_enc_dec_ids(cgs)
    datasets = config['log_prob_sets']

    # Prepare source vocabs and files, make sure special tokens are there
    src_vocabs = {k: cPickle.load(open(v))
                  for k, v in config['src_vocabs'].iteritems()}
    for k in src_vocabs.keys():
        src_vocabs[k]['<S>'] = 0
        src_vocabs[k]['</S>'] = config['src_eos_idxs'][k]
        src_vocabs[k]['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_vocabs = {k: cPickle.load(open(v))
                  for k, v in config['trg_vocabs'].iteritems()}
    for k in trg_vocabs.keys():
        trg_vocabs[k]['<S>'] = 0
        trg_vocabs[k]['</S>'] = config['trg_eos_idxs'][k]
        trg_vocabs[k]['<UNK>'] = config['unk_id']

    # Build the preprocessing pipeline for individual streams
    ind_streams = {}
    for cg in cgs:
        eid, did = p_(cg)
        if cg not in datasets:
            continue
        logger.info('Building logprob stream for cg:[{}]'.format(cg))
        src_dataset = TextFile([datasets[cg][0]], src_vocabs[eid], None)
        trg_dataset = TextFile([datasets[cg][1]], trg_vocabs[did], None)
        stream = Merge([src_dataset.get_example_stream(),
                        trg_dataset.get_example_stream()],
                       ('source', 'target'))
        stream = Mapping(stream, _oov_to_unk(
            src_vocab_size=config['src_vocab_sizes'][eid],
            trg_vocab_size=config['trg_vocab_sizes'][did],
            unk_id=config['unk_id']))

        bs = 100
        if 'log_prob_bs' in config:
            if isinstance(config['log_prob_bs'], dict):
                bs = config['log_prob_bs'][cg]
            else:
                bs = config['log_prob_bs']

        stream = Batch(stream, iteration_scheme=ConstantScheme(bs))
        masked_stream = Padding(stream)
        masked_stream = Mapping(
            masked_stream,
            _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                           (2, 0, config['trg_eos_idxs'][did])]))
        ind_streams[cg] = masked_stream

    return ind_streams
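
# Padding pads with 0, but these vocabularies reserve 0 for <S>, so the padded
# zeros in the word-id arrays are rewritten to the per-model EOS ids.
# `_remapWordIdx` is referenced but not defined in this section; a sketch
# consistent with its call sites, given as an assumption: each
# (source_index, old_id, new_id) triple rewrites ids in one batch source.
# After Padding the sources are (source, source_mask, target, target_mask),
# hence indices 0 and 2.
class _remapWordIdx(object):
    def __init__(self, mappings):
        self.mappings = mappings

    def __call__(self, batch):
        # Rewrite the ids in-place on the numpy arrays and return the batch.
        batch = list(batch)
        for i, old, new in self.mappings:
            batch[i][batch[i] == old] = new
        return tuple(batch)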
def get_src_trg_stream(cg, config, src_datasets=None, trg_datasets=None,
                       is_training=True, src_vocabs=None, trg_vocabs=None,
                       logprob_datasets=None):
    eid, did = p_(cg)
    if is_training:
        logger.info(' ... src:[{}] - [{}]'.format(
            eid, src_datasets[cg].files[0]))
        logger.info(' ... trg:[{}] - [{}]'.format(
            did, trg_datasets[cg].files[0]))
        stream = Merge([src_datasets[cg].get_example_stream(),
                        trg_datasets[cg].get_example_stream()],
                       ('source', 'target'))
        stream = Filter(stream, predicate=_too_long(config['src_seq_len'],
                                                    config['tgt_seq_len']))
        if 'min_seq_lens' in config and config['min_seq_lens'][cg] > 0:
            stream = Filter(stream,
                            predicate=_too_short(config['min_seq_lens'][cg]))
        stream = Mapping(stream, _oov_to_unk(
            src_vocab_size=config['src_vocab_sizes'][eid],
            trg_vocab_size=config['trg_vocab_sizes'][did],
            unk_id=config['unk_id']))
        stream = Batch(stream, iteration_scheme=ConstantScheme(
            config['batch_sizes'][cg] * config['sort_k_batches']))
        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)
        stream = Batch(stream, iteration_scheme=ConstantScheme(
            config['batch_sizes'][cg]))
    else:  # logprob stream
        src_dataset = TextFile([logprob_datasets[cg][0]],
                               src_vocabs[eid], None)
        trg_dataset = TextFile([logprob_datasets[cg][1]],
                               trg_vocabs[did], None)
        stream = Merge([src_dataset.get_example_stream(),
                        trg_dataset.get_example_stream()],
                       ('source', 'target'))
        stream = Mapping(stream, _oov_to_unk(
            src_vocab_size=config['src_vocab_sizes'][eid],
            trg_vocab_size=config['trg_vocab_sizes'][did],
            unk_id=config['unk_id']))
        bs = 100
        if 'log_prob_bs' in config:
            if isinstance(config['log_prob_bs'], dict):
                bs = config['log_prob_bs'][cg]
            else:
                bs = config['log_prob_bs']
        stream = Batch(stream, iteration_scheme=ConstantScheme(bs))

    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream,
        _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                       (2, 0, config['trg_eos_idxs'][did])]))
    return masked_stream
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream, _oov_to_unk(src_vocab_size=src_vocab_size,
                                         trg_vocab_size=trg_vocab_size,
                                         unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream
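
# A minimal usage sketch for `get_tr_stream`; the file names below are
# hypothetical placeholders for pickled vocabularies and tokenized corpora.
tr_stream = get_tr_stream(src_vocab='vocab.src.pkl',
                          trg_vocab='vocab.trg.pkl',
                          src_data='train.src',
                          trg_data='train.trg',
                          seq_len=50, batch_size=80, sort_k_batches=12)
# PaddingWithEOS adds one mask per source, so each batch is a 4-tuple.
for src, src_mask, trg, trg_mask in tr_stream.get_epoch_iterator():
    pass  # feed the batch to the training function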
def get_sgnmt_tr_stream(src_data, trg_data, src_vocab_size=30000,
                        trg_vocab_size=30000, unk_id=1, seq_len=50,
                        batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the unshuffled training data stream. This corresponds to
    ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge([src_dataset.get_example_stream(),
               trg_dataset.get_example_stream()],
              ('source', 'target'))

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(s, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                                      trg_vocab_size=trg_vocab_size,
                                      unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
def get_test_stream_withContext_grdTruth(test_ctx_datas=None,
                                         test_set_source=None,
                                         test_set_target=None,
                                         src_vocab=None, src_vocab_size=30000,
                                         trg_vocab=None, trg_vocab_size=30000,
                                         batch_size=128, unk_id=1, ctx_num=3,
                                         **kwargs):
    """Setup development set stream if necessary."""
    masked_stream = None
    if test_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab, 'rb')),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
        print test_set_source, type(src_vocab)

        # Get text files for the context sentences, the source and the target
        ctx_datasets = []
        for i in range(ctx_num):
            ctx_datasets.append(TextFile([test_ctx_datas[i]], src_vocab, None))
        dev_dataset = TextFile([test_set_source], src_vocab, None)
        dev_target = TextFile([test_set_target], trg_vocab, None)

        dev_stream = Merge(
            [i.get_example_stream() for i in ctx_datasets] +
            [dev_dataset.get_example_stream(),
             dev_target.get_example_stream()],
            tuple('context_' + str(i) for i in range(ctx_num)) +
            ('source', 'target'))
        stream = Mapping(dev_stream,
                         _oov_to_unk(ctx_num=ctx_num,
                                     src_vocab_size=src_vocab_size,
                                     trg_vocab_size=trg_vocab_size,
                                     unk_id=unk_id))

        # Build a batched version of stream to read k batches ahead
        stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
        masked_stream = PaddingWithEOSContext(
            stream,
            [src_vocab_size - 1 for i in range(ctx_num + 1)] +
            [trg_vocab_size - 1])
    return masked_stream
def get_dev_stream_with_grdTruth(val_set_source=None, val_set_target=None,
                                 src_vocab=None, src_vocab_size=30000,
                                 trg_vocab=None, trg_vocab_size=30000,
                                 batch_size=128, unk_id=1, seq_len=50,
                                 **kwargs):
    """Setup development set stream if necessary."""
    # Initialize the returned name (the original initialized `dev_stream`
    # but returned `masked_stream`, a NameError when no source is given)
    masked_stream = None
    if val_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab, 'rb')),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
        print val_set_source, type(src_vocab)

        dev_dataset = TextFile([val_set_source], src_vocab, None)
        trg_dataset = TextFile([val_set_target], trg_vocab, None)

        # Merge them to get a source, target pair
        dev_stream = Merge([dev_dataset.get_example_stream(),
                            trg_dataset.get_example_stream()],
                           ('dev_source', 'dev_target'))

        # Filter sequences that are too long
        stream = Filter(dev_stream, predicate=_too_long(seq_len=seq_len))

        # Replace out of vocabulary tokens with unk token
        stream = Mapping(stream, _oov_to_unk(src_vocab_size=src_vocab_size,
                                             trg_vocab_size=trg_vocab_size,
                                             unk_id=unk_id))

        # Batch with size 1 (one sentence pair at a time)
        stream = Batch(stream, iteration_scheme=ConstantScheme(1))

        # Pad sequences that are short
        masked_stream = PaddingWithEOS(
            stream, [src_vocab_size - 1, trg_vocab_size - 1])
    return masked_stream
def get_dev_stream_with_context_features(val_context_features=None,
                                         val_set=None, src_vocab=None,
                                         src_vocab_size=30000, unk_id=1,
                                         **kwargs):
    """Setup development set stream if necessary."""

    def _get_np_array(filename):
        return numpy.load(filename)['arr_0']

    dev_stream = None
    if val_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        dev_dataset = TextFile([val_set], src_vocab, None)

        # Now add the source with the image features:
        # create the image datastream (iterate over a file line-by-line)
        con_features = _get_np_array(val_context_features)
        con_feature_dataset = IterableDataset(con_features)
        valid_image_stream = DataStream(con_feature_dataset)

        # dev_stream = DataStream(dev_dataset)
        dev_stream = Merge([dev_dataset.get_example_stream(),
                            valid_image_stream],
                           ('source', 'initial_context'))
        # dev_stream = dev_stream.get_example_stream()
    return dev_stream
def get_log_prob_stream(cg, config):
    eid, did = p_(cg)
    dataset = config['log_prob_sets'][cg]

    # Prepare source vocab and file, make sure special tokens are there
    src_vocab = cPickle.load(open(config['src_vocabs'][eid]))
    src_vocab['<S>'] = 0
    src_vocab['</S>'] = config['src_eos_idxs'][eid]
    src_vocab['<UNK>'] = config['unk_id']

    # Prepare target vocab and file, make sure special tokens are there
    trg_vocab = cPickle.load(open(config['trg_vocabs'][did]))
    trg_vocab['<S>'] = 0
    trg_vocab['</S>'] = config['trg_eos_idxs'][did]
    trg_vocab['<UNK>'] = config['unk_id']

    # Build the preprocessing pipeline for individual streams
    logger.info('Building logprob stream for cg:[{}]'.format(cg))
    src_dataset = TextFile([dataset[0]], src_vocab, None)
    trg_dataset = TextFile([dataset[1]], trg_vocab, None)
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))
    stream = Mapping(stream, _oov_to_unk(
        src_vocab_size=config['src_vocab_sizes'][eid],
        trg_vocab_size=config['trg_vocab_sizes'][did],
        unk_id=config['unk_id']))

    bs = 100
    if 'log_prob_bs' in config:
        if isinstance(config['log_prob_bs'], dict):
            bs = config['log_prob_bs'][cg]
        else:
            bs = config['log_prob_bs']

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       bs, num_examples=get_num_lines(dataset[0])))
    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream,
        _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                       (2, 0, config['trg_eos_idxs'][did])]))
    return masked_stream
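
# `get_num_lines` is referenced above but not defined in this section;
# a straightforward sketch of what it plausibly does (an assumption):
def get_num_lines(fname):
    # Count the lines of a text file, i.e. the number of examples,
    # so the ConstantScheme knows when the epoch ends.
    with open(fname) as f:
        return sum(1 for _ in f)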
def get_dev_stream_with_prefix_file(val_set=None, val_set_grndtruth=None,
                                    val_set_prefixes=None,
                                    val_set_suffixes=None,
                                    src_vocab=None, src_vocab_size=30000,
                                    trg_vocab=None, trg_vocab_size=30000,
                                    unk_id=1, return_vocab=False, **kwargs):
    """Setup development stream with user-provided source, target, prefixes,
    and suffixes."""
    dev_stream = None
    if (val_set is not None and val_set_grndtruth is not None and
            val_set_prefixes is not None and val_set_suffixes is not None):
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        # Note: the user should have already provided the EOS token in the
        # data representation for the suffix.
        # Note: the reason we need EOS tokens in the reference file is that
        # IMT systems need to evaluate metrics which count prediction of the
        # </S> token, and evaluation scripts are called on the files.
        dev_source_dataset = TextFile([val_set], src_vocab,
                                      bos_token='<S>', eos_token='</S>',
                                      unk_token='<UNK>')
        dev_target_dataset = TextFile([val_set_grndtruth], trg_vocab,
                                      bos_token='<S>', eos_token='</S>',
                                      unk_token='<UNK>')
        dev_prefix_dataset = TextFile([val_set_prefixes], trg_vocab,
                                      bos_token='<S>', eos_token=None,
                                      unk_token='<UNK>')
        dev_suffix_dataset = TextFile([val_set_suffixes], trg_vocab,
                                      bos_token=None, eos_token=None,
                                      unk_token='<UNK>')

        dev_stream = Merge([dev_source_dataset.get_example_stream(),
                            dev_target_dataset.get_example_stream(),
                            dev_prefix_dataset.get_example_stream(),
                            dev_suffix_dataset.get_example_stream()],
                           ('source', 'target',
                            'target_prefix', 'target_suffix'))

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    else:
        return dev_stream
def get_test_stream(sfiles, svocab_dict):
    dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                       unk_token='<unk>', level='word', preprocess=None,
                       encoding='utf8')
    stream = Merge([dataset.get_example_stream()], ('source',))
    stream = Batch(stream, iteration_scheme=ConstantScheme(10))
    stream = Padding(stream)
    return stream
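
# Usage sketch with a toy vocabulary (both the dict and the file name are
# hypothetical); Padding yields (source, source_mask) batches of size 10.
svocab_dict = {'</s>': 0, '<unk>': 1, 'hello': 2, 'world': 3}
test_stream = get_test_stream(['test.src'], svocab_dict)
for source, source_mask in test_stream.get_epoch_iterator():
    pass  # run the model on the padded batch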
def get_dev_stream_with_prefixes(val_set=None, val_set_grndtruth=None,
                                 src_vocab=None, src_vocab_size=30000,
                                 trg_vocab=None, trg_vocab_size=30000,
                                 unk_id=1, return_vocab=False, **kwargs):
    """Setup development set stream if necessary."""
    dev_stream = None
    if val_set is not None and val_set_grndtruth is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        dev_source_dataset = TextFile([val_set], src_vocab,
                                      bos_token='<S>', eos_token='</S>',
                                      unk_token='<UNK>')
        dev_target_dataset = TextFile([val_set_grndtruth], trg_vocab,
                                      bos_token='<S>', eos_token='</S>',
                                      unk_token='<UNK>')

        dev_stream = Merge([dev_source_dataset.get_example_stream(),
                            dev_target_dataset.get_example_stream()],
                           ('source', 'target'))

        # Now add prefixes and suffixes to this stream
        dev_stream = Mapping(
            dev_stream,
            PrefixSuffixStreamTransformer(
                sample_ratio=kwargs.get('dev_sample_ratio', 1.)),
            add_sources=('target_prefix', 'target_suffix'))
        dev_stream = Mapping(dev_stream,
                             CopySourceAndTargetToMatchPrefixes(dev_stream))

        # Changing stream.produces_examples is a little hack which lets us
        # use Unpack to flatten
        dev_stream.produces_examples = False
        # Flatten the stream back out into
        # (source, target, target_prefix, target_suffix)
        dev_stream = Unpack(dev_stream)

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    else:
        return dev_stream
def get_dev_tr_stream_with_topic_target(val_set_source=None,
                                        val_set_target=None,
                                        src_vocab=None, trg_vocab=None,
                                        src_vocab_size=30000,
                                        trg_vocab_size=30000,
                                        trg_topic_vocab_size=2000,
                                        source_topic_vocab_size=2000,
                                        topical_dev_set=None,
                                        topic_vocab_input=None,
                                        topic_vocab_output=None,
                                        topical_vocab_size=2000,
                                        unk_id=1, **kwargs):
    """Prepares the training data stream."""
    # Initialize the returned name (the original initialized `dev_stream`
    # but returned `masked_stream`)
    masked_stream = None
    if val_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab, 'rb')),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
        topic_vocab_input = cPickle.load(open(topic_vocab_input, 'rb'))
        # The output topic vocabulary already has <UNK> and </S> in it
        topic_vocab_output = cPickle.load(open(topic_vocab_output, 'rb'))
        topic_binary_vocab = {}
        for k, v in topic_vocab_output.items():
            if k == '<UNK>':
                topic_binary_vocab[k] = 0
            else:
                topic_binary_vocab[k] = 1

        # Get text files from both source and target
        src_dataset = TextFile([val_set_source], src_vocab, None)
        trg_dataset = TextFile([val_set_target], trg_vocab, None)
        src_topic_input = TextFile([topical_dev_set], topic_vocab_input,
                                   None, None, 'rt')
        trg_topic_dataset = TextFile([val_set_target],
                                     topic_vocab_output, None)
        trg_topic_binary_dataset = TextFile([val_set_target],
                                            topic_binary_vocab, None)

        # Merge them to get a source, target pair
        dev_stream = Merge(
            [src_dataset.get_example_stream(),
             trg_dataset.get_example_stream(),
             src_topic_input.get_example_stream(),
             trg_topic_dataset.get_example_stream(),
             trg_topic_binary_dataset.get_example_stream()],
            ('source', 'target', 'source_topical',
             'target_topic', 'target_binary_topic'))

        stream = Batch(dev_stream, iteration_scheme=ConstantScheme(1))

        # Pad sequences that are short
        masked_stream = PaddingWithEOS(
            stream,
            [src_vocab_size - 1, trg_vocab_size - 1,
             source_topic_vocab_size - 1, trg_topic_vocab_size - 1,
             trg_topic_vocab_size - 1])
    return masked_stream
def get_dev_stream(sfiles, tfiles, svocab_dict, tvocab_dict):
    s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')
    t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')

    # Merge the source and target streams
    stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))

    # Batch (no length-sorting for the dev set)
    stream = Batch(stream, iteration_scheme=ConstantScheme(1006))

    # Pad short sequences. Note that </s>=0; Fuel only pads with 0 by default.
    masked_stream = Padding(stream)
    return masked_stream
def get_stream(vocab, data, vocab_size, unk_id, eos_id, bos_id, noise=0):
    vocab = get_vocab(vocab, vocab_size, unk_id, eos_id, bos_id)

    # Maps words to their index in the vocabulary. OOV words are replaced by
    # the <UNK> index. Also appends the </S> index at the end. There is no
    # <S> token (TODO: the bos_id parameter is unused).
    dataset = TextFile([data], vocab, None)

    stream = Mapping(dataset.get_example_stream(), _add_noise(noise))
    stream.dataset = dataset  # for backward-compatibility
    return stream
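
# `_add_noise` is referenced above but not defined here. One plausible
# reading, offered purely as an assumption: a callable that randomly drops
# tokens with probability `noise` (a no-op when noise == 0). The real
# transformer may perturb the ids differently.
class _add_noise(object):
    def __init__(self, noise):
        self.noise = noise

    def __call__(self, example):
        sentence, = example  # examples are 1-tuples
        if self.noise > 0:
            sentence = [w for w in sentence
                        if numpy.random.rand() >= self.noise]
        return (sentence,)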
def get_test_stream(test_set=None, src_vocab=None, trg_vocab=None,
                    src_vocab_size=200000, trg_vocab_size=6540, unk_id=1,
                    sort_k_batches=12):
    """Prepares the testing data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    # (note the hard-coded target file path)
    src_dataset = TextFile([test_set], src_vocab, None)
    trg_dataset = TextFile(['./data/test.zh'], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream, _oov_to_unk())

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream, iteration_scheme=ConstantScheme(sort_k_batches))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with batch size 1
    stream = Batch(stream, iteration_scheme=ConstantScheme(1))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(stream,
                                   [src_vocab_size - 1, trg_vocab_size - 1])
    return masked_stream
def _get_text_stream(src_data, trg_data, src_vocab_size=30000,
                     trg_vocab_size=30000, **kwargs):
    """Creates a parallel data stream from two text files without random
    access. This stream cannot be used with reshuffling. The arguments to
    this method are given by the configuration dict.
    """
    # Build dummy vocabulary to make TextFile happy
    src_vocab = _add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = _add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    return Merge([src_dataset.get_example_stream(),
                  trg_dataset.get_example_stream()],
                 ('source', 'target'))
def get_dev_stream_withContext_withPosTag(test_ctx_datas=None,
                                          test_posTag_datas=None,
                                          test_set_source=None,
                                          src_vocab=None,
                                          src_vocab_size=30000, unk_id=1,
                                          ctx_num=3, **kwargs):
    """Setup development set stream if necessary."""
    # Initialize the returned name (the original initialized `dev_stream`
    # but returned `masked_stream`)
    masked_stream = None
    if test_set_source is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab, 'rb')),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        print test_set_source, type(src_vocab)

        # Get text files for the contexts, the POS tags and the source
        ctx_datasets = []
        posTag_datasets = []
        for i in range(ctx_num):
            ctx_datasets.append(TextFile([test_ctx_datas[i]], src_vocab, None))
            posTag_datasets.append(
                TextFile([test_posTag_datas[i]], src_vocab, None))
        posTag_datasets.append(
            TextFile([test_posTag_datas[ctx_num]], src_vocab, None))
        src_dataset = TextFile([test_set_source], src_vocab, None)

        # Merge them to get a source, target pair
        dev_stream = Merge(
            [i.get_example_stream() for i in ctx_datasets] +
            [i.get_example_stream() for i in posTag_datasets] +
            [src_dataset.get_example_stream()],
            tuple('context_' + str(i) for i in range(ctx_num)) +
            tuple('context_posTag_' + str(i) for i in range(ctx_num)) +
            ('source_posTag', 'source'))
        stream = Mapping(dev_stream,
                         _oov_to_unk_posTag_dev(ctx_num=ctx_num,
                                                src_vocab_size=src_vocab_size,
                                                unk_id=unk_id))

        # Batch with size 1
        stream = Batch(stream, iteration_scheme=ConstantScheme(1))
        masked_stream = PaddingWithEOSContext(
            stream, [src_vocab_size - 1 for i in range(2 * ctx_num + 2)])
    return masked_stream
def get_textfile_stream(source_file=None, src_vocab=None,
                        src_vocab_size=30000, unk_id=1, bos_token=None):
    """Create a TextFile dataset from a single text file, and return a
    stream."""
    if type(bos_token) is str:
        bos_token = bos_token.decode('utf8')
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    source_dataset = TextFile([source_file], src_vocab,
                              bos_token=bos_token,
                              eos_token=u'</S>',
                              unk_token=u'<UNK>',
                              encoding='utf8')
    source_stream = source_dataset.get_example_stream()
    return source_stream
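
# Usage sketch (paths are hypothetical): a stream of word-id lists read from
# a plain text file, each ending with the </S> id.
source_stream = get_textfile_stream(source_file='newstest.de.tok',
                                    src_vocab='vocab.de.pkl',
                                    src_vocab_size=30000)
for (sentence,) in source_stream.get_epoch_iterator():
    pass  # each `sentence` is a list of word ids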
Globals.read_alphabet(f_vocab)
# print("Vocab:", Globals.char2code, file=sys.stderr)
Globals.read_lookup(f_train)

m = MorphGen(100, len(Globals.char2code))

dataset_options = dict(dictionary=Globals.char2code, level="word",
                       preprocess=_tokenise)
dataset = TextFile([f_train], **dataset_options)
data_stream = dataset.get_example_stream()

# Read examples and look up the right surface form
data_stream = Mapping(data_stream, morph_lookup, add_sources=("targets",))

# Read in 10 samples at a time
data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))

# Pad the examples
data_stream = Padding(data_stream)
data_stream = Mapping(data_stream, _transpose)

# Initialisation settings
m.weights_init = IsotropicGaussian(0.1)
m.biases_init = Constant(0.0)
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data, dict_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # For example:
    #   source: 第五 章 罚则
    #   target: chapter v penalty regulations
    # Get text files from the source, the target and the external dictionary
    src_dataset = TextFile([src_data], src_vocab)
    trg_dataset = TextFile([trg_data], trg_vocab)
    dict_dataset = TextFile([dict_data], trg_vocab)
    # An example looks like: ([0, 1649, 1764, 7458, 29999],)

    # Merge them to get a (source, target, dict) triple
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream(),
                    dict_dataset.get_example_stream()],
                   ('source', 'target', 'dict'))

    # Filter sequences that are too long. Neither the source sentence nor the
    # target sentence may exceed seq_len; the length includes the start symbol
    # <s> and the end symbol </s>, so the actual sentence length cannot exceed
    # (seq_len - 2).
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream, _oov_to_unk(src_vocab_size=src_vocab_size,
                                         trg_vocab_size=trg_vocab_size,
                                         unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead. Do not sort
    # the whole training data; first split it into blocks of
    # (batch_size * sort_k_batches) sentence triples and sort within each
    # block only.
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead block by target sentence length
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream,
        bos_idx=[0, 0, 0],
        eos_idx=[src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1])
    return masked_stream
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12,
                  bos_token=None, **kwargs):
    """Prepares the training data stream."""
    if type(bos_token) is str:
        bos_token = bos_token.decode('utf8')

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, bos_token=bos_token,
                           eos_token=u'</S>', unk_token=u'<UNK>',
                           encoding='utf8')
    trg_dataset = TextFile([trg_data], trg_vocab, bos_token=bos_token,
                           eos_token=u'</S>', unk_token=u'<UNK>',
                           encoding='utf8')

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream, _oov_to_unk(src_vocab_size=src_vocab_size,
                                         trg_vocab_size=trg_vocab_size,
                                         unk_id=unk_id))

    # Now make a very big batch that we can shuffle
    shuffle_batch_size = kwargs.get('shuffle_batch_size', 1000)
    stream = Batch(stream, iteration_scheme=ConstantScheme(shuffle_batch_size))
    stream = ShuffleBatchTransformer(stream)

    # Unpack it again
    stream = Unpack(stream)

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(stream,
                                   [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream, src_vocab, trg_vocab
def get_tr_stream_with_context_features(src_vocab, trg_vocab, src_data,
                                        trg_data, context_features,
                                        src_vocab_size=30000,
                                        trg_vocab_size=30000, unk_id=1,
                                        seq_len=50, batch_size=80,
                                        sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    def _get_np_array(filename):
        return numpy.load(filename)['arr_0']

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream, _oov_to_unk(src_vocab_size=src_vocab_size,
                                         trg_vocab_size=trg_vocab_size,
                                         unk_id=unk_id))

    # Now add the source with the image features:
    # create the image datastream (iterate over a file line-by-line)
    train_features = _get_np_array(context_features)
    train_feature_dataset = IterableDataset(train_features)
    train_image_stream = DataStream(train_feature_dataset)

    stream = Merge([stream, train_image_stream],
                   ('source', 'target', 'initial_context'))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short; only source and target get masks
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1],
        mask_sources=('source', 'target'))

    return masked_stream, src_vocab, trg_vocab
def get_tr_stream_with_prefixes(src_vocab, trg_vocab, src_data, trg_data,
                                src_vocab_size=30000, trg_vocab_size=30000,
                                unk_id=1, seq_len=50, batch_size=80,
                                sort_k_batches=12, **kwargs):
    """Prepares the IMT training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # TODO: should the training stream actually have begin and end tokens?
    # Note: this actually depends upon how the system was pre-trained, but
    # systems used for initialization should _always_ have BOS tokens.
    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, bos_token='<S>',
                           eos_token='</S>', unk_token='<UNK>')
    trg_dataset = TextFile([trg_data], trg_vocab, bos_token='<S>',
                           eos_token='</S>', unk_token='<UNK>')

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream, _oov_to_unk(src_vocab_size=src_vocab_size,
                                         trg_vocab_size=trg_vocab_size,
                                         unk_id=unk_id))

    stream = Mapping(
        stream,
        PrefixSuffixStreamTransformer(
            sample_ratio=kwargs.get('train_sample_ratio', 1.)),
        add_sources=('target_prefix', 'target_suffix'))
    stream = Mapping(stream, CopySourceAndTargetToMatchPrefixes(stream))

    # Changing stream.produces_examples is a little hack which lets us use
    # Unpack to flatten
    stream.produces_examples = False
    # Flatten the stream back out into
    # (source, target, target_prefix, target_suffix)
    stream = Unpack(stream)

    # Now make a very big batch that we can shuffle
    shuffle_batch_size = kwargs['shuffle_batch_size']
    stream = Batch(stream, iteration_scheme=ConstantScheme(shuffle_batch_size))
    stream = ShuffleBatchTransformer(stream)

    # Unpack it again
    stream = Unpack(stream)

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    # TODO: is it correct to blindly pad the target_prefix and the
    # target_suffix?
    configurable_padding_args = {
        'suffix_length': kwargs.get('suffix_length', None),
        'truncate_sources': kwargs.get('truncate_sources', [])
    }
    logger.info('Training suffix length is: {}'.format(
        configurable_padding_args['suffix_length']))
    logger.info('I will mask the following sources after <suffix_length>: {}'
                .format(configurable_padding_args['truncate_sources']))
    masked_stream = PaddingWithEOS(
        stream,
        [src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1,
         trg_vocab_size - 1],
        mask_sources=('source', 'target', 'target_prefix', 'target_suffix'),
        **configurable_padding_args)

    return masked_stream, src_vocab, trg_vocab
def get_tr_stream_with_topic_target(src_vocab, trg_vocab, topic_vocab_input,
                                    topic_vocab_output, src_data, trg_data,
                                    topical_data, src_vocab_size=30000,
                                    trg_vocab_size=30000,
                                    trg_topic_vocab_size=2000,
                                    source_topic_vocab_size=2000,
                                    unk_id=1, seq_len=50, batch_size=80,
                                    sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
    topic_vocab_input = cPickle.load(open(topic_vocab_input, 'rb'))
    # The output topic vocabulary already has <UNK> and </S> in it
    topic_vocab_output = cPickle.load(open(topic_vocab_output, 'rb'))
    topic_binary_vocab = {}
    for k, v in topic_vocab_output.items():
        if k == '<UNK>':
            topic_binary_vocab[k] = 0
        else:
            topic_binary_vocab[k] = 1

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)
    src_topic_input = TextFile([topical_data], topic_vocab_input,
                               None, None, 'rt')
    trg_topic_dataset = TextFile([trg_data], topic_vocab_output, None)
    trg_topic_binary_dataset = TextFile([trg_data], topic_binary_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream(),
                    src_topic_input.get_example_stream(),
                    trg_topic_dataset.get_example_stream(),
                    trg_topic_binary_dataset.get_example_stream()],
                   ('source', 'target', 'source_topical',
                    'target_topic', 'target_binary_topic'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # (the topical sources are not touched by this mapping; check!)
    stream = Mapping(stream,
                     _oov_to_unk(src_vocab_size=src_vocab_size,
                                 trg_vocab_size=trg_vocab_size,
                                 src_topic_vocab_size=source_topic_vocab_size,
                                 trg_topic_vocab_size=trg_topic_vocab_size,
                                 unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream,
        [src_vocab_size - 1, trg_vocab_size - 1,
         source_topic_vocab_size - 1, trg_topic_vocab_size - 1,
         trg_topic_vocab_size - 1])

    return masked_stream
def main(config, tr_stream, dev_stream, use_bokeh=False): print("~def main") # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') print("~sampling_input = tensor.lmatrix") # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder( config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder( config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2) cost = decoder.cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) print("~source_sentence_mask, target_sentence, target_sentence_mask") logger.info('Creating computational graph') cg = ComputationGraph(cost) print("~ComputationGraph") # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() print("~decoder.initialize()") # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) print("~cg = apply_dropout") # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff']) print("~cg = apply_noise") # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) print("~logger.info") # Print parameter names enc_dec_param_dict = merge(Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}" .format(len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) print("~training_model") # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] print("~every_n_batches=config") # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 
if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
    logger.info("Building sampling model")
    sampling_representation = encoder.apply(
        sampling_input, tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)
    search_model = Model(generated)
    _, samples = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[1]))  # generated[1] is next_outputs
    sample = Sampler(model=search_model, data_stream=tr_stream,
                     hook_samples=config['hook_samples'],
                     every_n_batches=config['sampling_freq'],
                     src_vocab_size=config['src_vocab_size'])

# Add sampling
if config['hook_samples'] >= 1:
    logger.info("Building sampler")
    extensions.append(sample)

# Add early stopping based on bleu
if config['bleu_script'] is not None:
    logger.info("Building bleu validator")
    extensions.append(
        BleuValidator(sampling_input, samples=samples, config=config,
                      model=search_model, data_stream=dev_stream,
                      normalize=config['normalized_bleu'],
                      every_n_batches=config['bleu_val_freq']))

# Reload model if necessary
if config['reload']:
    extensions.append(LoadNMT(config['saveto']))

# Plot cost in bokeh if necessary
if use_bokeh and BOKEH_AVAILABLE:
    extensions.append(
        Plot('Cs-En', channels=[['decoder_cost_cost']], after_batch=True))

sampling_fn = search_model.get_theano_function()

print(" - - - - - - - - - - - - - - ")

sort_k_batches = 12
batch_size = 80
seq_len = 50
trg_ivocab = None
src_vocab_size = config['src_vocab_size']
trg_vocab_size = config['trg_vocab_size']
unk_id = config['unk_id']

src_vocab = config['src_vocab']
trg_vocab = config['trg_vocab']
src_vocab = ensure_special_tokens(
    src_vocab if isinstance(src_vocab, dict)
    else cPickle.load(open(src_vocab)),
    bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
trg_vocab = ensure_special_tokens(
    trg_vocab if isinstance(trg_vocab, dict)
    else cPickle.load(open(trg_vocab)),
    bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

if not trg_ivocab:
    trg_ivocab = {v: k for k, v in trg_vocab.items()}

src_data = config['src_data']
trg_data = config['trg_data']
src_dataset = TextFile([src_data], src_vocab, None)
trg_dataset = TextFile([trg_data], trg_vocab, None)

inputstringfile = "inputstringfile.cs"
input_dataset = TextFile([inputstringfile], src_vocab, None)

stream = Merge([input_dataset.get_example_stream(),
                trg_dataset.get_example_stream()],
               ('source', 'target'))
stream2 = Filter(stream, predicate=_too_long(seq_len=seq_len))
stream3 = Mapping(stream2,
                  _oov_to_unk(src_vocab_size=src_vocab_size,
                              trg_vocab_size=trg_vocab_size,
                              unk_id=unk_id))
stream4 = Batch(stream3,
                iteration_scheme=ConstantScheme(
                    batch_size * sort_k_batches))
stream5 = Mapping(stream4, SortMapping(_length))
stream6 = Unpack(stream5)
stream7 = Batch(stream6, iteration_scheme=ConstantScheme(batch_size))

input_stream = DataStream(input_dataset)
print("dev_stream : ", type(dev_stream))
print("input_stream : ", type(input_stream))

epochone = input_stream.get_epoch_iterator()
vocab = input_stream.dataset.dictionary
unk_sym = input_stream.dataset.unk_token
eos_sym = input_stream.dataset.eos_token

for i, line in enumerate(epochone):
    seq = oov_to_unk(line[0], config['src_vocab_size'], unk_id)
    input_ = numpy.tile(seq, (1, 1))
    print("seq : ", type(seq), seq)
    print("input_ : ", type(input_), input_, inspect.getmembers(input_))

    _1, outputs, _2, _3, costs = sampling_fn(input_)
    outputs = outputs.flatten()
    costs = costs.T

    print(" outputs : ", outputs, type(outputs))
    print("idx_to_word: ", idx_to_word(outputs, trg_ivocab))
    print(" - - - - - - - - - - - - - - ")
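
# Hedged sketches of two helpers used above but not defined in this
# snippet (my assumptions, not the original implementations):
#
#     def oov_to_unk(seq, vocab_size, unk_id):
#         # clip out-of-vocabulary ids to the unknown-word id
#         return [idx if idx < vocab_size else unk_id for idx in seq]
#
#     def idx_to_word(seq, ivocab):
#         # map target ids back to tokens via the inverted vocabulary
#         return " ".join(ivocab.get(int(idx), '<unk>') for idx in seq)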
class _too_long(object):
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        return all([len(sentence) <= self.seq_len
                    for sentence in sentence_pair])


fi_vocab = config['src_vocab']
en_vocab = config['trg_vocab']
fi_file = config['src_data']
en_file = config['trg_data']

fi_dataset = TextFile([fi_file], cPickle.load(open(fi_vocab)), None)
en_dataset = TextFile([en_file], cPickle.load(open(en_vocab)), None)

stream = Merge([fi_dataset.get_example_stream(),
                en_dataset.get_example_stream()],
               ('source', 'target'))
stream = Filter(stream, predicate=_too_long(config['seq_len']))
stream = Mapping(stream, _oov_to_unk(
    src_vocab_size=config['src_vocab_size'],
    trg_vocab_size=config['trg_vocab_size'],
    unk_id=config['unk_id']))
stream = Batch(stream,
               iteration_scheme=ConstantScheme(
                   config['batch_size'] * config['sort_k_batches']))
stream = Mapping(stream, SortMapping(_length))
stream = Unpack(stream)
stream = Batch(stream,
               iteration_scheme=ConstantScheme(config['batch_size']))
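
# Hedged usage sketch for the predicate above (example values are mine):
#
#     keep = _too_long(seq_len=50)
#     keep(([3, 7, 9], [4, 2]))   # True  -- both sides fit in 50 tokens
#     keep(([0] * 51, [4, 2]))    # False -- source side is too long
#
# Despite its name, _too_long returns True for pairs that are short
# enough, i.e. the pairs that Filter should keep.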
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in params.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, params=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            application=generator.readout.readout,
            name="output")(cg.variables)
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition("after_batch", _is_nan),
                Plot(os.path.basename(save_path),
                     [[average_monitoring.record_name(cost)],
                      [average_monitoring.record_name(cost_per_character)]],
                     every_n_batches=10),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_param_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(input_.shape[1], samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(
                            char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n" if mode == "sample"
                       else "Enter the beam size\n")
            batch_size = int(input(message))

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
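
# Hedged sketch of the small helpers the pipeline above assumes (plausible
# definitions, not necessarily the originals):
#
#     def _lower(s):
#         return s.lower()
#
#     def _filter_long(data):
#         return len(data[0]) <= 100
#
#     def _is_nan(log):
#         return math.isnan(log.current_row['total_gradient_norm'])
#
# reverse_words is assumed to map a character sequence to the same sequence
# with every space-delimited word reversed, e.g. "abc de" -> "cba ed".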
class _too_long(object):
    # (the constructor that sets self.seq_len is not part of this snippet)
    def __call__(self, sentence_pair):
        return all(
            [len(sentence) <= self.seq_len for sentence in sentence_pair])


fi_vocab = config['src_vocab']
en_vocab = config['trg_vocab']
fi_file = config['src_data']
en_file = config['trg_data']

fi_dataset = TextFile([fi_file], cPickle.load(open(fi_vocab)), None)
en_dataset = TextFile([en_file], cPickle.load(open(en_vocab)), None)

stream = Merge(
    [fi_dataset.get_example_stream(), en_dataset.get_example_stream()],
    ('source', 'target'))
stream = Filter(stream, predicate=_too_long(config['seq_len']))
stream = Mapping(
    stream,
    _oov_to_unk(src_vocab_size=config['src_vocab_size'],
                trg_vocab_size=config['trg_vocab_size'],
                unk_id=config['unk_id']))
stream = Batch(stream,
               iteration_scheme=ConstantScheme(config['batch_size'] *
                                               config['sort_k_batches']))
stream = Mapping(stream, SortMapping(_length))
stream = Unpack(stream)
stream = Batch(stream,
               iteration_scheme=ConstantScheme(config['batch_size']))
def get_src_trg_stream(cg, config, src_datasets=None, trg_datasets=None,
                       is_training=True, src_vocabs=None, trg_vocabs=None,
                       logprob_datasets=None):
    eid, did = p_(cg)
    if is_training:
        logger.info(' ... src:[{}] - [{}]'.format(
            eid, src_datasets[cg].files[0]))
        logger.info(' ... trg:[{}] - [{}]'.format(
            did, trg_datasets[cg].files[0]))
        stream = Merge([src_datasets[cg].get_example_stream(),
                        trg_datasets[cg].get_example_stream()],
                       ('source', 'target'))
        stream = Filter(stream, predicate=_too_long(config['seq_len']))

        if 'min_seq_lens' in config and config['min_seq_lens'][cg] > 0:
            stream = Filter(stream,
                            predicate=_too_short(config['min_seq_lens'][cg]))

        stream = Mapping(stream, _oov_to_unk(
            src_vocab_size=config['src_vocab_sizes'][eid],
            trg_vocab_size=config['trg_vocab_sizes'][did],
            unk_id=config['unk_id']))
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           config['batch_sizes'][cg] *
                           config['sort_k_batches']))

        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           config['batch_sizes'][cg]))
    else:  # logprob stream
        src_dataset = TextFile([logprob_datasets[cg][0]],
                               src_vocabs[p_(cg)[0]], None)
        trg_dataset = TextFile([logprob_datasets[cg][1]],
                               trg_vocabs[p_(cg)[1]], None)
        stream = Merge([src_dataset.get_example_stream(),
                        trg_dataset.get_example_stream()],
                       ('source', 'target'))
        stream = Mapping(stream, _oov_to_unk(
            src_vocab_size=config['src_vocab_sizes'][eid],
            trg_vocab_size=config['trg_vocab_sizes'][did],
            unk_id=config['unk_id']))
        bs = 100
        if 'log_prob_bs' in config:
            if isinstance(config['log_prob_bs'], dict):
                bs = config['log_prob_bs'][cg]
            else:
                bs = config['log_prob_bs']
        stream = Batch(stream, iteration_scheme=ConstantScheme(bs))

    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream,
        _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                       (2, 0, config['trg_eos_idxs'][did])]))
    return masked_stream
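
# Hedged reading of the final remapping step (an assumption based on the
# call site, not on the source of _remapWordIdx): each triple appears to be
# (source_index, old_id, new_id) over the padded batch, where indices 0 and
# 2 are 'source' and 'target' once Padding has inserted the masks. E.g.,
# with a hypothetical EOS id of 29999:
#
#     remap = _remapWordIdx([(0, 0, 29999)])
#     # rewrites every 0 (Fuel's default padding value) in the source
#     # matrix to 29999, so padding and EOS share the same id.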
def main(mode, save_path, steps, num_batches, load_params):
    chars = (list(string.ascii_uppercase) + list(string.digits) +
             [' ', '.', ',', '\'', '"', '!', '?', '<UNK>'])
    char_to_ind = {char: i for i, char in enumerate(chars)}
    ind_to_char = {v: k for k, v in char_to_ind.iteritems()}

    train_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_train'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')
    valid_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_valid'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')
    vocab_size = len(char_to_ind)
    logger.info('Dictionary size: {}'.format(vocab_size))

    if mode == 'continue':
        continue_training(save_path)
        return
    elif mode == "sample":
        main_loop = load(open(save_path, "rb"))
        generator = main_loop.model.get_top_bricks()[-1]

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]
        print("".join([ind_to_char[s] for s in outputs]))
        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()

        trans_freqs = numpy.zeros((vocab_size, vocab_size), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        return

    # Experiment configuration
    batch_size = 20
    dim = 650
    feedback_dim = 650

    valid_stream = valid_dataset.get_example_stream()
    valid_stream = Batch(valid_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    valid_stream = Padding(valid_stream)
    valid_stream = Mapping(valid_stream, _transpose)

    # Build the bricks and initialize them
    transition = GatedRecurrent(name="transition", dim=dim,
                                activation=Tanh())
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size, source_names=transition.apply.states,
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(
                    vocab_size, feedback_dim, name='feedback'),
                name="readout"),
        transition,
        weights_init=Uniform(std=0.04), biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
    transition.push_initialization_config()
    generator.initialize()

    # Build the cost computation graph.
    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    cost_matrix = generator.cost_matrix(features, mask=features_mask)
    batch_cost = cost_matrix.sum()
    cost = aggregation.mean(batch_cost, features.shape[1])
    cost.name = "sequence_log_likelihood"
    char_cost = aggregation.mean(batch_cost, features_mask.sum())
    char_cost.name = 'character_log_likelihood'
    ppl = 2 ** (cost / numpy.log(2))
    ppl.name = 'ppl'
    bits_per_char = char_cost / tensor.log(2)
    bits_per_char.name = 'bits_per_char'
    length = features.shape[0]
    length.name = 'length'

    model = Model(batch_cost)
    if load_params:
        params = load_parameter_values(save_path)
        model.set_parameter_values(params)

    if mode == "train":
        # Give an idea of what's going on.
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in Selector(generator).get_parameters().items()],
                        width=120))

        train_stream = train_dataset.get_example_stream()
        train_stream = Mapping(train_stream, _truncate)
        train_stream = Batch(train_stream,
                             iteration_scheme=ConstantScheme(batch_size))
        train_stream = Padding(train_stream)
        train_stream = Mapping(train_stream, _transpose)

        parameters = model.get_parameter_dict()
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(
            parameters.values())
        algorithm = GradientDescent(
            cost=batch_cost, parameters=parameters.values(),
            step_rule=CompositeRule([StepClipping(1000.),
                                     AdaDelta(epsilon=1e-8)
                                     # , Restrict(VariableClipping(1.0,
                                     #                             axis=0),
                                     #            maxnorm_subjects)
                                     ]))

        ft = features[:6, 0]
        ft.name = 'feature_example'

        observables = [cost, ppl, char_cost, length, bits_per_char]
        for name, param in parameters.items():
            num_elements = numpy.product(param.get_value().shape)
            norm = param.norm(2) / num_elements ** 0.5
            grad_norm = (algorithm.gradients[param].norm(2) /
                         num_elements ** 0.5)
            step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
            stats = tensor.stack(norm, grad_norm, step_norm,
                                 step_norm / grad_norm)
            stats.name = name + '_stats'
            observables.append(stats)

        track_the_best_bpc = TrackTheBest('valid_bits_per_char')
        root_path, extension = os.path.splitext(save_path)

        this_step_monitoring = TrainingDataMonitoring(
            observables + [ft], prefix="this_step", after_batch=True)
        average_monitoring = TrainingDataMonitoring(
            observables + [algorithm.total_step_norm,
                           algorithm.total_gradient_norm],
            prefix="average", every_n_batches=10)
        valid_monitoring = DataStreamMonitoring(
            observables, prefix="valid", every_n_batches=1500,
            before_training=False, data_stream=valid_stream)

        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=train_stream,
            model=model,
            extensions=[
                this_step_monitoring,
                average_monitoring,
                valid_monitoring,
                track_the_best_bpc,
                Checkpoint(save_path),
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"],
                           use_cpickle=True)
                .add_condition(
                    ['after_epoch'],
                    OnLogRecord(track_the_best_bpc.notification_name),
                    (root_path + "_best" + extension,)),
                Timing(after_batch=True),
                Printing(every_n_batches=10),
                Plot(root_path,
                     [[average_monitoring.record_name(cost),
                       valid_monitoring.record_name(cost)],
                      [average_monitoring.record_name(
                          algorithm.total_step_norm)],
                      [average_monitoring.record_name(
                          algorithm.total_gradient_norm)],
                      [average_monitoring.record_name(ppl),
                       valid_monitoring.record_name(ppl)],
                      [average_monitoring.record_name(char_cost),
                       valid_monitoring.record_name(char_cost)],
                      [average_monitoring.record_name(bits_per_char),
                       valid_monitoring.record_name(bits_per_char)]],
                     every_n_batches=10)
            ])
        main_loop.run()
    elif mode == 'evaluate':
        with open('/data/lisatmp3/serdyuk/wsj_lms/lms/'
                  'wsj_trigram_with_initial_eos/lexicon.txt') as f:
            raw_words = [line.split()[1:-1] for line in f.readlines()]
        words = [[char_to_ind[c] if c in char_to_ind else char_to_ind['<UNK>']
                  for c in w] for w in raw_words]
        max_word_length = max([len(w) for w in words])

        initial_states = tensor.matrix('init_states')
        cost_matrix_step = generator.cost_matrix(
            features, mask=features_mask, states=initial_states)
        cg = ComputationGraph(cost_matrix_step)
        states = cg.auxiliary_variables[-2]

        compute_cost = theano.function(
            [features, features_mask, initial_states],
            [cost_matrix_step.sum(axis=0), states])

        cost_matrix = generator.cost_matrix(features, mask=features_mask)
        initial_cg = ComputationGraph(cost_matrix)
        initial_states = initial_cg.auxiliary_variables[-2]

        total_word_cost = 0
        num_words = 0
        examples = numpy.zeros((max_word_length + 1, len(words)),
                               dtype='int64')
        all_masks = numpy.zeros((max_word_length + 1, len(words)),
                                dtype=floatX)
        for i, word in enumerate(words):
            examples[:len(word), i] = word
            all_masks[:len(word), i] = 1.

        single_space = numpy.array([char_to_ind[' ']])[:, None]
        for batch in valid_stream.get_epoch_iterator():
            for example, mask in equizip(batch[0].T, batch[1].T):
                example = example[:(mask.sum())]
                spc_inds = list(numpy.where(example == char_to_ind[" "])[0])
                state = generator.transition.transition.initial_states_ \
                    .get_value()[None, :]
                for i, j in equizip([-1] + spc_inds, spc_inds + [-1]):
                    word = example[(i + 1):j, None]
                    word_cost, states = compute_cost(
                        word, numpy.ones_like(word, dtype=floatX), state)
                    state = states[-1]

                    costs = numpy.exp(-compute_cost(
                        examples, all_masks,
                        numpy.tile(state, [examples.shape[1], 1]))[0])

                    _, space_states = compute_cost(
                        single_space,
                        numpy.ones_like(single_space, dtype=floatX), state)
                    state = space_states[-1]

                    word_prob = numpy.exp(-word_cost)
                    total_word_cost += word_cost + numpy.log(numpy.sum(costs))
                    num_words += 1
                    print(word_prob)
                    print(numpy.sum(costs))

        print("Average cost", total_word_cost / num_words)
        print("PPL", numpy.exp(total_word_cost / num_words))
        print("Word-level perplexity")
        print(total_word_cost / num_words)
    else:
        assert False
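
# Sanity check on the monitored quantities above (worked numbers are mine):
# bits_per_char = char_cost / ln 2, so a per-character cost of
# ln 2 ~= 0.693 nats is exactly 1.0 bit per character, and
# ppl = 2 ** (cost / ln 2) = exp(cost), i.e. the perplexity is just the
# exponentiated mean per-sequence negative log-likelihood in nats.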
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = chars.shape[1].copy(name="batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in parameters.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            applications=[generator.readout.readout],
            name_regex="output")(cg.variables)
        (activations,) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = chars.shape[0].copy(name="max_length")
        cost_per_character = aggregation.mean(
            batch_cost, batch_size * max_length).copy(
                name="character_log_likelihood")
        min_energy = energies.min().copy(name="min_energy")
        max_energy = energies.max().copy(name="max_energy")
        mean_activation = abs(activations).mean().copy(
            name="mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, parameter in parameters.items():
            observables.append(parameter.norm(2).copy(name + "_norm"))
            observables.append(algorithm.gradients[parameter].norm(2).copy(
                name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_parameter_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    applications=[reverser.generator.generate],
                    name="outputs")(ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(
                            char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            try:
                line = input("Enter a sentence\n")
                message = ("Enter the number of samples\n"
                           if mode == "sample"
                           else "Enter the beam size\n")
                batch_size = int(input(message))
            except EOFError:
                break
            except Exception:
                traceback.print_exc()
                continue

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
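
# Hedged usage sketch (paths and numbers are made up):
#
#     main("train", "reverser.tar", num_batches=10000,
#          data_path=["corpus.txt"])
#     main("beam_search", "reverser.tar", num_batches=0)
#     # type a sentence and a beam size at the prompts; the model prints
#     # candidate word-reversed strings, lowest-cost last.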