def main():
    args = parse_args()

    state = getattr(experiments.nmt, args.proto)()
    if args.state:
        if args.state.endswith(".py"):
            state.update(eval(open(args.state).read()))
        else:
            with open(args.state) as src:
                state.update(cPickle.load(src))
    for change in args.changes:
        state.update(eval("dict({})".format(change)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    logger.debug("State:\n{}".format(pprint.pformat(state)))

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, args.skip_init)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()

    logger.debug("Load data")
    train_data = get_batch_iterator(state)

    logger.debug("Compile trainer")
    algo = eval(state['algo'])(lm_model, state, train_data)

    logger.debug("Run training")
    main = MainLoop(train_data, None, None, lm_model, algo, state, None,
                    reset=state['reset'],
                    hooks=[RandomSamplePrinter(state, lm_model, train_data)]
                          if state['hookFreq'] >= 0 else None)
    if state['reload']:
        main.load()
    if state['loopIters'] > 0:
        main.main()
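# parse_args() is not defined in this file. A minimal sketch consistent with
# how main() uses `args` (proto, state, changes, skip_init) might look like
# the following; the flag names, defaults, and help strings are assumptions.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(
        "Train an RNN encoder-decoder translation model")
    parser.add_argument("--proto", default="prototype_state",
                        help="Prototype state function from experiments.nmt")
    parser.add_argument("--state", default="",
                        help="State file (.py containing a dict literal,"
                             " or a pickle)")
    parser.add_argument("--skip-init", dest="skip_init",
                        action="store_true", default=False,
                        help="Skip parameter initialization"
                             " (e.g. when reloading a saved model)")
    parser.add_argument("changes", nargs="*", default=[],
                        help="Python-syntax state overrides, e.g. seed=5 bs=64")
    return parser.parse_args()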
def main():
    args = parse_args()

    state = getattr(experiments.nmt, args.proto)()
    if args.state:
        if args.state.endswith(".py"):
            state.update(eval(open(args.state).read()))
        else:
            with open(args.state) as src:
                state.update(cPickle.load(src))
    for change in args.changes:
        state.update(eval("dict({})".format(change)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    logger.debug("State:\n{}".format(pprint.pformat(state)))

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng,
                                skip_init=args.skip_init,
                                compute_alignment=True)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()

    logger.debug("Load data")
    train_data = get_batch_iterator(state)

    logger.debug("Compile trainer")
    algo = eval(state['algo'])(lm_model, state, train_data)

    logger.debug("Run training")
    main = MainLoop(train_data, None, None, lm_model, algo, state, None,
                    reset=state['reset'],
                    hooks=[RandomSamplePrinter(state, lm_model, train_data)]
                          if state['hookFreq'] >= 0 else None)
    if state['reload']:
        main.load()
    if state['loopIters'] > 0:
        main.main()
def main():
    args = parse_args()
    print 'syscomb'

    state = getattr(experiments.nmt, args.proto)()
    if args.state:
        if args.state.endswith(".py"):
            state.update(eval(open(args.state).read()))
        else:
            with open(args.state) as src:
                state.update(cPickle.load(src))
    for change in args.changes:
        state.update(eval("dict({})".format(change)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    logger.debug("State:\n{}".format(pprint.pformat(state)))

    rng = numpy.random.RandomState(state['seed'])
    if state['syscomb']:
        enc_dec = SystemCombination(state, rng, args.skip_init)
    else:
        enc_dec = RNNEncoderDecoder(state, rng, args.skip_init)
    enc_dec.build()
    if state['algo'] == 'SGD_mrt':
        train_sampler = enc_dec.create_sampler(many_samples=True)
    lm_model = enc_dec.create_lm_model()
    print 'lm model inputs:', lm_model.inputs

    logger.debug("Load data")
    if state['syscomb']:
        train_data = get_batch_iterator_multi(state)
        sampler = RandomSamplePrinter_multi(state, lm_model, train_data,
                                            enc_dec)
    else:
        train_data = get_batch_iterator(state)
        sampler = RandomSamplePrinter(state, lm_model, train_data)

    logger.debug("Compile trainer")
    if state['algo'] == 'SGD_mrt':
        algo = eval(state['algo'])(lm_model, state, train_data, train_sampler)
    else:
        algo = eval(state['algo'])(lm_model, state, train_data)

    logger.debug("Run training")
    main = MainLoop(train_data, None, None, lm_model, algo, state, None,
                    reset=state['reset'],
                    hooks=[sampler] if state['hookFreq'] >= 0 else None)
    if state['reload']:
        main.load()
    if state['loopIters'] > 0:
        main.main()
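# The SGD_mrt branch above passes a many-samples sampler to the trainer,
# which suggests minimum risk training (MRT): draw several candidate
# translations per source and minimize the expected loss under a sharpened
# model distribution. A standalone numeric sketch of that objective follows;
# the function name, alpha default, and array shapes are illustrative
# assumptions, not the actual SGD_mrt implementation.
import numpy

def expected_risk(sample_log_probs, sample_bleus, alpha=0.005):
    # q(y|x) is proportional to p(y|x)^alpha over the sampled candidates
    scores = alpha * sample_log_probs
    scores -= scores.max()  # for numerical stability
    q = numpy.exp(scores) / numpy.exp(scores).sum()
    # risk = expected (1 - BLEU) of the candidates under q
    return (q * (1.0 - sample_bleus)).sum()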
def main():
    args = parse_args()

    # This loads the state specified in the prototype
    state = getattr(experiments.nmt, args.proto)()
    # This is based on the suggestion in the README.md in this folder
    if args.state:
        if args.state.endswith(".py"):
            state.update(eval(open(args.state).read()))
        else:
            with open(args.state) as src:
                state.update(cPickle.load(src))
    for change in args.changes:
        state.update(eval("dict({})".format(change)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    logger.debug("State:\n{}".format(pprint.pformat(state)))

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, args.skip_init)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()

    # If we are going to use validation with the BLEU script, we
    # will need early stopping
    bleu_validator = None
    if state['bleu_script'] is not None \
            and state['validation_set'] is not None \
            and state['validation_set_grndtruth'] is not None:
        # Make beam search
        beam_search = BeamSearch(enc_dec)
        beam_search.compile()
        bleu_validator = BleuValidator(state, lm_model, beam_search,
                                       verbose=state['output_validation_set'])

    logger.debug("Load data")
    train_data = get_batch_iterator(state)

    logger.debug("Compile trainer")
    algo = eval(state['algo'])(lm_model, state, train_data)

    logger.debug("Run training")
    main = MainLoop(train_data, None, None, lm_model, algo, state, None,
                    reset=state['reset'],
                    bleu_val_fn=bleu_validator,
                    hooks=[RandomSamplePrinter(state, lm_model, train_data)]
                          if state['hookFreq'] >= 0
                          and state['validation_set'] is not None else None)
    if state['reload']:
        main.load()
    if state['loopIters'] > 0:
        main.main()
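# BleuValidator is defined elsewhere. For reference, a minimal sketch of the
# kind of check it performs: decode the validation set with beam search and
# score against the ground truth with an external BLEU script such as
# multi-bleu.perl. The helper `translate`, the paths, and the output-parsing
# details below are assumptions, not the actual BleuValidator interface.
import subprocess

def validation_bleu(beam_search, val_src, val_ref, bleu_script, hyp_path):
    with open(val_src) as src, open(hyp_path, 'w') as hyp:
        for line in src:
            hyp.write(translate(beam_search, line.strip()) + '\n')
    # multi-bleu.perl reads hypotheses on stdin and prints "BLEU = xx.xx, ..."
    with open(hyp_path) as hyp:
        out = subprocess.check_output(['perl', bleu_script, val_ref],
                                      stdin=hyp)
    return float(out.split('=')[1].split(',')[0])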
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Case study of language modeling with RNN",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode", choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
             " and `sample` to sample a sequence generated by an"
             " existing one.")
    parser.add_argument(
        "prefix", default="sine",
        help="The prefix for model, timing and state files")
    parser.add_argument(
        "state", nargs="?", default="",
        help="Changes to Groundhog state")
    parser.add_argument("--path", help="Path to a language dataset")
    parser.add_argument("--dict", help="Path to the dataset dictionary")
    parser.add_argument("--restart", action="store_true", default=False,
                        help="Start anew")
    parser.add_argument(
        "--reset", action="store_true", default=False,
        help="Reset the hidden state between batches")
    parser.add_argument(
        "--steps", type=int, default=100,
        help="Number of steps to plot for the 'sample' mode"
             " OR training sequence length for the 'train' mode.")
    args = parser.parse_args()
    logger.debug("Args:\n" + str(args))

    dim = 200
    num_chars = 50

    transition = GatedRecurrent(
        name="transition", activation=Tanh(), dim=dim,
        weights_init=Orthogonal())
    generator = SequenceGenerator(
        LinearReadout(readout_dim=num_chars, source_names=["states"],
                      emitter=SoftmaxEmitter(name="emitter"),
                      feedbacker=LookupFeedback(
                          num_chars, dim, name='feedback'),
                      name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.allocate()
    logger.debug("Parameters:\n" + pprint.pformat(
        [(key, value.get_value().shape) for key, value
         in Selector(generator).get_params().items()],
        width=120))

    if args.mode == "train":
        batch_size = 1
        seq_len = args.steps

        generator.initialize()

        # Build the cost computation graph that uses the saved hidden states.
        # An issue: for Groundhog this is completely transparent, which is
        # why it does not carry the hidden state over the period when
        # validation is done. We should find a way to fix this in the future.
        x = tensor.lmatrix('x')
        init_states = shared_floatx_zeros((batch_size, dim),
                                          name='init_states')
        reset = tensor.scalar('reset')
        cost = ComputationGraph(
            generator.cost(x, states=init_states * reset).sum())
        # TODO: better search routine
        states = [v for v in cost.variables
                  if hasattr(v.tag, 'application_call')
                  and v.tag.application_call.brick == generator.transition
                  and (v.tag.application_call.application ==
                       generator.transition.apply)
                  and v.tag.role == VariableRole.OUTPUT
                  and v.tag.name == 'states']
        assert len(states) == 1
        states = states[0]

        gh_model = GroundhogModel(generator, cost)
        # Bits-per-character: convert the summed cross-entropy from nats to
        # bits (divide by ln 2) and normalize by the sequence length.
        gh_model.properties.append(
            ('bpc', cost.outputs[0] / (numpy.log(2) * seq_len)))
        gh_model.properties.append(('mean_init_state', init_states.mean()))
        gh_model.properties.append(('reset', reset))
        if not args.reset:
            gh_model.updates.append((init_states, states[-1]))

        state = GroundhogState(args.prefix, batch_size,
                               learning_rate=0.0001).as_dict()
        changes = eval("dict({})".format(args.state))
        state.update(changes)

        def output_format(x, y, reset):
            return dict(x=x[:, None], reset=reset)
        train, valid, test = [
            LMIterator(batch_size=batch_size,
                       use_infinite_loop=mode == 'train',
                       path=args.path,
                       seq_len=seq_len,
                       mode=mode,
                       chunks='chars',
                       output_format=output_format,
                       can_fit=True)
            for mode in ['train', 'valid', 'test']]

        trainer = SGD(gh_model, state, train)
        state['on_nan'] = 'warn'
        state['cutoff'] = 1.

        main_loop = MainLoop(train, valid, None, gh_model, trainer, state,
                             None)
        if not args.restart:
            main_loop.load()
        main_loop.main()
    elif args.mode == "sample":
        load_params(generator, args.prefix + "model.npz")

        chars = numpy.load(args.dict)['unique_chars']

        sample = ComputationGraph(generator.generate(
            n_steps=args.steps, batch_size=10, iterate=True)).function()
        states, outputs, costs = sample()

        for i in range(10):
            print("Generation cost: {}".format(costs[:, i].sum()))
            print("".join([chars[o] for o in outputs[:, i]]))
    else:
        assert False
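# The `states=init_states * reset` trick above implements hidden-state
# carry-over between batches: the iterator emits reset=0 at sequence
# boundaries (zeroing the initial state) and reset=1 otherwise, while the
# shared variable is updated to the last hidden state after every batch.
# A self-contained illustration of the same mechanism; the "RNN step" here
# is a stand-in (old state + 1), purely for demonstration.
import numpy
import theano
from theano import tensor

dim = 3
init_states = theano.shared(
    numpy.zeros((1, dim), dtype=theano.config.floatX), name='init_states')
reset = tensor.scalar('reset')
# Stand-in for one RNN step over a batch: "new state" = old state + 1
new_states = init_states * reset + 1
step = theano.function([reset], new_states,
                       updates=[(init_states, new_states)])
step(1.)  # carries state: returns [[1, 1, 1]]
step(1.)  # -> [[2, 2, 2]]
step(0.)  # sequence boundary: state zeroed before the step -> [[1, 1, 1]]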
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Case study of generating a Markov chain with RNN.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode", choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
             " and `sample` to sample a sequence generated by an"
             " existing one.")
    parser.add_argument(
        "prefix", default="sine",
        help="The prefix for model, timing and state files")
    parser.add_argument(
        "--steps", type=int, default=100,
        help="Number of steps to plot")
    args = parser.parse_args()

    dim = 10
    num_states = ChainIterator.num_states
    feedback_dim = 8

    transition = GatedRecurrent(name="transition", activation=Tanh(),
                                dim=dim)
    generator = SequenceGenerator(
        LinearReadout(readout_dim=num_states, source_names=["states"],
                      emitter=SoftmaxEmitter(name="emitter"),
                      feedbacker=LookupFeedback(
                          num_states, feedback_dim, name='feedback'),
                      name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.allocate()
    logger.debug("Parameters:\n" + pprint.pformat(
        [(key, value.get_value().shape) for key, value
         in Selector(generator).get_params().items()],
        width=120))

    if args.mode == "train":
        rng = numpy.random.RandomState(1)
        batch_size = 50

        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        logger.debug("transition.weights_init={}".format(
            transition.weights_init))

        cost = generator.cost(tensor.lmatrix('x')).sum()

        gh_model = GroundhogModel(generator, cost)
        state = GroundhogState(args.prefix, batch_size,
                               learning_rate=0.0001).as_dict()
        data = ChainIterator(rng, 100, batch_size)
        trainer = SGD(gh_model, state, data)
        main_loop = MainLoop(data, None, None, gh_model, trainer, state,
                             None)
        main_loop.main()
    elif args.mode == "sample":
        load_params(generator, args.prefix + "model.npz")

        sample = ComputationGraph(generator.generate(
            n_steps=args.steps, batch_size=1, iterate=True)).function()
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               ChainIterator.equilibrium))

        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, ChainIterator.trans_prob))
    else:
        assert False
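# ChainIterator is defined elsewhere; the script above only relies on it
# exposing num_states, equilibrium, trans_prob, and batches of sampled
# chains. A minimal sketch of such a data source follows. The attribute
# names match how the script uses them, but the number of states and the
# transition matrix here are example values, not the real ones.
import numpy

class ChainIterator(object):
    num_states = 3
    trans_prob = numpy.array([[0.1, 0.5, 0.4],
                              [0.3, 0.3, 0.4],
                              [0.6, 0.2, 0.2]])
    equilibrium = None  # stationary distribution, filled in below

    def __init__(self, rng, seq_len, batch_size):
        self.rng, self.seq_len, self.batch_size = rng, seq_len, batch_size

    def __iter__(self):
        return self

    def next(self):  # Python 2 iterator protocol
        x = numpy.zeros((self.seq_len, self.batch_size), dtype='int64')
        for t in range(1, self.seq_len):
            for b in range(self.batch_size):
                x[t, b] = self.rng.choice(self.num_states,
                                          p=self.trans_prob[x[t - 1, b]])
        return dict(x=x)

# Equilibrium = left eigenvector of trans_prob for eigenvalue 1, normalized
_vals, _vecs = numpy.linalg.eig(ChainIterator.trans_prob.T)
ChainIterator.equilibrium = numpy.real(_vecs[:, numpy.argmin(abs(_vals - 1))])
ChainIterator.equilibrium /= ChainIterator.equilibrium.sum()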
def jobman(state, channel):
    # load dataset
    state['null_sym_source'] = 15000
    state['null_sym_target'] = 15000
    state['n_sym_source'] = state['null_sym_source'] + 1
    state['n_sym_target'] = state['null_sym_target'] + 1
    state['nouts'] = state['n_sym_target']
    state['nins'] = state['n_sym_source']

    rng = numpy.random.RandomState(state['seed'])

    if state['loopIters'] > 0:
        train_data, valid_data, test_data = get_data(state)
    else:
        train_data = None
        valid_data = None
        test_data = None

    ########### Training graph #####################
    ## 1. Inputs
    if state['bs'] == 1:
        x = TT.lvector('x')
        x_mask = TT.vector('x_mask')
        y = TT.lvector('y')
        y0 = y
        y_mask = TT.vector('y_mask')
    else:
        x = TT.lmatrix('x')
        x_mask = TT.matrix('x_mask')
        y = TT.lmatrix('y')
        y0 = y
        y_mask = TT.matrix('y_mask')

    # 2. Layers and Operators
    bs = state['bs']
    embdim = state['dim_mlp']

    # Source Sentence
    emb = MultiLayer(
        rng,
        n_in=state['nins'],
        n_hids=[state['rank_n_approx']],
        activation=[state['rank_n_activ']],
        init_fn=state['weight_init_fn'],
        weight_noise=state['weight_noise'],
        scale=state['weight_scale'],
        name='emb')

    emb_words = []
    if state['rec_gating']:
        gater_words = []
    if state['rec_reseting']:
        reseter_words = []
    for si in xrange(state['encoder_stack']):
        emb_words.append(MultiLayer(
            rng,
            n_in=state['rank_n_approx'],
            n_hids=[embdim],
            activation=['lambda x:x'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            name='emb_words_%d' % si))
        if state['rec_gating']:
            gater_words.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias=False,
                name='gater_words_%d' % si))
        if state['rec_reseting']:
            reseter_words.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias=False,
                name='reseter_words_%d' % si))

    add_rec_step = []
    rec_proj = []
    if state['rec_gating']:
        rec_proj_gater = []
    if state['rec_reseting']:
        rec_proj_reseter = []
    for si in xrange(state['encoder_stack']):
        if si > 0:
            rec_proj.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[embdim],
                activation=['lambda x:x'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['rec_weight_scale'],
                name='rec_proj_%d' % si))
            if state['rec_gating']:
                rec_proj_gater.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias=False,
                    name='rec_proj_gater_%d' % si))
            if state['rec_reseting']:
                rec_proj_reseter.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias=False,
                    name='rec_proj_reseter_%d' % si))
        add_rec_step.append(eval(state['rec_layer'])(
            rng,
            n_hids=state['dim'],
            activation=state['activ'],
            bias_scale=state['bias'],
            scale=state['rec_weight_scale'],
            init_fn=state['rec_weight_init_fn'],
            weight_noise=state['weight_noise_rec'],
            dropout=state['dropout_rec'],
            gating=state['rec_gating'],
            gater_activation=state['rec_gater'],
            reseting=state['rec_reseting'],
            reseter_activation=state['rec_reseter'],
            name='add_h_%d' % si))

    def _add_op(words_embeddings,
                words_mask=None,
                prev_val=None,
                si=0,
                state_below=None,
                gater_below=None,
                reseter_below=None,
                one_step=False,
                bs=1,
                init_state=None,
                use_noise=True):
        seqlen = words_embeddings.out.shape[0] // bs
        rval = words_embeddings
        gater = None
        reseter = None
        if state['rec_gating']:
            gater = gater_below
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            rval += rec_proj[si - 1](state_below,
                                     one_step=one_step,
                                     use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_gater[si - 1](state_below,
                                               one_step=one_step,
                                               use_noise=use_noise)
                if gater:
                    gater += projg
                else:
                    gater = projg
            if state['rec_reseting']:
                projg = rec_proj_reseter[si - 1](state_below,
                                                 one_step=one_step,
                                                 use_noise=use_noise)
                if reseter:
                    reseter += projg
                else:
                    reseter = projg
        if not one_step:
            rval = add_rec_step[si](
                rval,
                nsteps=seqlen,
                batch_size=bs,
                mask=words_mask,
                gater_below=gater,
                reseter_below=reseter,
                one_step=one_step,
                init_state=init_state,
                use_noise=use_noise)
        else:
            rval = add_rec_step[si](
                rval,
                mask=words_mask,
                state_before=prev_val,
                gater_below=gater,
                reseter_below=reseter,
                one_step=one_step,
                init_state=init_state,
                use_noise=use_noise)
        return rval
    add_op = Operator(_add_op)

    # Target Sentence
    emb_t = MultiLayer(
        rng,
        n_in=state['nouts'],
        n_hids=[state['rank_n_approx']],
        activation=[state['rank_n_activ']],
        init_fn=state['weight_init_fn'],
        weight_noise=state['weight_noise'],
        scale=state['weight_scale'],
        name='emb_t')

    emb_words_t = []
    if state['rec_gating']:
        gater_words_t = []
    if state['rec_reseting']:
        reseter_words_t = []
    for si in xrange(state['decoder_stack']):
        emb_words_t.append(MultiLayer(
            rng,
            n_in=state['rank_n_approx'],
            n_hids=[embdim],
            activation=['lambda x:x'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            name='emb_words_t_%d' % si))
        if state['rec_gating']:
            gater_words_t.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias=False,
                name='gater_words_t_%d' % si))
        if state['rec_reseting']:
            reseter_words_t.append(MultiLayer(
                rng,
                n_in=state['rank_n_approx'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                learn_bias=False,
                name='reseter_words_t_%d' % si))

    proj_everything_t = []
    if state['rec_gating']:
        gater_everything_t = []
    if state['rec_reseting']:
        reseter_everything_t = []
    for si in xrange(state['decoder_stack']):
        proj_everything_t.append(MultiLayer(
            rng,
            n_in=state['dim'],
            n_hids=[embdim],
            activation=['lambda x:x'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            name='proj_everything_t_%d' % si,
            learn_bias=False))
        if state['rec_gating']:
            gater_everything_t.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                name='gater_everything_t_%d' % si,
                learn_bias=False))
        if state['rec_reseting']:
            reseter_everything_t.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim']],
                activation=['lambda x:x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                name='reseter_everything_t_%d' % si,
                learn_bias=False))

    add_rec_step_t = []
    rec_proj_t = []
    if state['rec_gating']:
        rec_proj_t_gater = []
    if state['rec_reseting']:
        rec_proj_t_reseter = []
    for si in xrange(state['decoder_stack']):
        if si > 0:
            rec_proj_t.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[embdim],
                activation=['lambda x:x'],
                init_fn=state['rec_weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['rec_weight_scale'],
                name='rec_proj_%d' % si))
            if state['rec_gating']:
                rec_proj_t_gater.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias=False,
                    name='rec_proj_t_gater_%d' % si))
            if state['rec_reseting']:
                rec_proj_t_reseter.append(MultiLayer(
                    rng,
                    n_in=state['dim'],
                    n_hids=[state['dim']],
                    activation=['lambda x:x'],
                    init_fn=state['weight_init_fn'],
                    weight_noise=state['weight_noise'],
                    scale=state['weight_scale'],
                    learn_bias=False,
                    name='rec_proj_t_reseter_%d' % si))
        add_rec_step_t.append(eval(state['rec_layer'])(
            rng,
            n_hids=state['dim'],
            activation=state['activ'],
            bias_scale=state['bias'],
            scale=state['rec_weight_scale'],
            init_fn=state['rec_weight_init_fn'],
            weight_noise=state['weight_noise_rec'],
            dropout=state['dropout_rec'],
            gating=state['rec_gating'],
            gater_activation=state['rec_gater'],
            reseting=state['rec_reseting'],
            reseter_activation=state['rec_reseter'],
            name='add_h_t_%d' % si))

    if state['encoder_stack'] > 1:
        encoder_proj = []
        for si in xrange(state['encoder_stack']):
            encoder_proj.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim'] * state['maxout_part']],
                activation=['lambda x: x'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                scale=state['weight_scale'],
                name='encoder_proj_%d' % si,
                learn_bias=(si == 0)))
        # NOTE: relies on `indim` and `pieces` being in scope; they are only
        # assigned in the deep_out block further below.
        encoder_act_layer = UnaryOp(activation=eval(state['unary_activ']),
                                    indim=indim, pieces=pieces, rng=rng)

    def _add_t_op(words_embeddings,
                  everything=None,
                  words_mask=None,
                  prev_val=None,
                  one_step=False,
                  bs=1,
                  init_state=None,
                  use_noise=True,
                  gater_below=None,
                  reseter_below=None,
                  si=0,
                  state_below=None):
        seqlen = words_embeddings.out.shape[0] // bs
        rval = words_embeddings
        gater = None
        if state['rec_gating']:
            gater = gater_below
        reseter = None
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            if isinstance(state_below, list):
                state_below = state_below[-1]
            rval += rec_proj_t[si - 1](state_below,
                                       one_step=one_step,
                                       use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_t_gater[si - 1](state_below,
                                                 one_step=one_step,
                                                 use_noise=use_noise)
                if gater:
                    gater += projg
                else:
                    gater = projg
            if state['rec_reseting']:
                projg = rec_proj_t_reseter[si - 1](state_below,
                                                   one_step=one_step,
                                                   use_noise=use_noise)
                if reseter:
                    reseter += projg
                else:
                    reseter = projg
        if everything:
            rval = rval + proj_everything_t[si](everything)
            if state['rec_gating']:
                everyg = gater_everything_t[si](everything,
                                                one_step=one_step,
                                                use_noise=use_noise)
                if gater:
                    gater += everyg
                else:
                    gater = everyg
            if state['rec_reseting']:
                everyg = reseter_everything_t[si](everything,
                                                  one_step=one_step,
                                                  use_noise=use_noise)
                if reseter:
                    reseter += everyg
                else:
                    reseter = everyg
        if not one_step:
            rval = add_rec_step_t[si](
                rval,
                nsteps=seqlen,
                batch_size=bs,
                mask=words_mask,
                one_step=one_step,
                init_state=init_state,
                gater_below=gater,
                reseter_below=reseter,
                use_noise=use_noise)
        else:
            rval = add_rec_step_t[si](
                rval,
                mask=words_mask,
                state_before=prev_val,
                one_step=one_step,
                gater_below=gater,
                reseter_below=reseter,
                use_noise=use_noise)
        return rval
    add_t_op = Operator(_add_t_op)

    outdim = state['dim_mlp']
    if not state['deep_out']:
        outdim = state['rank_n_approx']

    if state['bias_code']:
        bias_code = []
        for si in xrange(state['decoder_stack']):
            bias_code.append(MultiLayer(
                rng,
                n_in=state['dim'],
                n_hids=[state['dim']],
                activation=[state['activ']],
                bias_scale=[state['bias']],
                scale=state['weight_scale'],
                init_fn=state['weight_init_fn'],
                weight_noise=state['weight_noise'],
                name='bias_code_%d' % si))

    if state['avg_word']:
        word_code_nin = state['rank_n_approx']
        word_code = MultiLayer(
            rng,
            n_in=word_code_nin,
            n_hids=[outdim],
            activation='lambda x:x',
            bias_scale=[state['bias_mlp'] / 3],
            scale=state['weight_scale'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            learn_bias=False,
            name='word_code')

    proj_code = MultiLayer(
        rng,
        n_in=state['dim'],
        n_hids=[outdim],
        activation='lambda x: x',
        bias_scale=[state['bias_mlp'] / 3],
        scale=state['weight_scale'],
        init_fn=state['weight_init_fn'],
        weight_noise=state['weight_noise'],
        learn_bias=False,
        name='proj_code')

    proj_h = []
    for si in xrange(state['decoder_stack']):
        proj_h.append(MultiLayer(
            rng,
            n_in=state['dim'],
            n_hids=[outdim],
            activation='lambda x: x',
            bias_scale=[state['bias_mlp'] / 3],
            scale=state['weight_scale'],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            name='proj_h_%d' % si))

    if state['bigram']:
        proj_word = MultiLayer(
            rng,
            n_in=state['rank_n_approx'],
            n_hids=[outdim],
            activation=['lambda x:x'],
            bias_scale=[state['bias_mlp'] / 3],
            init_fn=state['weight_init_fn'],
            weight_noise=state['weight_noise'],
            scale=state['weight_scale'],
            learn_bias=False,
            name='emb_words_lm')

    if state['deep_out']:
        indim = 0
        pieces = 0
        act_layer = UnaryOp(activation=eval(state['unary_activ']))
        drop_layer = DropOp(rng=rng, dropout=state['dropout'])

    if state['deep_out']:
        indim = state['dim_mlp'] / state['maxout_part']
        rank_n_approx = state['rank_n_approx']
        rank_n_activ = state['rank_n_activ']
    else:
        indim = state['rank_n_approx']
        rank_n_approx = 0
        rank_n_activ = None
    output_layer = SoftmaxLayer(
        rng,
        indim,
        state['nouts'],
        state['weight_scale'],
        -1,
        rank_n_approx=rank_n_approx,
        rank_n_activ=rank_n_activ,
        weight_noise=state['weight_noise'],
        init_fn=state['weight_init_fn'],
        name='out')

    def _pop_op(everything, accum,
                everything_max=None,
                everything_min=None,
                word=None,
                aword=None,
                one_step=False,
                use_noise=True):
        rval = proj_h[0](accum[0], one_step=one_step, use_noise=use_noise)
        for si in xrange(1, state['decoder_stack']):
            rval += proj_h[si](accum[si], one_step=one_step,
                               use_noise=use_noise)
        if state['mult_out']:
            rval = rval * everything
        else:
            rval = rval + everything

        if aword and state['avg_word']:
            wcode = aword
            if one_step:
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
            else:
                if not isinstance(wcode, TT.TensorVariable):
                    wcode = wcode.out
                shape = wcode.shape
                rshape = rval.shape
                rval = rval.reshape([rshape[0] / shape[0], shape[0],
                                     rshape[1]])
                wcode = wcode.dimshuffle('x', 0, 1)
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
                rval = rval.reshape(rshape)
        if word and state['bigram']:
            if one_step:
                if state['mult_out']:
                    rval *= proj_word(emb_t(word, use_noise=use_noise),
                                      one_step=one_step,
                                      use_noise=use_noise)
                else:
                    rval += proj_word(emb_t(word, use_noise=use_noise),
                                      one_step=one_step,
                                      use_noise=use_noise)
            else:
                if isinstance(word, TT.TensorVariable):
                    shape = word.shape
                    ndim = word.ndim
                else:
                    shape = word.shape
                    ndim = word.out.ndim
                pword = proj_word(emb_t(word, use_noise=use_noise),
                                  one_step=one_step,
                                  use_noise=use_noise)
                shape_pword = pword.shape
                if ndim == 1:
                    pword = Shift()(pword.reshape([shape[0], 1, outdim]))
                else:
                    pword = Shift()(pword.reshape([shape[0], shape[1],
                                                   outdim]))
                if state['mult_out']:
                    rval *= pword.reshape(shape_pword)
                else:
                    rval += pword.reshape(shape_pword)
        if state['deep_out']:
            rval = drop_layer(act_layer(rval), use_noise=use_noise)
        return rval
    pop_op = Operator(_pop_op)

    # 3. Constructing the model
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [add_op(emb_words[0](emb(x)), x_mask,
                           bs=x_mask.shape[1],
                           si=0,
                           gater_below=gater_below,
                           reseter_below=reseter_below)]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]))
    for si in xrange(1, state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(add_op(emb_words[si](emb(x)), x_mask,
                                   bs=x_mask.shape[1],
                                   si=si,
                                   state_below=encoder_acts[-1],
                                   gater_below=gater_below,
                                   reseter_below=reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]))

    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = LastState(ntimes=True, n=y.shape[0])(encoder)
    else:
        everything = encoder_act_layer(everything)
        everything = everything.reshape([1, everything.shape[0],
                                         everything.shape[1]])
        everything = LastState(ntimes=True, n=y.shape[0])(everything)

    if state['bias_code']:
        init_state = [bc(everything[-1]) for bc in bias_code]
    else:
        # bias_code is only built when state['bias_code'] is set, so do not
        # iterate over it here; produce one None per decoder layer instead.
        init_state = [None] * state['decoder_stack']

    if state['avg_word']:
        shape = x.shape
        pword = emb(x).out.reshape([shape[0], shape[1],
                                    state['rank_n_approx']])
        pword = pword * x_mask.dimshuffle(0, 1, 'x')
        aword = pword.sum(0) / TT.maximum(1.,
                                          x_mask.sum(0).dimshuffle(0, 'x'))
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words_t[0](emb_t(y0))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words_t[0](emb_t(y0))
    has_said = [add_t_op(emb_words_t[0](emb_t(y0)),
                         everything,
                         y_mask,
                         bs=y_mask.shape[1],
                         gater_below=gater_below,
                         reseter_below=reseter_below,
                         init_state=init_state[0],
                         si=0)]
    for si in xrange(1, state['decoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[si](emb_t(y0))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[si](emb_t(y0))
        has_said.append(add_t_op(emb_words_t[si](emb_t(y0)),
                                 everything,
                                 y_mask,
                                 bs=y_mask.shape[1],
                                 state_below=has_said[-1],
                                 gater_below=gater_below,
                                 reseter_below=reseter_below,
                                 init_state=init_state[si],
                                 si=si))

    if has_said[0].out.ndim < 3:
        for si in xrange(state['decoder_stack']):
            shape_hs = has_said[si].shape
            if y0.ndim == 1:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape(
                    [shape[0], 1, state['dim_mlp']]))
            else:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape(
                    [shape[0], shape[1], state['dim_mlp']]))
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :],
                                                init_state[si])
            has_said[si] = has_said[si].reshape(shape_hs)
    else:
        for si in xrange(state['decoder_stack']):
            has_said[si] = Shift()(has_said[si])
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :],
                                                init_state[si])

    model = pop_op(proj_code(everything), has_said, word=y0, aword=aword)

    nll = output_layer.train(
        state_below=model, target=y0, mask=y_mask, reg=None) \
        / TT.cast(y.shape[0] * y.shape[1], 'float32')

    valid_fn = None
    noise_fn = None

    ########### Sampling graph #####################
    x = TT.lvector(name='x')
    n_steps = TT.iscalar('nsteps')
    temp = TT.scalar('temp')

    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [add_op(emb_words[0](emb(x), use_noise=False),
                           si=0,
                           use_noise=False,
                           gater_below=gater_below,
                           reseter_below=reseter_below)]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]), use_noise=False)
    for si in xrange(1, state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(add_op(emb_words[si](emb(x), use_noise=False),
                                   si=si,
                                   state_below=encoder_acts[-1],
                                   use_noise=False,
                                   gater_below=gater_below,
                                   reseter_below=reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]),
                                           use_noise=False)
    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = last(encoder)
    else:
        everything = encoder_act_layer(everything)

    init_state = []
    for si in xrange(state['decoder_stack']):
        if state['bias_code']:
            init_state.append(TT.reshape(
                bias_code[si](everything, use_noise=False),
                [1, state['dim']]))
        else:
            init_state.append(TT.alloc(numpy.float32(0), 1, state['dim']))

    if state['avg_word']:
        aword = emb(x, use_noise=False).out.mean(0)
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

    def sample_fn(*args):
        aidx = 0
        word_tm1 = args[aidx]
        aidx += 1
        prob_tm1 = args[aidx]
        has_said_tm1 = []
        for si in xrange(state['decoder_stack']):
            aidx += 1
            has_said_tm1.append(args[aidx])
        aidx += 1
        ctx = args[aidx]
        awrd = None  # stays None unless avg_word is enabled below
        if state['avg_word']:
            aidx += 1
            awrd = args[aidx]
        val = pop_op(proj_code(ctx), has_said_tm1, word=word_tm1,
                     aword=awrd, one_step=True, use_noise=False)
        sample = output_layer.get_sample(state_below=val, temp=temp)
        logp = output_layer.get_cost(
            state_below=val.out.reshape([1, TT.cast(output_layer.n_in,
                                                    'int64')]),
            temp=temp,
            target=sample.reshape([1, 1]),
            use_noise=False)
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[0](emb_t(sample))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[0](emb_t(sample))
        has_said_t = [add_t_op(emb_words_t[0](emb_t(sample)),
                               ctx,
                               prev_val=has_said_tm1[0],
                               gater_below=gater_below,
                               reseter_below=reseter_below,
                               one_step=True,
                               use_noise=True,
                               si=0)]
        for si in xrange(1, state['decoder_stack']):
            gater_below = None
            if state['rec_gating']:
                gater_below = gater_words_t[si](emb_t(sample))
            reseter_below = None
            if state['rec_reseting']:
                reseter_below = reseter_words_t[si](emb_t(sample))
            has_said_t.append(add_t_op(emb_words_t[si](emb_t(sample)),
                                       ctx,
                                       prev_val=has_said_tm1[si],
                                       gater_below=gater_below,
                                       reseter_below=reseter_below,
                                       one_step=True,
                                       use_noise=True,
                                       si=si,
                                       state_below=has_said_t[-1]))
        for si in xrange(state['decoder_stack']):
            if isinstance(has_said_t[si], list):
                has_said_t[si] = has_said_t[si][-1]
        rval = [sample, TT.cast(logp, 'float32')] + has_said_t
        return rval

    sampler_params = [everything]
    if state['avg_word']:
        sampler_params.append(aword)

    states = [TT.alloc(numpy.int64(0), n_steps)]
    states.append(TT.alloc(numpy.float32(0), n_steps))
    states += init_state

    outputs, updates = scan(sample_fn,
                            states=states,
                            params=sampler_params,
                            n_steps=n_steps,
                            name='sampler_scan')
    samples = outputs[0]
    probs = outputs[1]

    sample_fn = theano.function(
        [n_steps, temp, x], [samples, probs.sum()],
        updates=updates,
        profile=False,
        name='sample_fn')

    model = LM_Model(
        cost_layer=nll,
        weight_noise_amount=state['weight_noise_amount'],
        valid_fn=valid_fn,
        sample_fn=sample_fn,
        clean_before_noise_fn=False,
        noise_fn=noise_fn,
        indx_word=state['indx_word_target'],
        indx_word_src=state['indx_word'],
        character_level=False,
        rng=rng)

    if state['loopIters'] > 0:
        algo = SGD(model, state, train_data)
    else:
        algo = None

    def hook_fn():
        if not hasattr(model, 'word_indxs'):
            model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs
        old_offset = train_data.offset
        if state['sample_reset']:
            train_data.reset()
        ns = 0
        for sidx in xrange(state['sample_n']):
            while True:
                batch = train_data.next()
                if batch:
                    break
            x = batch['x']
            y = batch['y']
            #xbow = batch['x_bow']
            masks = batch['x_mask']
            if x.ndim > 1:
                for idx in xrange(x.shape[1]):
                    ns += 1
                    if ns > state['sample_max']:
                        break
                    print 'Input: ',
                    for k in xrange(x[:, idx].shape[0]):
                        print model.word_indxs_src[x[:, idx][k]],
                        if model.word_indxs_src[x[:, idx][k]] == '<eol>':
                            break
                    print ''
                    print 'Target: ',
                    for k in xrange(y[:, idx].shape[0]):
                        print model.word_indxs[y[:, idx][k]],
                        if model.word_indxs[y[:, idx][k]] == '<eol>':
                            break
                    print ''
                    senlen = len(x[:, idx])
                    if len(numpy.where(masks[:, idx] == 0)[0]) > 0:
                        senlen = numpy.where(masks[:, idx] == 0)[0][0]
                    if senlen < 1:
                        continue
                    xx = x[:senlen, idx]
                    #xx = xx.reshape([xx.shape[0], 1])
                    model.get_samples(state['seqlen'] + 1, 1, xx)
            else:
                ns += 1
                model.get_samples(state['seqlen'] + 1, 1, x)
            if ns > state['sample_max']:
                break
        train_data.offset = old_offset
        return

    main = MainLoop(train_data, valid_data, None, model, algo, state,
                    channel,
                    reset=state['reset'],
                    hooks=hook_fn)
    if state['reload']:
        main.load()
    if state['loopIters'] > 0:
        main.main()

    if state['sampler_test']:
        # This is a test script: we only sample
        if not hasattr(model, 'word_indxs'):
            model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs
        indx_word = pkl.load(open(state['word_indx'], 'rb'))
        try:
            while True:
                try:
                    seqin = raw_input('Input Sequence: ')
                    n_samples = int(raw_input('How many samples? '))
                    alpha = float(raw_input('Inverse Temperature? '))

                    seqin = seqin.lower()
                    seqin = seqin.split()

                    seqlen = len(seqin)
                    seq = numpy.zeros(seqlen + 1, dtype='int64')
                    for idx, sx in enumerate(seqin):
                        try:
                            seq[idx] = indx_word[sx]
                        except KeyError:
                            seq[idx] = indx_word[state['oov']]
                    seq[-1] = state['null_sym_source']
                except Exception:
                    print 'Something wrong with your input! Try again!'
                    continue

                sentences = []
                all_probs = []
                for sidx in xrange(n_samples):
                    #import ipdb; ipdb.set_trace()
                    [values, probs] = model.sample_fn(seqlen * 3, alpha, seq)
                    sen = []
                    for k in xrange(values.shape[0]):
                        if model.word_indxs[values[k]] == '<eol>':
                            break
                        sen.append(model.word_indxs[values[k]])
                    sentences.append(" ".join(sen))
                    all_probs.append(-probs)
                sprobs = numpy.argsort(all_probs)
                for pidx in sprobs:
                    print pidx, "(%f):" % (-all_probs[pidx]), sentences[pidx]
                print
        except KeyboardInterrupt:
            print 'Interrupted'
            pass
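# jobman-style entry points take a state dictionary and a channel. A minimal
# sketch of launching this experiment directly; prototype_state() and the
# None channel are assumptions, not part of this file.
if __name__ == '__main__':
    state = prototype_state()  # hypothetical: returns the default state dict
    state['seed'] = 1234
    jobman(state, None)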
# (Fragment: tail of a configuration-and-training routine; `prel`, `conf`,
# `update_custom_keys`, and `log` are defined by the surrounding, elided code.)
state['word_indx_trgt'] = prel('vocab.lang2.pkl')
update_custom_keys(state, conf,
                   ['bs', 'loopIters', 'timeStop', 'dim',
                    'null_sym_source', 'null_sym_target'])
if conf['method'] == 'RNNenc-50':
    state['prefix'] = 'encdec-50_'
    state['seqlen'] = 50
    state['sort_k_batches'] = 20

log.debug("State:\n{}".format(pprint.pformat(state)))

rng = numpy.random.RandomState(state['seed'])
enc_dec = RNNEncoderDecoder(state, rng, False)
enc_dec.build()
lm_model = enc_dec.create_lm_model()

log.debug("Load data")
train_data = get_batch_iterator(state)

log.debug("Compile trainer")
algo = eval(state['algo'])(lm_model, state, train_data)

log.debug("Run training")
main = MainLoop(train_data, None, None, lm_model, algo, state, None,
                reset=state['reset'],
                hooks=[RandomSamplePrinter(state, lm_model, train_data)]
                      if state['hookFreq'] >= 0 else None)
if state['reload']:
    main.load()
if state['loopIters'] > 0:
    main.main()
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Case study of generating simple 1d sequences with RNN.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode", choices=["train", "plot"],
        help="The mode to run. Use `train` to train a new model"
             " and `plot` to plot a sequence generated by an"
             " existing one.")
    parser.add_argument(
        "prefix", default="sine",
        help="The prefix for model, timing and state files")
    parser.add_argument(
        "--input-noise", type=float, default=0.0,
        help="Adds Gaussian noise of given intensity to the "
             " training sequences.")
    parser.add_argument(
        "--function", default="lambda a, x: numpy.sin(a * x)",
        help="An analytical description of the sequence family to learn."
             " The arguments before the last one are considered parameters.")
    parser.add_argument(
        "--steps", type=int, default=100,
        help="Number of steps to plot")
    parser.add_argument(
        "--params",
        help="Parameter values for plotting")
    args = parser.parse_args()

    function = eval(args.function)
    num_params = len(inspect.getargspec(function).args) - 1

    class Emitter(TrivialEmitter):
        @application
        def cost(self, readouts, outputs):
            """Compute MSE."""
            return ((readouts - outputs) ** 2).sum(axis=readouts.ndim - 1)

    transition = GatedRecurrent(
        name="transition", activation=Tanh(), dim=10,
        weights_init=Orthogonal())
    with_params = AddParameters(transition, num_params, "params",
                                name="with_params")
    generator = SequenceGenerator(
        LinearReadout(readout_dim=1, source_names=["states"],
                      emitter=Emitter(name="emitter"),
                      name="readout"),
        with_params,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.allocate()
    logger.debug("Parameters:\n" + pprint.pformat(
        [(key, value.get_value().shape) for key, value
         in Selector(generator).get_params().items()],
        width=120))

    if args.mode == "train":
        seed = 1
        rng = numpy.random.RandomState(seed)
        batch_size = 10

        generator.initialize()

        cost = ComputationGraph(
            generator.cost(tensor.tensor3('x'),
                           params=tensor.matrix("params")).sum())
        cost = apply_noise(cost, cost.inputs, args.input_noise)

        gh_model = GroundhogModel(generator, cost)
        state = GroundhogState(args.prefix, batch_size,
                               learning_rate=0.0001).as_dict()
        data = SeriesIterator(rng, function, 100, batch_size)
        trainer = SGD(gh_model, state, data)
        main_loop = MainLoop(data, None, None, gh_model, trainer, state,
                             None)
        main_loop.load()
        main_loop.main()
    elif args.mode == "plot":
        load_params(generator, args.prefix + "model.npz")

        params = tensor.matrix("params")
        sample = theano.function([params], generator.generate(
            params=params, n_steps=args.steps, batch_size=1))

        param_values = numpy.array(map(float, args.params.split()),
                                   dtype=floatX)
        states, outputs, _ = sample(param_values[None, :])
        actual = outputs[:, 0, 0]
        desired = numpy.array([function(*(list(param_values) + [T]))
                               for T in range(args.steps)])
        print("MSE: {}".format(((actual - desired) ** 2).sum()))

        pyplot.plot(numpy.hstack([actual[:, None], desired[:, None]]))
        pyplot.show()
    else:
        assert False
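# Usage sketch for the --function convention above: every argument except
# the last is a learnable family parameter, the last is the time step. The
# parameter values here are arbitrary example values.
import inspect
import numpy

function = eval("lambda a, b, x: a * numpy.sin(b * x)")
num_params = len(inspect.getargspec(function).args) - 1  # -> 2 (a and b)
param_values = numpy.array([0.5, 0.1])
desired = [function(*(list(param_values) + [T])) for T in range(5)]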
def jobman(state, channel):
    # load dataset
    rng = numpy.random.RandomState(state['seed'])

    # declare the dimensionalities of the input and output
    if state['chunks'] == 'words':
        state['n_in'] = 10000
        state['n_out'] = 10000
    else:
        state['n_in'] = 50
        state['n_out'] = 50
    train_data, valid_data, test_data = get_text_data(state)

    ## BEGIN Tutorial
    ### Define Theano Input Variables
    x = TT.lvector('x')
    y = TT.lvector('y')
    h0 = theano.shared(
        numpy.zeros((eval(state['nhids'])[-1],), dtype='float32'))

    ### Neural Implementation of the Operators: \oplus
    #### Word Embedding
    emb_words = MultiLayer(
        rng,
        n_in=state['n_in'],
        n_hids=eval(state['inp_nhids']),
        activation=eval(state['inp_activ']),
        init_fn='sample_weights_classic',
        weight_noise=state['weight_noise'],
        rank_n_approx=state['rank_n_approx'],
        scale=state['inp_scale'],
        sparsity=state['inp_sparse'],
        learn_bias=True,
        bias_scale=eval(state['inp_bias']),
        name='emb_words')

    #### Deep Transition Recurrent Layer
    rec = eval(state['rec_layer'])(
        rng,
        eval(state['nhids']),
        activation=eval(state['rec_activ']),
        #activation='TT.nnet.sigmoid',
        bias_scale=eval(state['rec_bias']),
        scale=eval(state['rec_scale']),
        sparsity=eval(state['rec_sparse']),
        init_fn=eval(state['rec_init']),
        weight_noise=state['weight_noise'],
        name='rec')

    #### Stitching them together
    ##### (1) Get the embedding of a word
    x_emb = emb_words(x, no_noise_bias=state['no_noise_bias'])
    ##### (2) Embedding + Hidden State via DT Recurrent Layer
    reset = TT.scalar('reset')
    rec_layer = rec(x_emb, n_steps=x.shape[0],
                    init_state=h0 * reset,
                    no_noise_bias=state['no_noise_bias'],
                    truncate_gradient=state['truncate_gradient'],
                    batch_size=1)

    ## BEGIN Exercise: DOT-RNN
    ### Neural Implementation of the Operators: \lhd
    #### Exercise (1)
    #### TODO: Define a layer from the hidden state to the intermediate layer
    #### Exercise (1)
    #### TODO: Define a layer from the input to the intermediate Layer
    #### Hidden State: Combine emb_state and emb_words_out
    #### Exercise (1)
    #### TODO: Define an activation layer
    #### Exercise (2)
    #### TODO: Define a dropout layer

    #### Softmax Layer
    output_layer = SoftmaxLayer(
        rng,
        eval(state['dout_nhid']),
        state['n_out'],
        scale=state['out_scale'],
        bias_scale=state['out_bias_scale'],
        init_fn="sample_weights_classic",
        weight_noise=state['weight_noise'],
        sparsity=state['out_sparse'],
        sum_over_time=True,
        name='out')

    ### A Few Optional Things
    #### Direct shortcut from x to y
    if state['shortcut_inpout']:
        shortcut = MultiLayer(
            rng,
            n_in=state['n_in'],
            n_hids=eval(state['inpout_nhids']),
            activations=eval(state['inpout_activ']),
            init_fn='sample_weights_classic',
            weight_noise=state['weight_noise'],
            scale=eval(state['inpout_scale']),
            sparsity=eval(state['inpout_sparse']),
            learn_bias=eval(state['inpout_learn_bias']),
            bias_scale=eval(state['inpout_bias']),
            name='shortcut')

    #### Learning rate scheduling (1/(1+n/beta))
    state['clr'] = state['lr']
    def update_lr(obj, cost):
        stp = obj.step
        if isinstance(obj.state['lr_start'], int) and stp > obj.state['lr_start']:
            time = float(stp - obj.state['lr_start'])
            new_lr = obj.state['clr'] / (1 + time / obj.state['lr_beta'])
            obj.lr = new_lr
    if state['lr_adapt']:
        rec.add_schedule(update_lr)

    ### Neural Implementations of the Language Model
    #### Training
    if state['shortcut_inpout']:
        additional_inputs = [rec_layer, shortcut(x)]
    else:
        additional_inputs = [rec_layer]

    ##### Exercise (1): Compute the output intermediate layer
    ##### TODO: Compute the output intermediate layer
    ##### Exercise (2): Apply Dropout
    ##### TODO: Apply the dropout layer
    train_model = output_layer(outhid,
                               no_noise_bias=state['no_noise_bias'],
                               additional_inputs=additional_inputs).train(
        target=y, scale=numpy.float32(1. / state['seqlen']))

    nw_h0 = rec_layer.out[rec_layer.out.shape[0] - 1]
    if state['carry_h0']:
        train_model.updates += [(h0, nw_h0)]

    #### Validation
    h0val = theano.shared(
        numpy.zeros((eval(state['nhids'])[-1],), dtype='float32'))
    rec_layer = rec(emb_words(x, use_noise=False),
                    n_steps=x.shape[0],
                    batch_size=1,
                    init_state=h0val * reset,
                    use_noise=False)
    nw_h0 = rec_layer.out[rec_layer.out.shape[0] - 1]

    ##### Exercise (1):
    ##### TODO: Compute the output intermediate layer
    ##### Exercise (2): Apply Dropout
    ##### TODO: Apply the dropout layer without noise
    if state['shortcut_inpout']:
        additional_inputs = [rec_layer, shortcut(x, use_noise=False)]
    else:
        additional_inputs = [rec_layer]
    valid_model = output_layer(outhid,
                               additional_inputs=additional_inputs,
                               use_noise=False).validate(
        target=y, sum_over_time=True)

    valid_updates = []
    if state['carry_h0']:
        valid_updates = [(h0val, nw_h0)]
    valid_fn = theano.function([x, y, reset],
                               valid_model.out,
                               name='valid_fn',
                               updates=valid_updates)

    #### Sampling
    ##### single-step sampling
    def sample_fn(word_tm1, h_tm1):
        x_emb = emb_words(word_tm1, use_noise=False, one_step=True)
        h0 = rec(x_emb, state_before=h_tm1, one_step=True, use_noise=False)[-1]
        outhid = outhid_dropout(
            outhid_activ(
                emb_state(h0, use_noise=False, one_step=True) +
                emb_words_out(word_tm1, use_noise=False, one_step=True),
                one_step=True),
            use_noise=False, one_step=True)
        word = output_layer.get_sample(state_below=outhid,
                                       additional_inputs=[h0],
                                       temp=1.)
        return word, h0

    ##### scan for iterating the single-step sampling multiple times
    [samples, summaries], updates = scan(
        sample_fn,
        states=[TT.alloc(numpy.int64(0), state['sample_steps']),
                TT.alloc(numpy.float32(0), 1, eval(state['nhids'])[-1])],
        n_steps=state['sample_steps'],
        name='sampler_scan')

    ##### build a Theano function for sampling
    sample_fn = theano.function([], [samples],
                                updates=updates,
                                profile=False,
                                name='sample_fn')

    ##### Load a dictionary
    dictionary = numpy.load(state['dictionary'])
    if state['chunks'] == 'chars':
        dictionary = dictionary['unique_chars']
    else:
        dictionary = dictionary['unique_words']

    def hook_fn():
        sample = sample_fn()[0]
        print 'Sample:',
        if state['chunks'] == 'chars':
            print "".join(dictionary[sample])
        else:
            for si in sample:
                print dictionary[si],
            print

    ### Build and Train a Model
    #### Define a model
    model = LM_Model(
        cost_layer=train_model,
        weight_noise_amount=state['weight_noise_amount'],
        valid_fn=valid_fn,
        clean_before_noise_fn=False,
        noise_fn=None,
        rng=rng)

    if state['reload']:
        model.load(state['prefix'] + 'model.npz')

    #### Define a trainer
    ##### Training algorithm (SGD)
    if state['moment'] < 0:
        algo = SGD(model, state, train_data)
    else:
        algo = SGD_m(model, state, train_data)

    ##### Main loop of the trainer
    main = MainLoop(train_data,
                    valid_data,
                    test_data,
                    model,
                    algo,
                    state,
                    channel,
                    train_cost=False,
                    hooks=hook_fn,
                    validate_postprocess=eval(state['validate_postprocess']))

    ## Run!
    main.main()
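# One possible completion of the DOT-RNN exercises above, inferred from the
# layer names the sampling code already references (emb_state, emb_words_out,
# outhid_activ, outhid_dropout). This is a sketch belonging at the Exercise
# (1)/(2) TODOs inside jobman, not the tutorial's official answer: it reuses
# the MultiLayer/UnaryOp/DropOp constructors seen elsewhere in this document
# and assumes a hypothetical state key 'dout_activ' for the activation.
emb_state = MultiLayer(
    rng,
    n_in=eval(state['nhids'])[-1],
    n_hids=[eval(state['dout_nhid'])],
    activation='lambda x: x',
    init_fn='sample_weights_classic',
    weight_noise=state['weight_noise'],
    name='emb_state')
emb_words_out = MultiLayer(
    rng,
    n_in=state['n_in'],
    n_hids=[eval(state['dout_nhid'])],
    activation='lambda x: x',
    init_fn='sample_weights_classic',
    weight_noise=state['weight_noise'],
    name='emb_words_out')
outhid_activ = UnaryOp(activation=eval(state['dout_activ']))  # 'dout_activ' is assumed
outhid_dropout = DropOp(rng=rng, dropout=state['dropout'])
# The training-time intermediate layer would then be:
#     outhid = outhid_dropout(outhid_activ(emb_state(rec_layer) + emb_words_out(x)))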
def main():
    args = parse_args()

    state = getattr(experiments.nmt, args.proto)()
    if args.state:
        if args.state.endswith(".py"):
            state.update(eval(open(args.state).read()))
        else:
            with open(args.state) as src:
                state.update(cPickle.load(src))
    for change in args.changes:
        state.update(eval("dict({})".format(change)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    logger.debug("State:\n{}".format(pprint.pformat(state)))

    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'fixed_embeddings' not in state:
        state['fixed_embeddings'] = False
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False
    if 'reprocess_each_iteration' not in state:
        state['reprocess_each_iteration'] = False

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, args.skip_init)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()

    logger.debug("Load data")
    train_data = get_batch_iterator(state, rng)
    logger.debug("Compile trainer")
    algo = eval(state['algo'])(lm_model, state, train_data)

    if state['rolling_vocab']:
        logger.debug("Initializing extra parameters")
        init_extra_parameters(lm_model, state)
        if not state['fixed_embeddings']:
            init_adadelta_extra_parameters(algo, state)
        with open(state['rolling_vocab_dict'], 'rb') as f:
            lm_model.rolling_vocab_dict = cPickle.load(f)
        lm_model.total_num_batches = max(lm_model.rolling_vocab_dict)
        lm_model.Dx_shelve = shelve.open(state['Dx_file'])
        lm_model.Dy_shelve = shelve.open(state['Dy_file'])

    hooks = []
    if state['hookFreq'] >= 0:
        hooks.append(RandomSamplePrinter(state, lm_model, train_data))
    if 'external_validation_script' in state and state['external_validation_script']:
        hooks.append(ExternalValidator(state, lm_model))

    logger.debug("Run training")
    main = MainLoop(train_data, None, None, lm_model, algo, state, None,
                    reset=state['reset'], hooks=hooks)
    if state['reload']:
        main.load()
    if state['loopIters'] > 0:
        main.main()

    if state['rolling_vocab']:
        lm_model.Dx_shelve.close()
        lm_model.Dy_shelve.close()
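# The chain of "if key not in state" checks above exists so that old pickled
# states remain loadable after new options were added. A behaviorally
# identical, more compact way to write it (same keys, same defaults):
defaults = {'rolling_vocab': 0, 'save_algo': 0, 'save_gs': 0,
            'fixed_embeddings': False, 'save_iter': -1,
            'var_src_len': False, 'reprocess_each_iteration': False}
for key, value in defaults.items():
    state.setdefault(key, value)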
def jobman(state, channel): # load dataset state['null_sym_source'] = 15000 state['null_sym_target'] = 15000 state['n_sym_source'] = state['null_sym_source'] + 1 state['n_sym_target'] = state['null_sym_target'] + 1 state['nouts'] = state['n_sym_target'] state['nins'] = state['n_sym_source'] rng = numpy.random.RandomState(state['seed']) if state['loopIters'] > 0: train_data, valid_data, test_data = get_data(state) else: train_data = None valid_data = None test_data = None ########### Training graph ##################### ## 1. Inputs if state['bs'] == 1: x = TT.lvector('x') x_mask = TT.vector('x_mask') y = TT.lvector('y') y0 = y y_mask = TT.vector('y_mask') else: x = TT.lmatrix('x') x_mask = TT.matrix('x_mask') y = TT.lmatrix('y') y0 = y y_mask = TT.matrix('y_mask') # 2. Layers and Operators bs = state['bs'] embdim = state['dim_mlp'] # Source Sentence emb = MultiLayer(rng, n_in=state['nins'], n_hids=[state['rank_n_approx']], activation=[state['rank_n_activ']], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb') emb_words = [] if state['rec_gating']: gater_words = [] if state['rec_reseting']: reseter_words = [] for si in xrange(state['encoder_stack']): emb_words.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb_words_%d' % si)) if state['rec_gating']: gater_words.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='gater_words_%d' % si)) if state['rec_reseting']: reseter_words.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='reseter_words_%d' % si)) add_rec_step = [] rec_proj = [] if state['rec_gating']: rec_proj_gater = [] if state['rec_reseting']: rec_proj_reseter = [] for si in xrange(state['encoder_stack']): if si > 0: rec_proj.append( MultiLayer(rng, n_in=state['dim'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise'], scale=state['rec_weight_scale'], name='rec_proj_%d' % si)) if state['rec_gating']: rec_proj_gater.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_gater_%d' % si)) if state['rec_reseting']: rec_proj_reseter.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_reseter_%d' % si)) add_rec_step.append( eval(state['rec_layer'])(rng, n_hids=state['dim'], activation=state['activ'], bias_scale=state['bias'], scale=state['rec_weight_scale'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise_rec'], dropout=state['dropout_rec'], gating=state['rec_gating'], gater_activation=state['rec_gater'], reseting=state['rec_reseting'], reseter_activation=state['rec_reseter'], name='add_h_%d' % si)) def _add_op(words_embeddings, words_mask=None, prev_val=None, si=0, state_below=None, gater_below=None, reseter_below=None, 
one_step=False, bs=1, init_state=None, use_noise=True): seqlen = words_embeddings.out.shape[0] // bs rval = words_embeddings gater = None reseter = None if state['rec_gating']: gater = gater_below if state['rec_reseting']: reseter = reseter_below if si > 0: rval += rec_proj[si - 1](state_below, one_step=one_step, use_noise=use_noise) if state['rec_gating']: projg = rec_proj_gater[si - 1](state_below, one_step=one_step, use_noise=use_noise) if gater: gater += projg else: gater = projg if state['rec_reseting']: projg = rec_proj_reseter[si - 1](state_below, one_step=one_step, use_noise=use_noise) if reseter: reseter += projg else: reseter = projg if not one_step: rval = add_rec_step[si](rval, nsteps=seqlen, batch_size=bs, mask=words_mask, gater_below=gater, reseter_below=reseter, one_step=one_step, init_state=init_state, use_noise=use_noise) else: rval = add_rec_step[si](rval, mask=words_mask, state_before=prev_val, gater_below=gater, reseter_below=reseter, one_step=one_step, init_state=init_state, use_noise=use_noise) return rval add_op = Operator(_add_op) # Target Sentence emb_t = MultiLayer(rng, n_in=state['nouts'], n_hids=[state['rank_n_approx']], activation=[state['rank_n_activ']], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb_t') emb_words_t = [] if state['rec_gating']: gater_words_t = [] if state['rec_reseting']: reseter_words_t = [] for si in xrange(state['decoder_stack']): emb_words_t.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb_words_t_%d' % si)) if state['rec_gating']: gater_words_t.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='gater_words_t_%d' % si)) if state['rec_reseting']: reseter_words_t.append( MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='reseter_words_t_%d' % si)) proj_everything_t = [] if state['rec_gating']: gater_everything_t = [] if state['rec_reseting']: reseter_everything_t = [] for si in xrange(state['decoder_stack']): proj_everything_t.append( MultiLayer(rng, n_in=state['dim'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='proj_everything_t_%d' % si, learn_bias=False)) if state['rec_gating']: gater_everything_t.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='gater_everything_t_%d' % si, learn_bias=False)) if state['rec_reseting']: reseter_everything_t.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='reseter_everything_t_%d' % si, learn_bias=False)) add_rec_step_t = [] rec_proj_t = [] if state['rec_gating']: rec_proj_t_gater = [] if state['rec_reseting']: rec_proj_t_reseter = [] for si in xrange(state['decoder_stack']): if si > 0: rec_proj_t.append( MultiLayer(rng, n_in=state['dim'], n_hids=[embdim], 
activation=['lambda x:x'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise'], scale=state['rec_weight_scale'], name='rec_proj_%d' % si)) if state['rec_gating']: rec_proj_t_gater.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_t_gater_%d' % si)) if state['rec_reseting']: rec_proj_t_reseter.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_t_reseter_%d' % si)) add_rec_step_t.append( eval(state['rec_layer'])(rng, n_hids=state['dim'], activation=state['activ'], bias_scale=state['bias'], scale=state['rec_weight_scale'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise_rec'], dropout=state['dropout_rec'], gating=state['rec_gating'], gater_activation=state['rec_gater'], reseting=state['rec_reseting'], reseter_activation=state['rec_reseter'], name='add_h_t_%d' % si)) if state['encoder_stack'] > 1: encoder_proj = [] for si in xrange(state['encoder_stack']): encoder_proj.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim'] * state['maxout_part']], activation=['lambda x: x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='encoder_proj_%d' % si, learn_bias=(si == 0))) encoder_act_layer = UnaryOp(activation=eval(state['unary_activ']), indim=indim, pieces=pieces, rng=rng) def _add_t_op(words_embeddings, everything=None, words_mask=None, prev_val=None, one_step=False, bs=1, init_state=None, use_noise=True, gater_below=None, reseter_below=None, si=0, state_below=None): seqlen = words_embeddings.out.shape[0] // bs rval = words_embeddings gater = None if state['rec_gating']: gater = gater_below reseter = None if state['rec_reseting']: reseter = reseter_below if si > 0: if isinstance(state_below, list): state_below = state_below[-1] rval += rec_proj_t[si - 1](state_below, one_step=one_step, use_noise=use_noise) if state['rec_gating']: projg = rec_proj_t_gater[si - 1](state_below, one_step=one_step, use_noise=use_noise) if gater: gater += projg else: gater = projg if state['rec_reseting']: projg = rec_proj_t_reseter[si - 1](state_below, one_step=one_step, use_noise=use_noise) if reseter: reseter += projg else: reseter = projg if everything: rval = rval + proj_everything_t[si](everything) if state['rec_gating']: everyg = gater_everything_t[si](everything, one_step=one_step, use_noise=use_noise) if gater: gater += everyg else: gater = everyg if state['rec_reseting']: everyg = reseter_everything_t[si](everything, one_step=one_step, use_noise=use_noise) if reseter: reseter += everyg else: reseter = everyg if not one_step: rval = add_rec_step_t[si](rval, nsteps=seqlen, batch_size=bs, mask=words_mask, one_step=one_step, init_state=init_state, gater_below=gater, reseter_below=reseter, use_noise=use_noise) else: rval = add_rec_step_t[si](rval, mask=words_mask, state_before=prev_val, one_step=one_step, gater_below=gater, reseter_below=reseter, use_noise=use_noise) return rval add_t_op = Operator(_add_t_op) outdim = state['dim_mlp'] if not state['deep_out']: outdim = state['rank_n_approx'] if state['bias_code']: bias_code = [] for si in xrange(state['decoder_stack']): bias_code.append( MultiLayer(rng, n_in=state['dim'], n_hids=[state['dim']], 
activation=[state['activ']], bias_scale=[state['bias']], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], name='bias_code_%d' % si)) if state['avg_word']: word_code_nin = state['rank_n_approx'] word_code = MultiLayer(rng, n_in=word_code_nin, n_hids=[outdim], activation='lambda x:x', bias_scale=[state['bias_mlp'] / 3], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], learn_bias=False, name='word_code') proj_code = MultiLayer(rng, n_in=state['dim'], n_hids=[outdim], activation='lambda x: x', bias_scale=[state['bias_mlp'] / 3], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], learn_bias=False, name='proj_code') proj_h = [] for si in xrange(state['decoder_stack']): proj_h.append( MultiLayer(rng, n_in=state['dim'], n_hids=[outdim], activation='lambda x: x', bias_scale=[state['bias_mlp'] / 3], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], name='proj_h_%d' % si)) if state['bigram']: proj_word = MultiLayer(rng, n_in=state['rank_n_approx'], n_hids=[outdim], activation=['lambda x:x'], bias_scale=[state['bias_mlp'] / 3], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='emb_words_lm') if state['deep_out']: indim = 0 pieces = 0 act_layer = UnaryOp(activation=eval(state['unary_activ'])) drop_layer = DropOp(rng=rng, dropout=state['dropout']) if state['deep_out']: indim = state['dim_mlp'] / state['maxout_part'] rank_n_approx = state['rank_n_approx'] rank_n_activ = state['rank_n_activ'] else: indim = state['rank_n_approx'] rank_n_approx = 0 rank_n_activ = None output_layer = SoftmaxLayer(rng, indim, state['nouts'], state['weight_scale'], -1, rank_n_approx=rank_n_approx, rank_n_activ=rank_n_activ, weight_noise=state['weight_noise'], init_fn=state['weight_init_fn'], name='out') def _pop_op(everything, accum, everything_max=None, everything_min=None, word=None, aword=None, one_step=False, use_noise=True): rval = proj_h[0](accum[0], one_step=one_step, use_noise=use_noise) for si in xrange(1, state['decoder_stack']): rval += proj_h[si](accum[si], one_step=one_step, use_noise=use_noise) if state['mult_out']: rval = rval * everything else: rval = rval + everything if aword and state['avg_word']: wcode = aword if one_step: if state['mult_out']: rval = rval * wcode else: rval = rval + wcode else: if not isinstance(wcode, TT.TensorVariable): wcode = wcode.out shape = wcode.shape rshape = rval.shape rval = rval.reshape( [rshape[0] / shape[0], shape[0], rshape[1]]) wcode = wcode.dimshuffle('x', 0, 1) if state['mult_out']: rval = rval * wcode else: rval = rval + wcode rval = rval.reshape(rshape) if word and state['bigram']: if one_step: if state['mult_out']: rval *= proj_word(emb_t(word, use_noise=use_noise), one_step=one_step, use_noise=use_noise) else: rval += proj_word(emb_t(word, use_noise=use_noise), one_step=one_step, use_noise=use_noise) else: if isinstance(word, TT.TensorVariable): shape = word.shape ndim = word.ndim else: shape = word.shape ndim = word.out.ndim pword = proj_word(emb_t(word, use_noise=use_noise), one_step=one_step, use_noise=use_noise) shape_pword = pword.shape if ndim == 1: pword = Shift()(pword.reshape([shape[0], 1, outdim])) else: pword = Shift()(pword.reshape([shape[0], shape[1], outdim])) if state['mult_out']: rval *= pword.reshape(shape_pword) else: rval += pword.reshape(shape_pword) if state['deep_out']: rval = 
drop_layer(act_layer(rval), use_noise=use_noise) return rval pop_op = Operator(_pop_op) # 3. Constructing the model gater_below = None if state['rec_gating']: gater_below = gater_words[0](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[0](emb(x)) encoder_acts = [ add_op(emb_words[0](emb(x)), x_mask, bs=x_mask.shape[1], si=0, gater_below=gater_below, reseter_below=reseter_below) ] if state['encoder_stack'] > 1: everything = encoder_proj[0](last(encoder_acts[-1])) for si in xrange(1, state['encoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words[si](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[si](emb(x)) encoder_acts.append( add_op(emb_words[si](emb(x)), x_mask, bs=x_mask.shape[1], si=si, state_below=encoder_acts[-1], gater_below=gater_below, reseter_below=reseter_below)) if state['encoder_stack'] > 1: everything += encoder_proj[si](last(encoder_acts[-1])) if state['encoder_stack'] <= 1: encoder = encoder_acts[-1] everything = LastState(ntimes=True, n=y.shape[0])(encoder) else: everything = encoder_act_layer(everything) everything = everything.reshape( [1, everything.shape[0], everything.shape[1]]) everything = LastState(ntimes=True, n=y.shape[0])(everything) if state['bias_code']: init_state = [bc(everything[-1]) for bc in bias_code] else: init_state = [None for bc in bias_code] if state['avg_word']: shape = x.shape pword = emb(x).out.reshape( [shape[0], shape[1], state['rank_n_approx']]) pword = pword * x_mask.dimshuffle(0, 1, 'x') aword = pword.sum(0) / TT.maximum(1., x_mask.sum(0).dimshuffle(0, 'x')) aword = word_code(aword, use_noise=False) else: aword = None gater_below = None if state['rec_gating']: gater_below = gater_words_t[0](emb_t(y0)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[0](emb_t(y0)) has_said = [ add_t_op(emb_words_t[0](emb_t(y0)), everything, y_mask, bs=y_mask.shape[1], gater_below=gater_below, reseter_below=reseter_below, init_state=init_state[0], si=0) ] for si in xrange(1, state['decoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words_t[si](emb_t(y0)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[si](emb_t(y0)) has_said.append( add_t_op(emb_words_t[si](emb_t(y0)), everything, y_mask, bs=y_mask.shape[1], state_below=has_said[-1], gater_below=gater_below, reseter_below=reseter_below, init_state=init_state[si], si=si)) if has_said[0].out.ndim < 3: for si in xrange(state['decoder_stack']): shape_hs = has_said[si].shape if y0.ndim == 1: shape = y0.shape has_said[si] = Shift()(has_said[si].reshape( [shape[0], 1, state['dim_mlp']])) else: shape = y0.shape has_said[si] = Shift()(has_said[si].reshape( [shape[0], shape[1], state['dim_mlp']])) has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :], init_state[si]) has_said[si] = has_said[si].reshape(shape_hs) else: for si in xrange(state['decoder_stack']): has_said[si] = Shift()(has_said[si]) has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :], init_state[si]) model = pop_op(proj_code(everything), has_said, word=y0, aword=aword) nll = output_layer.train( state_below=model, target=y0, mask=y_mask, reg=None) / TT.cast( y.shape[0] * y.shape[1], 'float32') valid_fn = None noise_fn = None x = TT.lvector(name='x') n_steps = TT.iscalar('nsteps') temp = TT.scalar('temp') gater_below = None if state['rec_gating']: gater_below = gater_words[0](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = 
reseter_words[0](emb(x)) encoder_acts = [ add_op(emb_words[0](emb(x), use_noise=False), si=0, use_noise=False, gater_below=gater_below, reseter_below=reseter_below) ] if state['encoder_stack'] > 1: everything = encoder_proj[0](last(encoder_acts[-1]), use_noise=False) for si in xrange(1, state['encoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words[si](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[si](emb(x)) encoder_acts.append( add_op(emb_words[si](emb(x), use_noise=False), si=si, state_below=encoder_acts[-1], use_noise=False, gater_below=gater_below, reseter_below=reseter_below)) if state['encoder_stack'] > 1: everything += encoder_proj[si](last(encoder_acts[-1]), use_noise=False) if state['encoder_stack'] <= 1: encoder = encoder_acts[-1] everything = last(encoder) else: everything = encoder_act_layer(everything) init_state = [] for si in xrange(state['decoder_stack']): if state['bias_code']: init_state.append( TT.reshape(bias_code[si](everything, use_noise=False), [1, state['dim']])) else: init_state.append(TT.alloc(numpy.float32(0), 1, state['dim'])) if state['avg_word']: aword = emb(x, use_noise=False).out.mean(0) aword = word_code(aword, use_noise=False) else: aword = None def sample_fn(*args): aidx = 0 word_tm1 = args[aidx] aidx += 1 prob_tm1 = args[aidx] has_said_tm1 = [] for si in xrange(state['decoder_stack']): aidx += 1 has_said_tm1.append(args[aidx]) aidx += 1 ctx = args[aidx] if state['avg_word']: aidx += 1 awrd = args[aidx] val = pop_op(proj_code(ctx), has_said_tm1, word=word_tm1, aword=awrd, one_step=True, use_noise=False) sample = output_layer.get_sample(state_below=val, temp=temp) logp = output_layer.get_cost(state_below=val.out.reshape( [1, TT.cast(output_layer.n_in, 'int64')]), temp=temp, target=sample.reshape([1, 1]), use_noise=False) gater_below = None if state['rec_gating']: gater_below = gater_words_t[0](emb_t(sample)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[0](emb_t(sample)) has_said_t = [ add_t_op(emb_words_t[0](emb_t(sample)), ctx, prev_val=has_said_tm1[0], gater_below=gater_below, reseter_below=reseter_below, one_step=True, use_noise=True, si=0) ] for si in xrange(1, state['decoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words_t[si](emb_t(sample)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[si](emb_t(sample)) has_said_t.append( add_t_op(emb_words_t[si](emb_t(sample)), ctx, prev_val=has_said_tm1[si], gater_below=gater_below, reseter_below=reseter_below, one_step=True, use_noise=True, si=si, state_below=has_said_t[-1])) for si in xrange(state['decoder_stack']): if isinstance(has_said_t[si], list): has_said_t[si] = has_said_t[si][-1] rval = [sample, TT.cast(logp, 'float32')] + has_said_t return rval sampler_params = [everything] if state['avg_word']: sampler_params.append(aword) states = [TT.alloc(numpy.int64(0), n_steps)] states.append(TT.alloc(numpy.float32(0), n_steps)) states += init_state outputs, updates = scan(sample_fn, states=states, params=sampler_params, n_steps=n_steps, name='sampler_scan') samples = outputs[0] probs = outputs[1] sample_fn = theano.function([n_steps, temp, x], [samples, probs.sum()], updates=updates, profile=False, name='sample_fn') model = LM_Model(cost_layer=nll, weight_noise_amount=state['weight_noise_amount'], valid_fn=valid_fn, sample_fn=sample_fn, clean_before_noise_fn=False, noise_fn=noise_fn, indx_word=state['indx_word_target'], 
indx_word_src=state['indx_word'], character_level=False, rng=rng) if state['loopIters'] > 0: algo = SGD(model, state, train_data) else: algo = None def hook_fn(): if not hasattr(model, 'word_indxs'): model.load_dict() if not hasattr(model, 'word_indxs_src'): model.word_indxs_src = model.word_indxs old_offset = train_data.offset if state['sample_reset']: train_data.reset() ns = 0 for sidx in xrange(state['sample_n']): while True: batch = train_data.next() if batch: break x = batch['x'] y = batch['y'] #xbow = batch['x_bow'] masks = batch['x_mask'] if x.ndim > 1: for idx in xrange(x.shape[1]): ns += 1 if ns > state['sample_max']: break print 'Input: ', for k in xrange(x[:, idx].shape[0]): print model.word_indxs_src[x[:, idx][k]], if model.word_indxs_src[x[:, idx][k]] == '<eol>': break print '' print 'Target: ', for k in xrange(y[:, idx].shape[0]): print model.word_indxs[y[:, idx][k]], if model.word_indxs[y[:, idx][k]] == '<eol>': break print '' senlen = len(x[:, idx]) if len(numpy.where(masks[:, idx] == 0)[0]) > 0: senlen = numpy.where(masks[:, idx] == 0)[0][0] if senlen < 1: continue xx = x[:senlen, idx] #xx = xx.reshape([xx.shape[0], 1]) model.get_samples(state['seqlen'] + 1, 1, xx) else: ns += 1 model.get_samples(state['seqlen'] + 1, 1, x) if ns > state['sample_max']: break train_data.offset = old_offset return main = MainLoop(train_data, valid_data, None, model, algo, state, channel, reset=state['reset'], hooks=hook_fn) if state['reload']: main.load() if state['loopIters'] > 0: main.main() if state['sampler_test']: # This is a test script: we only sample if not hasattr(model, 'word_indxs'): model.load_dict() if not hasattr(model, 'word_indxs_src'): model.word_indxs_src = model.word_indxs indx_word = pkl.load(open(state['word_indx'], 'rb')) try: while True: try: seqin = raw_input('Input Sequence: ') n_samples = int(raw_input('How many samples? ')) alpha = float(raw_input('Inverse Temperature? ')) seqin = seqin.lower() seqin = seqin.split() seqlen = len(seqin) seq = numpy.zeros(seqlen + 1, dtype='int64') for idx, sx in enumerate(seqin): try: seq[idx] = indx_word[sx] except: seq[idx] = indx_word[state['oov']] seq[-1] = state['null_sym_source'] except Exception: print 'Something wrong with your input! Try again!' continue sentences = [] all_probs = [] for sidx in xrange(n_samples): #import ipdb; ipdb.set_trace() [values, probs] = model.sample_fn(seqlen * 3, alpha, seq) sen = [] for k in xrange(values.shape[0]): if model.word_indxs[values[k]] == '<eol>': break sen.append(model.word_indxs[values[k]]) sentences.append(" ".join(sen)) all_probs.append(-probs) sprobs = numpy.argsort(all_probs) for pidx in sprobs: print pidx, "(%f):" % (-all_probs[pidx]), sentences[pidx] print except KeyboardInterrupt: print 'Interrupted' pass
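# The interactive sampler above maps a whitespace-tokenized sentence to word
# indices, falls back to the OOV index for unknown words, and appends the
# null (end-of-sequence) symbol before sampling. A standalone sketch of that
# encoding step (hypothetical helper; `indx_word` is the word->index dict
# loaded from state['word_indx']):
import numpy

def encode_source(seqin, indx_word, oov_token, null_sym):
    words = seqin.lower().split()
    seq = numpy.zeros(len(words) + 1, dtype='int64')
    for idx, word in enumerate(words):
        seq[idx] = indx_word.get(word, indx_word[oov_token])
    seq[-1] = null_sym  # end-of-sequence marker expected by the encoder
    return seq

# e.g. seq = encode_source('how are you ?', indx_word,
#                          state['oov'], state['null_sym_source'])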
def main():
    args = parse_args()

    state = getattr(experiments.nmt, args.proto)()
    if args.state:
        if args.state.endswith(".py"):
            state.update(eval(open(args.state).read()))
        else:
            with open(args.state) as src:
                state.update(cPickle.load(src))
    for change in args.changes:
        state.update(eval("dict({})".format(change)))

    logging.basicConfig(
        level=getattr(logging, state['level']),
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")
    logger.debug("State:\n{}".format(pprint.pformat(state)))

    if 'rolling_vocab' not in state:
        state['rolling_vocab'] = 0
    if 'save_algo' not in state:
        state['save_algo'] = 0
    if 'save_gs' not in state:
        state['save_gs'] = 0
    if 'fixed_embeddings' not in state:
        state['fixed_embeddings'] = False
    if 'save_iter' not in state:
        state['save_iter'] = -1
    if 'var_src_len' not in state:
        state['var_src_len'] = False

    rng = numpy.random.RandomState(state['seed'])
    enc_dec = RNNEncoderDecoder(state, rng, args.skip_init)
    enc_dec.build()
    lm_model = enc_dec.create_lm_model()

    logger.debug("Load data")
    train_data = get_batch_iterator(state, rng)
    logger.debug("Compile trainer")
    algo = eval(state['algo'])(lm_model, state, train_data)

    if state['rolling_vocab']:
        logger.debug("Initializing extra parameters")
        init_extra_parameters(lm_model, state)
        if not state['fixed_embeddings']:
            init_adadelta_extra_parameters(algo, state)
        with open(state['rolling_vocab_dict'], 'rb') as f:
            lm_model.rolling_vocab_dict = cPickle.load(f)
        lm_model.total_num_batches = max(lm_model.rolling_vocab_dict)
        lm_model.Dx_shelve = shelve.open(state['Dx_file'])
        lm_model.Dy_shelve = shelve.open(state['Dy_file'])

    logger.debug("Run training")
    main = MainLoop(train_data, None, None, lm_model, algo, state, None,
                    reset=state['reset'],
                    hooks=[RandomSamplePrinter(state, lm_model, train_data)]
                          if state['hookFreq'] >= 0 else None)
    if state['reload']:
        main.load()
    if state['loopIters'] > 0:
        main.main()

    if state['rolling_vocab']:
        lm_model.Dx_shelve.close()
        lm_model.Dy_shelve.close()
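# The rolling-vocabulary variant above keeps per-segment vocabulary mappings
# in shelve files keyed by batch index, so only a slice of the full softmax
# vocabulary has to be resident at a time. A sketch of the assumed layout
# (the real on-disk format is fixed wherever state['Dx_file'] is produced;
# the string keys and dict values here are assumptions):
import shelve

def lookup_local_vocab(dx_path, batch_index):
    """Fetch the large->small vocabulary dict for one training batch."""
    db = shelve.open(dx_path, flag='r')
    try:
        # shelve keys must be strings in Python 2
        return db[str(batch_index)]
    finally:
        db.close()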
def do_experiment(state, channel): logging.basicConfig(level=logging.DEBUG, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") logger.debug("Starting state: {}".format(state)) def maxout(x): shape = x.shape if x.ndim == 1: shape1 = TT.cast(shape[0] / state['maxout_part'], 'int64') shape2 = TT.cast(state['maxout_part'], 'int64') x = x.reshape([shape1, shape2]) x = x.max(1) else: shape1 = TT.cast(shape[1] / state['maxout_part'], 'int64') shape2 = TT.cast(state['maxout_part'], 'int64') x = x.reshape([shape[0], shape1, shape2]) x = x.max(2) return x logger.info("Start loading") rng = numpy.random.RandomState(state['seed']) train_data, valid_data, test_data = get_data(state) logger.info("Build layers") if state['bs'] == 1: x = TT.lvector('x') x_mask = TT.vector('x_mask') y = TT.lvector('y') y0 = y y_mask = TT.vector('y_mask') else: x = TT.lmatrix('x') x_mask = TT.matrix('x_mask') y = TT.lmatrix('y') y0 = y y_mask = TT.matrix('y_mask') scoring_inputs = [x, x_mask, y, y_mask] bs = state['bs'] # Dimensionality of word embedings. # The same as state['dim'] and in fact equals the number of hidden units. embdim = state['dim_mlp'] logger.info("Source sentence") # Low-rank embeddings emb = MultiLayer( rng, n_in=state['nins'], n_hids=[state['rank_n_approx']], activation=[state['rank_n_activ']], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb') emb_words = [] if state['rec_gating']: gater_words = [] if state['rec_reseting']: reseter_words = [] # si always stands for the number in stack of RNNs (which is actually 1) for si in xrange(state['encoder_stack']): # In paper it is multiplication by W emb_words.append(MultiLayer( rng, n_in=state['rank_n_approx'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb_words_%d'%si)) # In paper it is multiplication by W_z if state['rec_gating']: gater_words.append(MultiLayer( rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias = False, name='gater_words_%d'%si)) # In paper it is multiplication by W_r if state['rec_reseting']: reseter_words.append(MultiLayer( rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias = False, name='reseter_words_%d'%si)) add_rec_step = [] rec_proj = [] if state['rec_gating']: rec_proj_gater = [] if state['rec_reseting']: rec_proj_reseter = [] for si in xrange(state['encoder_stack']): if si > 0: rec_proj.append(MultiLayer( rng, n_in=state['dim'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise'], scale=state['rec_weight_scale'], name='rec_proj_%d'%si)) if state['rec_gating']: rec_proj_gater.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_gater_%d'%si)) if state['rec_reseting']: rec_proj_reseter.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_reseter_%d'%si)) # This should be U from paper 
add_rec_step.append(eval(state['rec_layer'])( rng, n_hids=state['dim'], activation = state['activ'], bias_scale = state['bias'], scale=state['rec_weight_scale'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise_rec'], dropout=state['dropout_rec'], gating=state['rec_gating'], gater_activation=state['rec_gater'], reseting=state['rec_reseting'], reseter_activation=state['rec_reseter'], name='add_h_%d'%si)) def _add_op(words_embeddings, words_mask=None, prev_val=None, si = 0, state_below = None, gater_below = None, reseter_below = None, one_step=False, bs=1, init_state=None, use_noise=True): seqlen = words_embeddings.out.shape[0]//bs rval = words_embeddings gater = None reseter = None if state['rec_gating']: gater = gater_below if state['rec_reseting']: reseter = reseter_below if si > 0: rval += rec_proj[si-1](state_below, one_step=one_step, use_noise=use_noise) if state['rec_gating']: projg = rec_proj_gater[si-1](state_below, one_step=one_step, use_noise = use_noise) if gater: gater += projg else: gater = projg if state['rec_reseting']: projg = rec_proj_reseter[si-1](state_below, one_step=one_step, use_noise = use_noise) if reseter: reseter += projg else: reseter = projg if not one_step: rval= add_rec_step[si]( rval, nsteps=seqlen, batch_size=bs, mask=words_mask, gater_below = gater, reseter_below = reseter, one_step=one_step, init_state=init_state, use_noise = use_noise) else: #Here we link the Encoder part rval= add_rec_step[si]( rval, mask=words_mask, state_before=prev_val, gater_below = gater, reseter_below = reseter, one_step=one_step, init_state=init_state, use_noise = use_noise) return rval add_op = Operator(_add_op) logger.info("Target sequence") emb_t = MultiLayer( rng, n_in=state['nouts'], n_hids=[state['rank_n_approx']], activation=[state['rank_n_activ']], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb_t') emb_words_t = [] if state['rec_gating']: gater_words_t = [] if state['rec_reseting']: reseter_words_t = [] for si in xrange(state['decoder_stack']): emb_words_t.append(MultiLayer( rng, n_in=state['rank_n_approx'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='emb_words_t_%d'%si)) if state['rec_gating']: gater_words_t.append(MultiLayer( rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='gater_words_t_%d'%si)) if state['rec_reseting']: reseter_words_t.append(MultiLayer( rng, n_in=state['rank_n_approx'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='reseter_words_t_%d'%si)) proj_everything_t = [] if state['rec_gating']: gater_everything_t = [] if state['rec_reseting']: reseter_everything_t = [] for si in xrange(state['decoder_stack']): # This stands for the matrix C from the text proj_everything_t.append(MultiLayer( rng, n_in=state['dim'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='proj_everything_t_%d'%si, learn_bias = False)) if state['rec_gating']: gater_everything_t.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], 
weight_noise=state['weight_noise'], scale=state['weight_scale'], name='gater_everything_t_%d'%si, learn_bias = False)) if state['rec_reseting']: reseter_everything_t.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='reseter_everything_t_%d'%si, learn_bias = False)) add_rec_step_t = [] rec_proj_t = [] if state['rec_gating']: rec_proj_t_gater = [] if state['rec_reseting']: rec_proj_t_reseter = [] for si in xrange(state['decoder_stack']): if si > 0: rec_proj_t.append(MultiLayer( rng, n_in=state['dim'], n_hids=[embdim], activation=['lambda x:x'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise'], scale=state['rec_weight_scale'], name='rec_proj_%d'%si)) if state['rec_gating']: rec_proj_t_gater.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_t_gater_%d'%si)) if state['rec_reseting']: rec_proj_t_reseter.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim']], activation=['lambda x:x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias=False, name='rec_proj_t_reseter_%d'%si)) # This one stands for gating, resetting and applying non-linearity in Decoder add_rec_step_t.append(eval(state['rec_layer'])( rng, n_hids=state['dim'], activation = state['activ'], bias_scale = state['bias'], scale=state['rec_weight_scale'], init_fn=state['rec_weight_init_fn'], weight_noise=state['weight_noise_rec'], dropout=state['dropout_rec'], gating=state['rec_gating'], gater_activation=state['rec_gater'], reseting=state['rec_reseting'], reseter_activation=state['rec_reseter'], name='add_h_t_%d'%si)) if state['encoder_stack'] > 1: encoder_proj = [] for si in xrange(state['encoder_stack']): encoder_proj.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim'] * state['maxout_part']], activation=['lambda x: x'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], name='encoder_proj_%d'%si, learn_bias = (si == 0))) encoder_act_layer = UnaryOp(activation=eval(state['unary_activ']), indim=indim, pieces=pieces, rng=rng) # Actually add target opp def _add_t_op(words_embeddings, everything = None, words_mask=None, prev_val=None,one_step=False, bs=1, init_state=None, use_noise=True, gater_below = None, reseter_below = None, si = 0, state_below = None): seqlen = words_embeddings.out.shape[0]//bs rval = words_embeddings gater = None if state['rec_gating']: gater = gater_below reseter = None if state['rec_reseting']: reseter = reseter_below if si > 0: if isinstance(state_below, list): state_below = state_below[-1] rval += rec_proj_t[si-1](state_below, one_step=one_step, use_noise=use_noise) if state['rec_gating']: projg = rec_proj_t_gater[si-1](state_below, one_step=one_step, use_noise = use_noise) if gater: gater += projg else: gater = projg if state['rec_reseting']: projg = rec_proj_t_reseter[si-1](state_below, one_step=one_step, use_noise = use_noise) if reseter: reseter += projg else: reseter = projg if everything: rval = rval + proj_everything_t[si](everything) if state['rec_gating']: everyg = gater_everything_t[si](everything, one_step=one_step, use_noise=use_noise) if gater: gater += everyg else: gater = everyg if state['rec_reseting']: everyg = 
reseter_everything_t[si](everything, one_step=one_step, use_noise=use_noise) if reseter: reseter += everyg else: reseter = everyg if not one_step: rval = add_rec_step_t[si]( rval, nsteps=seqlen, batch_size=bs, mask=words_mask, one_step=one_step, init_state=init_state, gater_below=gater, reseter_below=reseter, use_noise=use_noise) else: # Here we link the Decoder part rval = add_rec_step_t[si]( rval, mask=words_mask, state_before=prev_val, one_step=one_step, gater_below=gater, reseter_below=reseter, use_noise=use_noise) return rval add_t_op = Operator(_add_t_op) outdim = state['dim_mlp'] if not state['deep_out']: outdim = state['rank_n_approx'] if state['bias_code']: bias_code = [] for si in xrange(state['decoder_stack']): bias_code.append(MultiLayer( rng, n_in=state['dim'], n_hids=[state['dim']], activation = [state['activ']], bias_scale = [state['bias']], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], name='bias_code_%d'%si)) if state['avg_word']: word_code_nin = state['rank_n_approx'] word_code = MultiLayer( rng, n_in=word_code_nin, n_hids=[outdim], activation = 'lambda x:x', bias_scale = [state['bias_mlp']/3], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], learn_bias = False, name='word_code') proj_code = MultiLayer( rng, n_in=state['dim'], n_hids=[outdim], activation = 'lambda x: x', bias_scale = [state['bias_mlp']/3], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], learn_bias = False, name='proj_code') proj_h = [] for si in xrange(state['decoder_stack']): proj_h.append(MultiLayer( rng, n_in=state['dim'], n_hids=[outdim], activation = 'lambda x: x', bias_scale = [state['bias_mlp']/3], scale=state['weight_scale'], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], name='proj_h_%d'%si)) if state['bigram']: proj_word = MultiLayer( rng, n_in=state['rank_n_approx'], n_hids=[outdim], activation=['lambda x:x'], bias_scale = [state['bias_mlp']/3], init_fn=state['weight_init_fn'], weight_noise=state['weight_noise'], scale=state['weight_scale'], learn_bias = False, name='emb_words_lm') if state['deep_out']: indim = 0 pieces = 0 act_layer = UnaryOp(activation=eval(state['unary_activ'])) drop_layer = DropOp(rng=rng, dropout=state['dropout']) if state['deep_out']: indim = state['dim_mlp'] / state['maxout_part'] rank_n_approx = state['rank_n_approx'] rank_n_activ = state['rank_n_activ'] else: indim = state['rank_n_approx'] rank_n_approx = 0 rank_n_activ = None output_layer = SoftmaxLayer( rng, indim, state['nouts'], state['weight_scale'], -1, rank_n_approx = rank_n_approx, rank_n_activ = rank_n_activ, weight_noise=state['weight_noise'], init_fn=state['weight_init_fn'], name='out') def _pop_op(everything, accum, everything_max = None, everything_min = None, word = None, aword = None, one_step=False, use_noise=True): rval = proj_h[0](accum[0], one_step=one_step, use_noise=use_noise) for si in xrange(1,state['decoder_stack']): rval += proj_h[si](accum[si], one_step=one_step, use_noise=use_noise) if state['mult_out']: rval = rval * everything else: rval = rval + everything if aword and state['avg_word']: wcode = aword if one_step: if state['mult_out']: rval = rval * wcode else: rval = rval + wcode else: if not isinstance(wcode, TT.TensorVariable): wcode = wcode.out shape = wcode.shape rshape = rval.shape rval = rval.reshape([rshape[0]/shape[0], shape[0], rshape[1]]) wcode = wcode.dimshuffle('x', 0, 1) if state['mult_out']: rval = rval * 
wcode else: rval = rval + wcode rval = rval.reshape(rshape) if word and state['bigram']: if one_step: if state['mult_out']: rval *= proj_word(emb_t(word, use_noise=use_noise), one_step=one_step, use_noise=use_noise) else: rval += proj_word(emb_t(word, use_noise=use_noise), one_step=one_step, use_noise=use_noise) else: if isinstance(word, TT.TensorVariable): shape = word.shape ndim = word.ndim else: shape = word.shape ndim = word.out.ndim pword = proj_word(emb_t(word, use_noise=use_noise), one_step=one_step, use_noise=use_noise) shape_pword = pword.shape if ndim == 1: pword = Shift()(pword.reshape([shape[0], 1, outdim])) else: pword = Shift()(pword.reshape([shape[0], shape[1], outdim])) if state['mult_out']: rval *= pword.reshape(shape_pword) else: rval += pword.reshape(shape_pword) if state['deep_out']: rval = drop_layer(act_layer(rval), use_noise=use_noise) return rval pop_op = Operator(_pop_op) logger.info("Construct the model") gater_below = None if state['rec_gating']: gater_below = gater_words[0](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[0](emb(x)) encoder_acts = [add_op(emb_words[0](emb(x)), x_mask, bs=x_mask.shape[1], si=0, gater_below=gater_below, reseter_below=reseter_below)] if state['encoder_stack'] > 1: everything = encoder_proj[0](last(encoder_acts[-1])) for si in xrange(1,state['encoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words[si](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[si](emb(x)) encoder_acts.append(add_op(emb_words[si](emb(x)), x_mask, bs=x_mask.shape[1], si=si, state_below=encoder_acts[-1], gater_below=gater_below, reseter_below=reseter_below)) if state['encoder_stack'] > 1: everything += encoder_proj[si](last(encoder_acts[-1])) if state['encoder_stack'] <= 1: encoder = encoder_acts[-1] everything = LastState(ntimes=True,n=y.shape[0])(encoder) else: everything = encoder_act_layer(everything) everything = everything.reshape([1, everything.shape[0], everything.shape[1]]) everything = LastState(ntimes=True,n=y.shape[0])(everything) if state['bias_code']: init_state = [bc(everything[-1]) for bc in bias_code] else: init_state = [None for bc in bias_code] if state['avg_word']: shape = x.shape pword = emb(x).out.reshape([shape[0], shape[1], state['rank_n_approx']]) pword = pword * x_mask.dimshuffle(0, 1, 'x') aword = pword.sum(0) / TT.maximum(1., x_mask.sum(0).dimshuffle(0, 'x')) aword = word_code(aword, use_noise=False) else: aword = None gater_below = None if state['rec_gating']: gater_below = gater_words_t[0](emb_t(y0)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[0](emb_t(y0)) has_said = [add_t_op(emb_words_t[0](emb_t(y0)), everything, y_mask, bs=y_mask.shape[1], gater_below = gater_below, reseter_below = reseter_below, init_state=init_state[0], si=0)] for si in xrange(1,state['decoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words_t[si](emb_t(y0)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[si](emb_t(y0)) has_said.append(add_t_op(emb_words_t[si](emb_t(y0)), everything, y_mask, bs=y_mask.shape[1], state_below = has_said[-1], gater_below = gater_below, reseter_below = reseter_below, init_state=init_state[si], si=si)) # has_said are hidden layer states if has_said[0].out.ndim < 3: for si in xrange(state['decoder_stack']): shape_hs = has_said[si].shape if y0.ndim == 1: shape = y0.shape has_said[si] = Shift()(has_said[si].reshape([shape[0], 1, 
state['dim_mlp']])) else: shape = y0.shape has_said[si] = Shift()(has_said[si].reshape([shape[0], shape[1], state['dim_mlp']])) has_said[si] = TT.set_subtensor(has_said[si][0, :, :], init_state[si]) has_said[si] = has_said[si].reshape(shape_hs) else: for si in xrange(state['decoder_stack']): has_said[si] = Shift()(has_said[si]) has_said[si].out = TT.set_subtensor(has_said[si][0, :, :], init_state[si]) model = pop_op(proj_code(everything), has_said, word=y0, aword = aword) nll = output_layer.train(state_below=model, target=y0, mask=y_mask, reg=None) / TT.cast(y.shape[0]*y.shape[1], 'float32') valid_fn = None noise_fn = None x = TT.lvector(name='x') n_steps = TT.iscalar('nsteps') temp = TT.scalar('temp') gater_below = None if state['rec_gating']: gater_below = gater_words[0](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[0](emb(x)) encoder_acts = [add_op(emb_words[0](emb(x),use_noise=False), si=0, use_noise=False, gater_below=gater_below, reseter_below=reseter_below)] if state['encoder_stack'] > 1: everything = encoder_proj[0](last(encoder_acts[-1]), use_noise=False) for si in xrange(1,state['encoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words[si](emb(x)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words[si](emb(x)) encoder_acts.append(add_op(emb_words[si](emb(x),use_noise=False), si=si, state_below=encoder_acts[-1], use_noise=False, gater_below = gater_below, reseter_below = reseter_below)) if state['encoder_stack'] > 1: everything += encoder_proj[si](last(encoder_acts[-1]), use_noise=False) if state['encoder_stack'] <= 1: encoder = encoder_acts[-1] everything = last(encoder) else: everything = encoder_act_layer(everything) init_state = [] for si in xrange(state['decoder_stack']): if state['bias_code']: init_state.append(TT.reshape(bias_code[si](everything, use_noise=False), [1, state['dim']])) else: init_state.append(TT.alloc(numpy.float32(0), 1, state['dim'])) if state['avg_word']: aword = emb(x,use_noise=False).out.mean(0) aword = word_code(aword, use_noise=False) else: aword = None def sample_fn(*args): aidx = 0; word_tm1 = args[aidx] aidx += 1; prob_tm1 = args[aidx] has_said_tm1 = [] for si in xrange(state['decoder_stack']): aidx += 1; has_said_tm1.append(args[aidx]) aidx += 1; ctx = args[aidx] if state['avg_word']: aidx += 1; awrd = args[aidx] else: awrd = None val = pop_op(proj_code(ctx), has_said_tm1, word=word_tm1, aword=awrd, one_step=True, use_noise=False) sample = output_layer.get_sample(state_below=val, temp=temp) logp = output_layer.get_cost( state_below=val.out.reshape([1, TT.cast(output_layer.n_in, 'int64')]), temp=temp, target=sample.reshape([1,1]), use_noise=False) gater_below = None if state['rec_gating']: gater_below = gater_words_t[0](emb_t(sample)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[0](emb_t(sample)) has_said_t = [add_t_op(emb_words_t[0](emb_t(sample)), ctx, prev_val=has_said_tm1[0], gater_below=gater_below, reseter_below=reseter_below, one_step=True, use_noise=True, si=0)] for si in xrange(1, state['decoder_stack']): gater_below = None if state['rec_gating']: gater_below = gater_words_t[si](emb_t(sample)) reseter_below = None if state['rec_reseting']: reseter_below = reseter_words_t[si](emb_t(sample)) has_said_t.append(add_t_op(emb_words_t[si](emb_t(sample)), ctx, prev_val=has_said_tm1[si], gater_below=gater_below, reseter_below=reseter_below, one_step=True, use_noise=True, si=si, state_below=has_said_t[-1])) for si in 
xrange(state['decoder_stack']): if isinstance(has_said_t[si], list): has_said_t[si] = has_said_t[si][-1] rval = [sample, TT.cast(logp, 'float32')] + has_said_t return rval sampler_params = [everything] if state['avg_word']: sampler_params.append(aword) states = [TT.alloc(numpy.int64(0), n_steps)] states.append(TT.alloc(numpy.float32(0), n_steps)) states += init_state outputs, updates = scan(sample_fn, states = states, params = sampler_params, n_steps= n_steps, name='sampler_scan' ) samples = outputs[0] probs = outputs[1] sample_fn = theano.function( [n_steps, temp, x], [samples, probs.sum()], updates=updates, profile=False, name='sample_fn') model = LM_Model( cost_layer=nll, weight_noise_amount=state['weight_noise_amount'], valid_fn=valid_fn, sample_fn=sample_fn, clean_before_noise_fn = False, noise_fn=noise_fn, indx_word=state['indx_word_target'], indx_word_src=state['indx_word'], character_level=False, rng=rng) algo = SGD(model, state, train_data) def hook_fn(): if not hasattr(model, 'word_indxs'): model.load_dict() if not hasattr(model, 'word_indxs_src'): model.word_indxs_src = model.word_indxs old_offset = train_data.offset if state['sample_reset']: train_data.reset() ns = 0 for sidx in xrange(state['sample_n']): while True: batch = train_data.next() if batch: break x = batch['x'] y = batch['y'] #xbow = batch['x_bow'] masks = batch['x_mask'] if x.ndim > 1: for idx in xrange(x.shape[1]): ns += 1 if ns > state['sample_max']: break print 'Input: ', for k in xrange(x[:,idx].shape[0]): print model.word_indxs_src[x[:,idx][k]], if model.word_indxs_src[x[:,idx][k]] == '<eol>': break print '' print 'Target: ', for k in xrange(y[:,idx].shape[0]): print model.word_indxs[y[:,idx][k]], if model.word_indxs[y[:,idx][k]] == '<eol>': break print '' senlen = len(x[:,idx]) if len(numpy.where(masks[:,idx]==0)[0]) > 0: senlen = numpy.where(masks[:,idx]==0)[0][0] if senlen < 1: continue xx = x[:senlen, idx] #xx = xx.reshape([xx.shape[0], 1]) model.get_samples(state['seqlen']+1, 1, xx) else: ns += 1 model.get_samples(state['seqlen']+1, 1, x) if ns > state['sample_max']: break train_data.offset = old_offset return main = MainLoop(train_data, valid_data, None, model, algo, state, channel, reset = state['reset'], hooks = hook_fn) if state['reload']: main.load() main.main() if state["scoring"]: score_file = open(state["score_file"], "w") logger.info("Compiling score function") score_fn = theano.function(scoring_inputs, [-nll.cost_per_sample]) count = 0 n_samples = 0 logger.info('Scoring phrases') for batch in train_data: if batch == None: continue if batch['x'].shape[0] <= 0 or \ batch['x_mask'].shape[0] <= 0 or \ batch['y'].shape[0] <= 0 or \ batch['y_mask'].shape[0] <= 0: logger.error('Wrong batch!!!') continue st = time.time() [scores] = score_fn(batch['x'], batch['x_mask'], batch['y'], batch['y_mask']) up_time = time.time() - st for s in scores: print >>score_file, "{:.5f}".format(float(s)) n_samples += batch['x'].shape[1] count += 1 if state['flush_scores'] >= 1 and count % state['flush_scores'] == 0: score_file.flush() logger.debug("Scores flushed") logger.debug("{} batches, {} samples, {} per sample; example scores: {}".format( count, n_samples, up_time/scores.shape[0], scores[:5])) logger.info("Done") score_file.flush() score_file.close() if state['sampler_test']: # This is a test script: we only sample if not hasattr(model, 'word_indxs'): model.load_dict() if not hasattr(model, 'word_indxs_src'): model.word_indxs_src = model.word_indxs indx_word=pkl.load(open(state['word_indx'],'rb')) try: while True: 
try: seqin = raw_input('Input Sequence: ') n_samples = int(raw_input('How many samples? ')) alpha = float(raw_input('Inverse Temperature? ')) seqin = seqin.lower() seqin = seqin.split() seqlen = len(seqin) seq = numpy.zeros(seqlen+1, dtype='int64') for idx,sx in enumerate(seqin): try: seq[idx] = indx_word[sx] except: seq[idx] = indx_word[state['oov']] seq[-1] = state['null_sym_source'] except Exception: print 'Something wrong with your input! Try again!' continue sentences = [] all_probs = [] for sidx in xrange(n_samples): #import ipdb; ipdb.set_trace() [values, probs] = model.sample_fn(seqlen * 3, alpha, seq) sen = [] for k in xrange(values.shape[0]): if model.word_indxs[values[k]] == '<eol>': break sen.append(model.word_indxs[values[k]]) sentences.append(" ".join(sen)) all_probs.append(-probs) sprobs = numpy.argsort(all_probs) for pidx in sprobs: print pidx,"(%f):"%(-all_probs[pidx]),sentences[pidx] print except KeyboardInterrupt: print 'Interrupted' pass
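# Both sampler_test loops above collect n_samples candidate translations
# together with their scores and print them sorted via numpy.argsort over the
# accumulated values. A standalone sketch of that ranking step (hypothetical
# helper; `sentences` and `all_probs` are built exactly as in the loop above):
import numpy

def rank_samples(sentences, all_probs):
    """Return (score, sentence) pairs in the order the loop prints them."""
    order = numpy.argsort(all_probs)
    return [(-all_probs[i], sentences[i]) for i in order]

# e.g.
# for score, sen in rank_samples(sentences, numpy.array(all_probs)):
#     print '%f: %s' % (score, sen)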