コード例 #1
0
    def setup_encode(self):
        """Compile ``self.encode_fun``, the Theano function used for encoding.

        Each entry of ``self.enc_lstmstacks`` preprocesses the melody of the
        matching encoding (with ``deterministic_dropout=True``); the resulting
        activations are added together and handed to
        ``self.qman.get_strengths_and_vects``.

        The compiled function takes, in order: ``chord_types``,
        ``chord_roots``, one relative-position matrix per encoding, then one
        encoded melody per encoding. It returns ``[strengths, vects]``.
        """
        # Symbolic inputs shared by every encoder stack.
        chord_types = T.btensor3()  # dimensions: (batch, time, 12)
        chord_roots = T.imatrix()   # dimensions: (batch, time)

        # One symbolic input of each kind per encoding.
        relative_posns = [T.imatrix() for _ in self.encodings]     # (batch, time)
        encoded_melodies = [T.btensor3() for _ in self.encodings]  # (batch, time, output_data)

        n_batch, n_time = chord_roots.shape

        all_activations = []
        per_stack_inputs = zip(self.encodings, self.enc_lstmstacks,
                               encoded_melodies, relative_posns)
        for _encoding, stack, melody, rel_pos in per_stack_inputs:
            # Every batch row receives the same 0..n_time-1 timestep indices.
            timesteps = T.tile(T.arange(n_time), (n_batch, 1))
            all_activations.append(stack.do_preprocess_scan(
                timestep=timesteps,
                relative_position=rel_pos,
                cur_chord_type=chord_types,
                cur_chord_root=chord_roots,
                cur_input=melody,
                deterministic_dropout=True))

        # Add the activations of all stacks together.
        reduced_activations = functools.reduce(
            lambda total, current: total + current, all_activations)
        strengths, vects = self.qman.get_strengths_and_vects(reduced_activations)

        # NaN/inf/big-value checking is only enabled when self.nanguard is set.
        if self.nanguard:
            mode = NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)
        else:
            mode = None

        self.encode_fun = theano.function(
            inputs=[chord_types, chord_roots] + relative_posns + encoded_melodies,
            outputs=[strengths, vects],
            allow_input_downcast=True,
            mode=mode)
コード例 #2
0
def main(config, tr_stream):
    """Reload a saved model and run ``embedding`` on its character embeddings.

    The encoder/decoder graph is rebuilt, a ``MainLoop`` without algorithm or
    data stream is created only so that the ``LoadNMT`` extension can run its
    ``before_training`` hook, and the encoder's decimator output is then
    passed to ``embedding`` together with the source vocabulary.

    Args:
        config: mapping of settings; the keys read here are 'src_vocab',
            'src_vocab_size', 'enc_embed', 'src_dgru_nhids', 'enc_nhids',
            'src_dgru_depth', 'bidir_encoder_depth', 'trg_vocab_size',
            'dec_embed', 'trg_dgru_nhids', 'trg_igru_nhids', 'dec_nhids',
            'transition_depth', 'trg_igru_depth', 'trg_dgru_depth' and
            'saveto'.
        tr_stream: object providing ``trg_bos`` and ``space_idx['target']``.
    """
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    target_bos_idx = tr_stream.trg_bos
    target_space_idx = tr_stream.space_idx['target']

    # Close the vocabulary file deterministically instead of leaking the handle.
    # NOTE(review): pickle can execute arbitrary code while loading; the
    # vocabulary file must come from a trusted source.
    with open(config['src_vocab'], 'rb') as vocab_file:
        src_vocab = pickle.load(vocab_file)

    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['src_dgru_nhids'],
                                   config['enc_nhids'], config['src_dgru_depth'], config['bidir_encoder_depth'])

    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['trg_dgru_nhids'], config['trg_igru_nhids'],
                      config['dec_nhids'], config['enc_nhids'] * 2, config['transition_depth'], config['trg_igru_depth'],
                      config['trg_dgru_depth'], target_space_idx, target_bos_idx)

    representation = encoder.apply(source_char_seq, source_sample_matrix, source_char_aux,
                                   source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq, target_sample_matrix,
                        target_resample_matrix, target_char_aux, target_char_mask,
                        target_word_mask, target_prev_char_seq, target_prev_char_aux)

    # Set up model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    # Reload model if necessary
    extensions = [LoadNMT(config['saveto'])]

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=None,
        data_stream=None,
        extensions=extensions
    )

    # No training happens here: the loop only serves as the context in which
    # the extensions' 'before_training' callbacks are triggered.
    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')

    char_embedding = encoder.decimator.apply(source_char_seq.T, source_sample_matrix, source_char_aux.T)
    embedding(Model(char_embedding), src_vocab)
コード例 #3
0
def make_functions(
        input_size, output_size, mem_size, mem_width, hidden_sizes=(100,)):
    """Build the model and compile its Theano training and test functions.

    Args:
        input_size, output_size, mem_size, mem_width: forwarded unchanged to
            ``model.build``.
        hidden_sizes: sequence of hidden sizes; only ``hidden_sizes[0]`` is
            used. The default is an immutable tuple so it cannot be shared
            and mutated across calls.

    Returns:
        Tuple ``(P, P_learn, train, test)``: the model parameters, the
        parameters holding the RMSprop state, the compiled training function
        (outputs the mean cross-entropy and applies the updates) and the
        compiled test function (outputs ``bits_loss``).
    """
    # Local import: progress messages go through sys.stdout so this function
    # parses and behaves the same under Python 2 and Python 3.
    import sys

    start_time = time.time()

    input_seqs  = T.btensor3('input_sequences')
    output_seqs = T.btensor3('output_sequences')

    P = Parameters()
    process = model.build(P,
            input_size, output_size, mem_size, mem_width, hidden_sizes[0])
    outputs = process(T.cast(input_seqs,'float32'))
    output_length = (input_seqs.shape[1] - 2) // 2

    # Only the last output_length steps are scored, and the last two entries
    # of the final axis are left out of the loss.
    Y = output_seqs[:,-output_length:,:-2]
    Y_hat = T.nnet.sigmoid(outputs[:,-output_length:,:-2])

    cross_entropy = T.mean(T.nnet.binary_crossentropy(Y_hat,Y))
    # Mean cross-entropy scaled by Y.shape[1] * Y.shape[2], converted from
    # nats to bits.
    bits_loss = cross_entropy * (Y.shape[1] * Y.shape[2]) / T.log(2)

    params = P.values()

    cost = cross_entropy # + 1e-5 * sum(T.sum(T.sqr(w)) for w in params)

    # Written without a newline so the following "Done." lands on the same
    # line; flushed so the message is visible while gradients are computed.
    sys.stdout.write("Computing gradients ")
    sys.stdout.flush()
    grads = T.grad(cost, wrt=params)
    # NOTE(review): clip_length is not defined in this function; presumably a
    # module-level constant -- confirm.
    grads = updates.clip_deltas(grads, np.float32(clip_length))

    print("Done. (%0.3f s)"%(time.time() - start_time))
    start_time = time.time()
    sys.stdout.write("Compiling function ")
    sys.stdout.flush()
    P_learn = Parameters()

    update_pairs = updates.rmsprop(
                params, grads,
                learning_rate=1e-4,
                P=P_learn
            )

    train = theano.function(
            inputs=[input_seqs, output_seqs],
            outputs=cross_entropy,
            updates=update_pairs,
        )

    test = theano.function(
            inputs=[input_seqs, output_seqs],
            outputs=bits_loss
        )

    print("Done. (%0.3f s)"%(time.time() - start_time))
    print(P.parameter_count())
    return P, P_learn, train, test
コード例 #4
0
def make_functions(input_size,
                   output_size,
                   mem_size,
                   mem_width,
                   hidden_sizes=(100,)):
    """Build the model and compile its Theano training and test functions.

    Args:
        input_size, output_size, mem_size, mem_width: forwarded unchanged to
            ``model.build``.
        hidden_sizes: sequence of hidden sizes; only ``hidden_sizes[0]`` is
            used. The default is an immutable tuple so it cannot be shared
            and mutated across calls.

    Returns:
        Tuple ``(P, P_learn, train, test)``: the model parameters, the
        parameters holding the RMSprop state, the compiled training function
        (outputs the mean cross-entropy and applies the updates) and the
        compiled test function (outputs ``bits_loss``).
    """
    # Local import: progress messages go through sys.stdout so this function
    # parses and behaves the same under Python 2 and Python 3.
    import sys

    start_time = time.time()

    input_seqs = T.btensor3('input_sequences')
    output_seqs = T.btensor3('output_sequences')

    P = Parameters()
    process = model.build(P, input_size, output_size, mem_size, mem_width,
                          hidden_sizes[0])
    outputs = process(T.cast(input_seqs, 'float32'))
    output_length = (input_seqs.shape[1] - 2) // 2

    # Only the last output_length steps are scored, and the last two entries
    # of the final axis are left out of the loss.
    Y = output_seqs[:, -output_length:, :-2]
    Y_hat = T.nnet.sigmoid(outputs[:, -output_length:, :-2])

    cross_entropy = T.mean(T.nnet.binary_crossentropy(Y_hat, Y))
    # Mean cross-entropy scaled by Y.shape[1] * Y.shape[2], converted from
    # nats to bits.
    bits_loss = cross_entropy * (Y.shape[1] * Y.shape[2]) / T.log(2)

    params = P.values()

    cost = cross_entropy  # + 1e-5 * sum(T.sum(T.sqr(w)) for w in params)

    # Written without a newline so the following "Done." lands on the same
    # line; flushed so the message is visible while gradients are computed.
    sys.stdout.write("Computing gradients ")
    sys.stdout.flush()
    grads = T.grad(cost, wrt=params)
    # NOTE(review): clip_length is not defined in this function; presumably a
    # module-level constant -- confirm.
    grads = updates.clip_deltas(grads, np.float32(clip_length))

    print("Done. (%0.3f s)" % (time.time() - start_time))
    start_time = time.time()
    sys.stdout.write("Compiling function ")
    sys.stdout.flush()
    P_learn = Parameters()

    update_pairs = updates.rmsprop(params,
                                   grads,
                                   learning_rate=1e-4,
                                   P=P_learn)

    train = theano.function(
        inputs=[input_seqs, output_seqs],
        outputs=cross_entropy,
        updates=update_pairs,
    )

    test = theano.function(inputs=[input_seqs, output_seqs], outputs=bits_loss)

    print("Done. (%0.3f s)" % (time.time() - start_time))
    print(P.parameter_count())
    return P, P_learn, train, test
コード例 #5
0
    def setup_train(self):
        """Compile ``self.update_fun`` (training) and ``self.eval_fun`` (evaluation).

        Both functions take ``chord_types, chord_roots, relative_pos,
        encoded_melody, correct_notes`` and return the loss followed by the
        values of the accompanying info dict. The loss graph is built twice:
        with ``deterministic_dropout=False`` for training and ``True`` for
        evaluation. Only ``update_fun`` applies the Adam updates.
        ``self.loss_info_keys`` records the keys of the training info dict.
        """
        # Symbolic inputs.
        chord_types = T.btensor3()     # dimensions: (batch, time, 12)
        chord_roots = T.imatrix()      # dimensions: (batch, time)
        relative_pos = T.imatrix()     # dimensions: (batch, time)
        encoded_melody = T.btensor3()  # dimensions: (batch, time, output_data)
        correct_notes = T.imatrix()    # dimensions: (batch, time)

        n_batch, n_time = relative_pos.shape

        def _nanguard_mode():
            # A fresh mode object per compiled function, or None when disabled.
            if not self.nanguard:
                return None
            return NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)

        def _build(det_dropout):
            # The "last output" seen at each step is the melody shifted right
            # by one step along the time axis, with the encoding's initial
            # encoded form placed in front.
            first_step = T.tile(self.encoding.initial_encoded_form(), (n_batch, 1, 1))
            shifted_melody = T.concatenate([first_step, encoded_melody[:, :-1, :]], 1)

            activations = self.lstmstack.do_preprocess_scan(
                timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                relative_position=relative_pos,
                cur_chord_type=chord_types,
                cur_chord_root=chord_roots,
                last_output=shifted_melody,
                deterministic_dropout=det_dropout)

            out_probs = self.encoding.decode_to_probs(
                activations, relative_pos,
                self.bounds.lowbound, self.bounds.highbound)
            return Encoding.compute_loss(out_probs, correct_notes, True)

        # Training graph first, then its updates, then the evaluation graph.
        train_loss, train_info = _build(False)
        updates = Adam(train_loss, self.params, lr=self.learning_rate_var)
        eval_loss, eval_info = _build(True)

        self.loss_info_keys = list(train_info.keys())

        fn_inputs = [chord_types, chord_roots, relative_pos, encoded_melody, correct_notes]

        self.update_fun = theano.function(
            inputs=list(fn_inputs),
            outputs=[train_loss] + list(train_info.values()),
            updates=updates,
            allow_input_downcast=True,
            mode=_nanguard_mode())

        self.eval_fun = theano.function(
            inputs=list(fn_inputs),
            outputs=[eval_loss] + list(eval_info.values()),
            allow_input_downcast=True,
            mode=_nanguard_mode())
コード例 #6
0
    def __init__(self, num_chars, char_dim, max_word_len, embed_dim):
        """Store the hyper-parameters and compile the two-output function ``self.fn``.

        ``self.inps`` holds the symbolic inputs in the order
        ``[chars1, chars2, mask1, mask2]``; ``self.fn`` maps them to the
        outputs of the two layers returned by ``self.build_network()``.
        """
        self.num_chars = num_chars
        self.char_dim = char_dim
        self.max_word_len = max_word_len
        self.embed_dim = embed_dim

        # Two int32 3-d character inputs followed by two int8 3-d masks.
        char_inputs = [T.itensor3(), T.itensor3()]
        mask_inputs = [T.btensor3(), T.btensor3()]
        self.inps = char_inputs + mask_inputs

        # build_network is expected to hand back exactly two layers.
        first_layer, second_layer = self.build_network()
        layer_outputs = [L.get_output(first_layer), L.get_output(second_layer)]

        self.fn = theano.function(self.inps, layer_outputs)
コード例 #7
0
    def setup_generate(self):
        """Compile ``self.generate_fun`` and ``self.generate_visualize_fun``.

        One sampling spec is prepared per (lstmstack, encoding) pair and all
        of them are combined by ``helper_generate_from_spec``. Both compiled
        functions take ``[chord_roots, chord_types]`` (in that order) and
        apply the returned updates; ``generate_fun`` outputs the chosen notes,
        ``generate_visualize_fun`` additionally outputs the combined and the
        individual probabilities.
        """
        chord_types = T.btensor3()  # dimensions: (batch, time, 12)
        chord_roots = T.imatrix()   # dimensions: (batch, time)

        n_batch, n_time = chord_roots.shape

        def _nanguard_mode():
            # A fresh mode object per compiled function, or None when disabled.
            if not self.nanguard:
                return None
            return NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)

        specs = []
        for lstmstack, encoding in zip(self.lstmstacks, self.encodings):
            # Every batch row starts from the encoding's starting position
            # and its initial encoded form.
            start_pos = T.alloc(np.array(encoding.STARTING_POSITION, np.int32), n_batch)
            start_out = T.tile(encoding.initial_encoded_form(), (n_batch, 1))
            specs.append(lstmstack.prepare_sample_scan(
                start_pos=start_pos,
                start_out=start_out,
                timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                cur_chord_type=chord_types,
                cur_chord_root=chord_roots,
                deterministic_dropout=True))

        updates, all_chosen, all_probs, indiv_probs = helper_generate_from_spec(
            specs, self.lstmstacks, self.encodings, self.srng,
            n_batch, n_time, self.bounds, self.normalize_artic_only)

        self.generate_fun = theano.function(
            inputs=[chord_roots, chord_types],
            updates=updates,
            outputs=all_chosen,
            allow_input_downcast=True,
            mode=_nanguard_mode())

        self.generate_visualize_fun = theano.function(
            inputs=[chord_roots, chord_types],
            updates=updates,
            outputs=[all_chosen, all_probs] + indiv_probs,
            allow_input_downcast=True,
            mode=_nanguard_mode())
コード例 #8
0
ファイル: theano_utils.py プロジェクト: pcyin/nn
def ndim_btensor(ndim, name=None):
    """Create a symbolic Theano variable selected by ``ndim``.

    ``ndim`` equal to 2, 3 or 4 yields ``T.bmatrix``, ``T.btensor3`` or
    ``T.btensor4`` respectively. Any other value falls through to
    ``T.imatrix``. ``name`` is forwarded to the chosen constructor.
    """
    if ndim == 2:
        return T.bmatrix(name)
    if ndim == 3:
        return T.btensor3(name)
    if ndim == 4:
        return T.btensor4(name)
    # NOTE(review): the fallback is built with imatrix rather than a b* type,
    # whatever ndim is -- confirm this is intended.
    return T.imatrix(name)
コード例 #9
0
ファイル: theano_utils.py プロジェクト: chubbymaggie/NL2code
def ndim_btensor(ndim, name=None):
    """Create a symbolic Theano variable selected by ``ndim``.

    The constructor is ``T.bmatrix`` for 2, ``T.btensor3`` for 3 and
    ``T.btensor4`` for 4; every other ``ndim`` uses ``T.imatrix``.
    ``name`` is forwarded to the chosen constructor.
    """
    # Pick the constructor first, then call it once at the end.
    if ndim == 2:
        make_variable = T.bmatrix
    elif ndim == 3:
        make_variable = T.btensor3
    elif ndim == 4:
        make_variable = T.btensor4
    else:
        # NOTE(review): fallback uses imatrix rather than a b* type for any
        # other ndim -- confirm this is intended.
        make_variable = T.imatrix
    return make_variable(name)
コード例 #10
0
    def setup_generate(self):
        """Compile ``self.generate_fun`` and ``self.generate_visualize_fun``.

        A single sampling spec is prepared from ``self.lstmstack`` and driven
        by ``theano.scan``. Both compiled functions take
        ``[chord_roots, chord_types]`` (in that order) and apply the scan
        updates; ``generate_fun`` outputs the chosen notes and
        ``generate_visualize_fun`` outputs the chosen notes plus their
        probabilities.
        """
        chord_types = T.btensor3()  # dimensions: (batch, time, 12)
        chord_roots = T.imatrix()   # dimensions: (batch, time)

        n_batch, n_time = chord_roots.shape

        def _nanguard_mode():
            # A fresh mode object per compiled function, or None when disabled.
            if not self.nanguard:
                return None
            return NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True)

        spec = self.lstmstack.prepare_sample_scan(
            start_pos=T.alloc(np.array(self.encoding.STARTING_POSITION, np.int32), n_batch),
            start_out=T.tile(self.encoding.initial_encoded_form(), (n_batch, 1)),
            timestep=T.tile(T.arange(n_time), (n_batch, 1)),
            cur_chord_type=chord_types,
            cur_chord_root=chord_roots,
            deterministic_dropout=True)

        def _scan_fn(*inputs):
            # inputs is [ spec_sequences..., last_absolute_position, spec_taps..., spec_non_sequences... ]
            # Split out the previously chosen absolute note, which sits right
            # after the spec's sequences; everything else goes to the routine.
            split_at = len(spec.sequences)
            last_absolute_chosen = inputs[split_at]
            routine_inputs = list(inputs[:split_at]) + list(inputs[split_at + 1:])

            low = self.bounds.lowbound
            high = self.bounds.highbound

            scan_rout = self.lstmstack.sample_scan_routine(spec, *routine_inputs)

            # Step 1: the routine yields the previous position/output.
            last_rel_pos, last_out, cur_kwargs = scan_rout.send(None)

            new_pos = self.encoding.get_new_relative_position(
                last_absolute_chosen, last_rel_pos, last_out, low, high, **cur_kwargs)

            # Step 2: feed the new position back and receive the activations.
            out_activations = scan_rout.send((new_pos, {"last_output": last_out}))
            out_probs = self.encoding.decode_to_probs(out_activations, new_pos, low, high)
            sampled_note = Encoding.sample_absolute_probs(self.srng, out_probs)
            encoded_output = self.encoding.note_to_encoding(sampled_note, new_pos, low, high)

            # Step 3: feed the encoded sample back and collect the scan outputs.
            scan_outputs = scan_rout.send(encoded_output)
            scan_rout.close()

            return [sampled_note, out_probs] + scan_outputs

        # The sampled note is recurrent (tap -1, starting at zeros); the
        # probabilities are a plain output without recurrence.
        chosen_info = {"initial": T.zeros((n_batch,), 'int32'), "taps": [-1]}
        outputs_info = [chosen_info, None] + spec.outputs_info

        result, updates = theano.scan(
            fn=_scan_fn,
            sequences=spec.sequences,
            non_sequences=spec.non_sequences,
            outputs_info=outputs_info)

        # Swap the first two axes of the scan results.
        all_chosen = result[0].dimshuffle((1, 0))
        all_probs = result[1].dimshuffle((1, 0, 2))

        self.generate_fun = theano.function(
            inputs=[chord_roots, chord_types],
            updates=updates,
            outputs=all_chosen,
            allow_input_downcast=True,
            mode=_nanguard_mode())

        self.generate_visualize_fun = theano.function(
            inputs=[chord_roots, chord_types],
            updates=updates,
            outputs=[all_chosen, all_probs],
            allow_input_downcast=True,
            mode=_nanguard_mode())
コード例 #11
0
def BuildModel(modelSpecs, forTrain=True):
	"""Create the symbolic inputs and the ResNet4DistMatrix predictor.

	Returns the tuple ``(distancePredictor, x, y, xmask, ymask, xem,
	labelList, weightList)``. ``xem`` is None unless
	``config.EmbeddingUsed(modelSpecs)`` is true. ``labelList`` is only
	filled when ``forTrain`` is true and ``weightList`` only when there are
	labels and ``modelSpecs['UseSampleWeight']`` is set; for prediction both
	lists are empty.
	"""
	rng = np.random.RandomState()

	## x holds the sequential features, y the matrix (pairwise) features
	x = T.tensor3('x')
	y = T.tensor4('y')

	## masks for x and y, respectively
	xmask = T.bmatrix('xmask')
	ymask = T.btensor3('ymask')

	## keyword arguments common to both ways of building the predictor
	predictorArgs = dict(seqInput=x, matrixInput=y,
						 mask_seq=xmask, mask_matrix=ymask,
						 modelSpecs=modelSpecs)

	## the embedding input is only created and passed on when it is used
	xem = None
	if config.EmbeddingUsed(modelSpecs):
		xem = T.tensor3('xem')
		predictorArgs['embedInput'] = xem

	distancePredictor = ResNet4DistMatrix(rng, **predictorArgs)

	## labelList is a list of label tensors, each having shape (batchSize, seqLen, seqLen) or (batchSize, seqLen, seqLen, valueDims[response] )
	labelList = []
	if forTrain:
		## label variables are only needed when the model is used for training
		for response in modelSpecs['responses']:
			labelType = Response2LabelType(response)
			isVector = config.responseValueDims[labelType] > 1

			## a vector-valued response needs a 4-d tensor, otherwise 3-d;
			## discrete labels use the 16-bit integer (w*) variants
			if labelType.startswith('Discrete'):
				makeLabel = T.wtensor4 if isVector else T.wtensor3
			else:
				makeLabel = T.tensor4 if isVector else T.tensor3
			labelList.append( makeLabel('Tlabel4' + response) )

	## weightList is a list of label weight tensors, each having shape (batchSize, seqLen, seqLen)
	weightList = []
	if labelList and modelSpecs['UseSampleWeight']:
		for response in modelSpecs['responses']:
			weightList.append( T.tensor3('Tweight4' + response) )

	return distancePredictor, x, y, xmask, ymask, xem, labelList, weightList
コード例 #12
0
    def setup_encode(self):
        """Compile ``self.encode_fun``, the Theano function used for encoding.

        Every stack in ``self.enc_lstmstacks`` preprocesses the melody of its
        encoding with ``deterministic_dropout=True``. The activations of all
        stacks are added together and ``self.qman.get_strengths_and_vects``
        turns the sum into the two outputs ``[strengths, vects]``.

        Input order of the compiled function: ``chord_types``,
        ``chord_roots``, the relative-position matrices (one per encoding),
        then the encoded melodies (one per encoding).
        """
        chord_types = T.btensor3()  # dimensions: (batch, time, 12)
        chord_roots = T.imatrix()   # dimensions: (batch, time)
        relative_posns = [T.imatrix() for _ in self.encodings]     # (batch, time)
        encoded_melodies = [T.btensor3() for _ in self.encodings]  # (batch, time, output_data)
        n_batch, n_time = chord_roots.shape

        # One activation tensor per (encoding, stack, melody, position) group.
        all_activations = [
            stack.do_preprocess_scan(
                timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                relative_position=rel_pos,
                cur_chord_type=chord_types,
                cur_chord_root=chord_roots,
                cur_input=melody,
                deterministic_dropout=True)
            for _encoding, stack, melody, rel_pos in zip(
                self.encodings, self.enc_lstmstacks,
                encoded_melodies, relative_posns)
        ]

        # Add the activations of all stacks together.
        reduced_activations = functools.reduce(
            lambda total, current: total + current, all_activations)
        strengths, vects = self.qman.get_strengths_and_vects(reduced_activations)

        # NaN/inf/big-value checking is only enabled when self.nanguard is set.
        mode = None
        if self.nanguard:
            mode = NanGuardMode(nan_is_error=True,
                                inf_is_error=True,
                                big_is_error=True)

        fn_inputs = [chord_types, chord_roots] + relative_posns + encoded_melodies
        self.encode_fun = theano.function(
            inputs=fn_inputs,
            outputs=[strengths, vects],
            allow_input_downcast=True,
            mode=mode)
コード例 #13
0
ファイル: training.py プロジェクト: npow/DCNMT
def main(config, tr_stream, dev_stream):
    """Build the encoder-decoder model and run the training main loop.

    Args:
        config: mapping of hyper-parameters and settings. Besides the model
            sizes it is read for 'weight_scale', 'step_clipping',
            'step_rule', 'print_freq', 'finish_after', 'saveto', 'save_freq',
            'hook_samples', 'sampling_freq', 'bleu_script',
            'normalized_bleu', 'bleu_val_freq' and 'reload'.
        tr_stream: training data stream; also provides ``trg_bos`` and
            ``space_idx['target']``.
        dev_stream: data stream handed to the BLEU validator.
    """
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    target_bos_idx = tr_stream.trg_bos
    target_space_idx = tr_stream.space_idx['target']

    # Construct model
    logger.info('Building RNN encoder-decoder')

    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'],
                                   config['src_dgru_nhids'],
                                   config['enc_nhids'],
                                   config['src_dgru_depth'],
                                   config['bidir_encoder_depth'])

    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['trg_dgru_nhids'], config['trg_igru_nhids'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['transition_depth'], config['trg_igru_depth'],
                      config['trg_dgru_depth'], target_space_idx,
                      target_bos_idx)

    representation = encoder.apply(source_char_seq, source_sample_matrix,
                                   source_char_aux, source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq,
                        target_sample_matrix, target_resample_matrix,
                        target_char_aux, target_char_mask, target_word_mask,
                        target_prev_char_seq, target_prev_char_aux)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    # Order matters here: the Gaussian/constant defaults are pushed to all
    # children first, the recurrent transitions are then overridden with
    # Orthogonal initialisation, and only afterwards are the bricks
    # initialised.
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    for layer_n in range(config['src_dgru_depth']):
        encoder.decimator.dgru.transitions[layer_n].weights_init = Orthogonal()
    for layer_n in range(config['bidir_encoder_depth']):
        encoder.children[
            1 + layer_n].prototype.recurrent.weights_init = Orthogonal()
    # With a depth of 1 the igru itself is configured; otherwise each of its
    # transitions is.
    if config['trg_igru_depth'] == 1:
        decoder.interpolator.igru.weights_init = Orthogonal()
    else:
        for layer_n in range(config['trg_igru_depth']):
            decoder.interpolator.igru.transitions[
                layer_n].weights_init = Orthogonal()
    for layer_n in range(config['trg_dgru_depth']):
        decoder.interpolator.feedback_brick.dgru.transitions[
            layer_n].weights_init = Orthogonal()
    for layer_n in range(config['transition_depth']):
        decoder.transition.transitions[layer_n].weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(str(shape), count))
    # len(shapes) is the number of parameter tensors, not of scalar weights.
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)
    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # NOTE(review): ``eval`` executes whatever expression is stored in
    # config['step_rule']; the config must come from a trusted source.
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])()
                                ]))

    # Set extensions
    logger.info("Initializing extensions")
    # Extensions
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    train_monitor = CostCurve([cost, gradient_norm, step_norm],
                              config=config,
                              after_batch=True,
                              before_first_epoch=True,
                              prefix='tra')
    extensions = [
        train_monitor,
        Timing(),
        Printing(every_n_batches=config['print_freq']),
        FinishAfter(after_n_batches=config['finish_after']),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    # ``search_model`` and ``samples`` are only bound inside this branch; the
    # two branches below that use them are guarded by conditions that imply
    # this one.
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        generated = decoder.generate(representation, source_word_mask)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[config['transition_depth']])
            )  # generated[transition_depth] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model,
                    data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    transition_depth=config['transition_depth'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(source_char_seq,
                          source_sample_matrix,
                          source_char_aux,
                          source_word_mask,
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
コード例 #14
0
ファイル: agent_lu_rl.py プロジェクト: zxsted/KB-InfoBot
    def _init_model(self, in_size, out_size, slot_sizes, db, \
            n_hid=10, learning_rate_sl=0.005, learning_rate_rl=0.005, batch_size=32, ment=0.1, \
            inputtype='full', sl='e2e', rl='e2e'):
        self.in_size = in_size
        self.out_size = out_size
        self.slot_sizes = slot_sizes
        self.batch_size = batch_size
        self.learning_rate = learning_rate_rl
        self.n_hid = n_hid
        self.r_hid = self.n_hid
        self.sl = sl
        self.rl = rl

        table = db.table
        counts = db.counts
        m_unk = [db.inv_counts[s][-1] for s in dialog_config.inform_slots]
        prior = [db.priors[s] for s in dialog_config.inform_slots]
        unknown = [db.unks[s] for s in dialog_config.inform_slots]
        ids = [db.ids[s] for s in dialog_config.inform_slots]

        input_var, turn_mask, act_mask, reward_var = T.ftensor3('in'), T.bmatrix('tm'), \
                T.btensor3('am'), T.fvector('r')
        T_var, N_var = T.as_tensor_variable(table), T.as_tensor_variable(
            counts)
        db_index_var = T.imatrix('db')
        db_index_switch = T.bvector('s')

        l_mask_in = L.InputLayer(shape=(None, None), input_var=turn_mask)
        flat_mask = T.reshape(turn_mask,
                              (turn_mask.shape[0] * turn_mask.shape[1], 1))

        def _smooth(p):
            p_n = p + EPS
            return p_n / (p_n.sum(axis=1)[:, np.newaxis])

        def _add_unk(p, m, N):
            # p: B x V, m- num missing, N- total, p0: 1 x V
            t_unk = T.as_tensor_variable(float(m) / N)
            ps = p * (1. - t_unk)
            return T.concatenate([ps, T.tile(t_unk, (ps.shape[0], 1))], axis=1)

        def kl_divergence(p, q):
            p_n = _smooth(p)
            return -T.sum(q * T.log(p_n), axis=1)

        # belief tracking
        l_in = L.InputLayer(shape=(None, None, self.in_size),
                            input_var=input_var)
        p_vars = []
        pu_vars = []
        phi_vars = []
        p_targets = []
        phi_targets = []
        hid_in_vars = []
        hid_out_vars = []
        bt_loss = T.as_tensor_variable(0.)
        kl_loss = []
        x_loss = []
        self.trackers = []
        for i, s in enumerate(dialog_config.inform_slots):
            hid_in = T.fmatrix('h')
            l_rnn = L.GRULayer(l_in, self.r_hid, hid_init=hid_in,  \
                    mask_input=l_mask_in,
                    grad_clipping=10.) # B x H x D
            l_b_in = L.ReshapeLayer(l_rnn,
                                    (input_var.shape[0] * input_var.shape[1],
                                     self.r_hid))  # BH x D
            hid_out = L.get_output(l_rnn)[:, -1, :]

            p_targ = T.ftensor3('p_target_' + s)
            p_t = T.reshape(
                p_targ,
                (p_targ.shape[0] * p_targ.shape[1], self.slot_sizes[i]))
            phi_targ = T.fmatrix('phi_target' + s)
            phi_t = T.reshape(phi_targ,
                              (phi_targ.shape[0] * phi_targ.shape[1], 1))

            l_b = L.DenseLayer(l_b_in,
                               self.slot_sizes[i],
                               nonlinearity=lasagne.nonlinearities.softmax)
            l_phi = L.DenseLayer(l_b_in,
                                 1,
                                 nonlinearity=lasagne.nonlinearities.sigmoid)

            phi = T.clip(L.get_output(l_phi), 0.01, 0.99)
            p = L.get_output(l_b)
            p_u = _add_unk(p, m_unk[i], db.N)
            kl_loss.append(
                T.sum(flat_mask.flatten() * kl_divergence(p, p_t)) /
                T.sum(flat_mask))
            x_loss.append(
                T.sum(flat_mask *
                      lasagne.objectives.binary_crossentropy(phi, phi_t)) /
                T.sum(flat_mask))
            bt_loss += kl_loss[-1] + x_loss[-1]

            p_vars.append(p)
            pu_vars.append(p_u)
            phi_vars.append(phi)
            p_targets.append(p_targ)
            phi_targets.append(phi_targ)
            hid_in_vars.append(hid_in)
            hid_out_vars.append(hid_out)
            self.trackers.append(l_b)
            self.trackers.append(l_phi)
        self.bt_params = L.get_all_params(self.trackers)

        def check_db(pv, phi, Tb, N):
            O = T.alloc(0., pv[0].shape[0], Tb.shape[0])  # BH x T.shape[0]
            for i, p in enumerate(pv):
                p_dc = T.tile(phi[i], (1, Tb.shape[0]))
                O += T.log(p_dc*(1./db.table.shape[0]) + \
                        (1.-p_dc)*(p[:,Tb[:,i]]/N[np.newaxis,:,i]))
            Op = T.exp(O)  #+EPS # BH x T.shape[0]
            Os = T.sum(Op, axis=1)[:, np.newaxis]  # BH x 1
            return Op / Os

        def entropy(p):
            p = _smooth(p)
            return -T.sum(p * T.log(p), axis=-1)

        def weighted_entropy(p, q, p0, unks, idd):
            w = T.dot(idd, q.transpose())  # Pi x BH
            u = p0[np.newaxis, :] * (q[:, unks].sum(axis=1)[:, np.newaxis]
                                     )  # BH x Pi
            p_tilde = w.transpose() + u
            return entropy(p_tilde)

        p_db = check_db(pu_vars, phi_vars, T_var, N_var)  # BH x T.shape[0]

        if inputtype == 'entropy':
            H_vars = [weighted_entropy(pv,p_db,prior[i],unknown[i],ids[i]) \
                    for i,pv in enumerate(p_vars)]
            H_db = entropy(p_db)
            phv = [ph[:, 0] for ph in phi_vars]
            t_in = T.stacklists(H_vars + phv + [H_db]).transpose()  # BH x 2M+1
            t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1], \
                    t_in.shape[1])) # B x H x 2M+1
            l_in_pol = L.InputLayer(
                    shape=(None,None,2*len(dialog_config.inform_slots)+1), \
                    input_var=t_in_resh)
        else:
            in_reshaped = T.reshape(input_var,
                    (input_var.shape[0]*input_var.shape[1], \
                    input_var.shape[2]))
            prev_act = in_reshaped[:, -len(dialog_config.inform_slots):]
            t_in = T.concatenate(pu_vars + phi_vars + [p_db, prev_act],
                                 axis=1)  # BH x D-sum+A
            t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1], \
                    t_in.shape[1])) # B x H x D-sum
            l_in_pol = L.InputLayer(shape=(None,None,sum(self.slot_sizes)+ \
                    3*len(dialog_config.inform_slots)+ \
                    table.shape[0]), input_var=t_in_resh)

        pol_in = T.fmatrix('pol-h')
        l_pol_rnn = L.GRULayer(l_in_pol,
                               n_hid,
                               hid_init=pol_in,
                               mask_input=l_mask_in,
                               grad_clipping=10.)  # B x H x D
        pol_out = L.get_output(l_pol_rnn)[:, -1, :]
        l_den_in = L.ReshapeLayer(
            l_pol_rnn,
            (turn_mask.shape[0] * turn_mask.shape[1], n_hid))  # BH x D
        l_out = L.DenseLayer(l_den_in, self.out_size, \
                nonlinearity=lasagne.nonlinearities.softmax) # BH x A

        self.network = l_out
        self.pol_params = L.get_all_params(self.network)
        self.params = self.bt_params + self.pol_params

        # db loss
        p_db_reshaped = T.reshape(
            p_db, (turn_mask.shape[0], turn_mask.shape[1], table.shape[0]))
        p_db_final = p_db_reshaped[:, -1, :]  # B x T.shape[0]
        p_db_final = _smooth(p_db_final)
        ix = T.tile(T.arange(p_db_final.shape[0]),
                    (db_index_var.shape[1], 1)).transpose()
        sample_probs = p_db_final[ix, db_index_var]  # B x K
        if dialog_config.SUCCESS_MAX_RANK == 1:
            log_db_probs = T.log(sample_probs).sum(axis=1)
        else:
            cum_probs,_ = theano.scan(fn=lambda x, prev: x+prev, \
                    outputs_info=T.zeros_like(sample_probs[:,0]), \
                    sequences=sample_probs[:,:-1].transpose())
            cum_probs = T.clip(cum_probs.transpose(), 0., 1. - 1e-5)  # B x K-1
            log_db_probs = T.log(sample_probs).sum(
                axis=1) - T.log(1. - cum_probs).sum(axis=1)  # B
        log_db_probs = log_db_probs * db_index_switch

        # rl
        probs = L.get_output(self.network)  # BH x A
        probs = _smooth(probs)
        out_probs = T.reshape(probs, (turn_mask.shape[0], turn_mask.shape[1],
                                      self.out_size))  # B x H x A
        log_probs = T.log(out_probs)
        act_probs = (log_probs * act_mask).sum(axis=2)  # B x H
        ep_probs = (act_probs * turn_mask).sum(axis=1)  # B
        H_probs = -T.sum(T.sum(out_probs * log_probs, axis=2), axis=1)  # B
        self.act_loss = -T.mean(ep_probs * reward_var)
        self.db_loss = -T.mean(log_db_probs * reward_var)
        self.reg_loss = -T.mean(ment * H_probs)
        self.loss = self.act_loss + self.db_loss + self.reg_loss

        self.inps = [input_var, turn_mask, act_mask, reward_var, db_index_var, db_index_switch, \
                pol_in] + hid_in_vars
        self.obj_fn = theano.function(self.inps,
                                      self.loss,
                                      on_unused_input='warn')
        self.act_fn = theano.function([input_var,turn_mask,pol_in]+hid_in_vars, \
                [out_probs,p_db,pol_out]+pu_vars+phi_vars+hid_out_vars, on_unused_input='warn')
        self.debug_fn = theano.function(self.inps, [probs, p_db, self.loss],
                                        on_unused_input='warn')
        self._rl_train_fn(self.learning_rate)

        ## sl
        sl_loss = 0. + bt_loss - T.mean(ep_probs)

        if self.sl == 'e2e':
            sl_updates = lasagne.updates.rmsprop(sl_loss, self.params, \
                    learning_rate=learning_rate_sl, epsilon=1e-4)
            sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates)
        elif self.sl == 'bel':
            sl_updates = lasagne.updates.rmsprop(sl_loss, self.bt_params, \
                    learning_rate=learning_rate_sl, epsilon=1e-4)
            sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates)
        else:
            sl_updates = lasagne.updates.rmsprop(sl_loss, self.pol_params, \
                    learning_rate=learning_rate_sl, epsilon=1e-4)
            sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates)

        sl_inps = [input_var, turn_mask, act_mask, pol_in
                   ] + p_targets + phi_targets + hid_in_vars
        self.sl_train_fn = theano.function(sl_inps, [sl_loss]+kl_loss+x_loss, updates=sl_updates, \
                on_unused_input='warn')
        self.sl_obj_fn = theano.function(sl_inps,
                                         sl_loss,
                                         on_unused_input='warn')
コード例 #15
0
    def setup_train(self):
        """Build the training graph and compile the Theano train/eval functions.

        The loss graph is built twice by ``_build``: once with
        ``deterministic_dropout=False`` (used by ``self.update_fun``, which
        also applies the Adam updates) and once with
        ``deterministic_dropout=True`` (used by ``self.eval_fun``, which
        applies no updates).

        Sets:
            self.loss_info_keys: keys of the info dict returned by
                ``Encoding.compute_loss``, in the order their values are
                appended to the function outputs.
            self.update_fun: compiled function returning
                ``[train_loss] + info values`` and applying the updates.
            self.eval_fun: compiled function returning
                ``[eval_loss] + info values``.
        """
        # dimensions: (batch, time, 12)
        chord_types = T.btensor3()
        # dimensions: (batch, time)
        chord_roots = T.imatrix()
        # dimensions: (batch, time), one variable per encoding
        relative_posns = [T.imatrix() for _ in self.encodings]
        # dimensions: (batch, time, output_data), one variable per encoding
        encoded_melodies = [T.btensor3() for _ in self.encodings]
        # dimensions: (batch, time)
        correct_notes = T.imatrix()

        n_batch, n_time = chord_roots.shape

        def _build(det_dropout):
            # The same (batch, time) grid of timestep indices feeds every stack.
            timesteps = T.tile(T.arange(n_time), (n_batch, 1))

            all_out_probs = []
            for encoding, lstmstack, encoded_melody, relative_pos in zip(
                    self.encodings, self.lstmstacks, encoded_melodies,
                    relative_posns):
                # Input at step t is the encoded output of step t-1; the first
                # step uses the encoding's initial form tiled over the batch.
                shifted_melody = T.concatenate(
                    [T.tile(encoding.initial_encoded_form(), (n_batch, 1, 1)),
                     encoded_melody[:, :-1, :]], 1)
                activations = lstmstack.do_preprocess_scan(
                    timestep=timesteps,
                    relative_position=relative_pos,
                    cur_chord_type=chord_types,
                    cur_chord_root=chord_roots,
                    last_output=shifted_melody,
                    deterministic_dropout=det_dropout)
                all_out_probs.append(
                    encoding.decode_to_probs(activations, relative_pos,
                                             self.bounds.lowbound,
                                             self.bounds.highbound))

            # Combine the per-encoding distributions by elementwise product.
            reduced_out_probs = functools.reduce((lambda x, y: x * y),
                                                 all_out_probs)
            if self.normalize_artic_only:
                # Keep the first two entries of the last axis as they are and
                # rescale the remaining entries to sum to the leftover mass.
                non_artic_probs = reduced_out_probs[:, :, :2]
                artic_probs = reduced_out_probs[:, :, 2:]
                non_artic_sum = T.sum(non_artic_probs, 2, keepdims=True)
                artic_sum = T.sum(artic_probs, 2, keepdims=True)
                # Clamp the denominator (as the other branch does) so that an
                # all-zero product does not yield a division by zero.
                artic_sum = T.maximum(artic_sum, constants.EPSILON)
                norm_artic_probs = artic_probs * (1 - non_artic_sum) / artic_sum
                norm_out_probs = T.concatenate(
                    [non_artic_probs, norm_artic_probs], 2)
            else:
                normsum = T.sum(reduced_out_probs, 2, keepdims=True)
                normsum = T.maximum(normsum, constants.EPSILON)
                norm_out_probs = reduced_out_probs / normsum
            return Encoding.compute_loss(norm_out_probs, correct_notes, True)

        def _mode():
            # A fresh NanGuardMode per compiled function, or the default mode.
            if self.nanguard:
                return NanGuardMode(nan_is_error=True, inf_is_error=True,
                                    big_is_error=True)
            return None

        train_loss, train_info = _build(False)
        updates = Adam(train_loss, self.get_optimize_params(),
                       lr=self.learning_rate_var)

        eval_loss, eval_info = _build(True)

        self.loss_info_keys = list(train_info.keys())

        fun_inputs = ([chord_types, chord_roots, correct_notes]
                      + relative_posns + encoded_melodies)

        self.update_fun = theano.function(
            inputs=fun_inputs,
            outputs=[train_loss] + list(train_info.values()),
            updates=updates,
            allow_input_downcast=True,
            on_unused_input='ignore',
            mode=_mode())

        self.eval_fun = theano.function(
            inputs=fun_inputs,
            outputs=[eval_loss] + list(eval_info.values()),
            allow_input_downcast=True,
            on_unused_input='ignore',
            mode=_mode())
コード例 #16
0
ファイル: agent_lu_rl.py プロジェクト: SoluMilken/KB-InfoBot
    def _init_model(self, in_size, out_size, slot_sizes, db,
                    n_hid=10, learning_rate_sl=0.005, learning_rate_rl=0.005,
                    batch_size=32, ment=0.1, inputtype='full', sl='e2e',
                    rl='e2e'):
        """Build the belief-tracker and policy graphs and compile the functions.

        One GRU belief tracker is created per slot in
        ``dialog_config.inform_slots``; their outputs are combined with the
        database table into a distribution over table rows, which (in one of
        two encodings, see ``inputtype``) feeds a GRU policy network with a
        softmax output layer.

        Args:
            in_size: size of the last axis of the input tensor.
            out_size: number of units of the policy softmax layer.
            slot_sizes: number of softmax units of each slot's tracker,
                indexed in the order of ``dialog_config.inform_slots``.
            db: database object; this method reads ``table``, ``counts``,
                ``inv_counts``, ``priors``, ``unks``, ``ids`` and ``N``.
            n_hid: hidden units of the policy GRU and of every tracker GRU.
            learning_rate_sl: rmsprop learning rate of the supervised updates.
            learning_rate_rl: stored as ``self.learning_rate`` and passed to
                ``self._rl_train_fn``.
            batch_size: stored on ``self``; not otherwise used here.
            ment: weight of the policy-entropy term in ``self.reg_loss``.
            inputtype: ``'entropy'`` feeds the policy with per-slot weighted
                entropies, the ``phi`` values and the entropy of the row
                distribution; any other value feeds it the concatenated
                beliefs, ``phi`` values, row distribution and previous action.
            sl: which parameters the supervised updates train: ``'e2e'`` all,
                ``'bel'`` the trackers only, anything else the policy only.
            rl: stored on ``self``; not read in this method.

        Sets ``self.trackers``, ``self.bt_params``, ``self.network``,
        ``self.pol_params``, ``self.params``, the loss expressions
        ``self.act_loss``/``self.db_loss``/``self.reg_loss``/``self.loss``,
        ``self.inps`` and the compiled functions ``self.obj_fn``,
        ``self.act_fn``, ``self.debug_fn``, ``self.sl_train_fn`` and
        ``self.sl_obj_fn``.
        """
        self.in_size = in_size
        self.out_size = out_size
        self.slot_sizes = slot_sizes
        self.batch_size = batch_size
        self.learning_rate = learning_rate_rl
        self.n_hid = n_hid
        self.r_hid = self.n_hid
        self.sl = sl
        self.rl = rl

        # Per-slot database statistics, in inform_slots order.
        table = db.table
        counts = db.counts
        m_unk = [db.inv_counts[s][-1] for s in dialog_config.inform_slots]
        prior = [db.priors[s] for s in dialog_config.inform_slots]
        unknown = [db.unks[s] for s in dialog_config.inform_slots]
        ids = [db.ids[s] for s in dialog_config.inform_slots]

        # Symbolic inputs.
        input_var = T.ftensor3('in')
        turn_mask = T.bmatrix('tm')
        act_mask = T.btensor3('am')
        reward_var = T.fvector('r')
        T_var = T.as_tensor_variable(table)
        N_var = T.as_tensor_variable(counts)
        db_index_var = T.imatrix('db')
        db_index_switch = T.bvector('s')

        l_mask_in = L.InputLayer(shape=(None, None), input_var=turn_mask)
        # Turn mask flattened to a (B*H) x 1 column.
        flat_mask = T.reshape(turn_mask,
                              (turn_mask.shape[0] * turn_mask.shape[1], 1))

        def _smooth(p):
            # Add EPS to every entry and renormalise along axis 1.
            p_n = p + EPS
            return p_n / (p_n.sum(axis=1)[:, np.newaxis])

        def _add_unk(p, m, N):
            # p: B x V, m- num missing, N- total, p0: 1 x V
            # Scales p by (1 - m/N) and appends a constant column m/N.
            t_unk = T.as_tensor_variable(float(m) / N)
            ps = p * (1. - t_unk)
            return T.concatenate([ps, T.tile(t_unk, (ps.shape[0], 1))], axis=1)

        def kl_divergence(p, q):
            # Computes -sum(q * log(smooth(p))) per row; the entropy of q is
            # not subtracted.
            p_n = _smooth(p)
            return -T.sum(q * T.log(p_n), axis=1)

        # belief tracking
        l_in = L.InputLayer(shape=(None, None, self.in_size),
                            input_var=input_var)
        p_vars = []
        pu_vars = []
        phi_vars = []
        p_targets = []
        phi_targets = []
        hid_in_vars = []
        hid_out_vars = []
        bt_loss = T.as_tensor_variable(0.)
        kl_loss = []
        x_loss = []
        self.trackers = []
        for i, s in enumerate(dialog_config.inform_slots):
            hid_in = T.fmatrix('h')
            l_rnn = L.GRULayer(l_in,
                               self.r_hid,
                               hid_init=hid_in,
                               mask_input=l_mask_in,
                               grad_clipping=10.)  # B x H x D
            l_b_in = L.ReshapeLayer(
                l_rnn,
                (input_var.shape[0] * input_var.shape[1], self.r_hid))  # BH x D
            # Hidden state at the last time step, returned by act_fn.
            hid_out = L.get_output(l_rnn)[:, -1, :]

            # Supervised targets for this slot, flattened to BH rows.
            p_targ = T.ftensor3('p_target_' + s)
            p_t = T.reshape(
                p_targ,
                (p_targ.shape[0] * p_targ.shape[1], self.slot_sizes[i]))
            phi_targ = T.fmatrix('phi_target' + s)
            phi_t = T.reshape(phi_targ,
                              (phi_targ.shape[0] * phi_targ.shape[1], 1))

            l_b = L.DenseLayer(l_b_in,
                               self.slot_sizes[i],
                               nonlinearity=lasagne.nonlinearities.softmax)
            l_phi = L.DenseLayer(l_b_in,
                                 1,
                                 nonlinearity=lasagne.nonlinearities.sigmoid)

            # Keep phi away from 0 and 1 before it is used in log terms.
            phi = T.clip(L.get_output(l_phi), 0.01, 0.99)
            p = L.get_output(l_b)
            p_u = _add_unk(p, m_unk[i], db.N)
            # Both losses are averaged over the unmasked turns.
            kl_loss.append(
                T.sum(flat_mask.flatten() * kl_divergence(p, p_t)) /
                T.sum(flat_mask))
            x_loss.append(
                T.sum(flat_mask *
                      lasagne.objectives.binary_crossentropy(phi, phi_t)) /
                T.sum(flat_mask))
            bt_loss += kl_loss[-1] + x_loss[-1]

            p_vars.append(p)
            pu_vars.append(p_u)
            phi_vars.append(phi)
            p_targets.append(p_targ)
            phi_targets.append(phi_targ)
            hid_in_vars.append(hid_in)
            hid_out_vars.append(hid_out)
            self.trackers.append(l_b)
            self.trackers.append(l_phi)
        self.bt_params = L.get_all_params(self.trackers)

        def check_db(pv, phi, Tb, N):
            # Accumulates, in log space, one factor per slot for every table
            # row and returns the row-normalised result.
            O = T.alloc(0., pv[0].shape[0], Tb.shape[0])  # BH x T.shape[0]
            for i, p in enumerate(pv):
                p_dc = T.tile(phi[i], (1, Tb.shape[0]))
                O += T.log(p_dc * (1. / db.table.shape[0]) +
                           (1. - p_dc) * (p[:, Tb[:, i]] / N[np.newaxis, :, i]))
            Op = T.exp(O)  # BH x T.shape[0]
            Os = T.sum(Op, axis=1)[:, np.newaxis]  # BH x 1
            return Op / Os

        def entropy(p):
            p = _smooth(p)
            return -T.sum(p * T.log(p), axis=-1)

        def weighted_entropy(p, q, p0, unks, idd):
            # NOTE: the first argument p is accepted but not used.
            w = T.dot(idd, q.transpose())  # Pi x BH
            u = p0[np.newaxis, :] * (
                q[:, unks].sum(axis=1)[:, np.newaxis])  # BH x Pi
            p_tilde = w.transpose() + u
            return entropy(p_tilde)

        p_db = check_db(pu_vars, phi_vars, T_var, N_var)  # BH x T.shape[0]

        if inputtype == 'entropy':
            H_vars = [weighted_entropy(pv, p_db, prior[i], unknown[i], ids[i])
                      for i, pv in enumerate(p_vars)]
            H_db = entropy(p_db)
            phv = [ph[:, 0] for ph in phi_vars]
            t_in = T.stacklists(H_vars + phv + [H_db]).transpose()  # BH x 2M+1
            t_in_resh = T.reshape(
                t_in, (turn_mask.shape[0], turn_mask.shape[1],
                       t_in.shape[1]))  # B x H x 2M+1
            l_in_pol = L.InputLayer(
                shape=(None, None, 2 * len(dialog_config.inform_slots) + 1),
                input_var=t_in_resh)
        else:
            in_reshaped = T.reshape(
                input_var,
                (input_var.shape[0] * input_var.shape[1], input_var.shape[2]))
            # The last len(inform_slots) input features are used as prev_act.
            prev_act = in_reshaped[:, -len(dialog_config.inform_slots):]
            t_in = T.concatenate(pu_vars + phi_vars + [p_db, prev_act],
                                 axis=1)  # BH x D-sum+A
            t_in_resh = T.reshape(
                t_in, (turn_mask.shape[0], turn_mask.shape[1],
                       t_in.shape[1]))  # B x H x D-sum
            l_in_pol = L.InputLayer(
                shape=(None, None,
                       sum(self.slot_sizes) +
                       3 * len(dialog_config.inform_slots) +
                       table.shape[0]),
                input_var=t_in_resh)

        pol_in = T.fmatrix('pol-h')
        l_pol_rnn = L.GRULayer(l_in_pol,
                               n_hid,
                               hid_init=pol_in,
                               mask_input=l_mask_in,
                               grad_clipping=10.)  # B x H x D
        pol_out = L.get_output(l_pol_rnn)[:, -1, :]
        l_den_in = L.ReshapeLayer(
            l_pol_rnn,
            (turn_mask.shape[0] * turn_mask.shape[1], n_hid))  # BH x D
        l_out = L.DenseLayer(
            l_den_in,
            self.out_size,
            nonlinearity=lasagne.nonlinearities.softmax)  # BH x A

        self.network = l_out
        self.pol_params = L.get_all_params(self.network)
        self.params = self.bt_params + self.pol_params

        # db loss
        p_db_reshaped = T.reshape(
            p_db, (turn_mask.shape[0], turn_mask.shape[1], table.shape[0]))
        p_db_final = p_db_reshaped[:, -1, :]  # B x T.shape[0]
        p_db_final = _smooth(p_db_final)
        # Row indices tiled so that p_db_final[ix, db_index_var] picks, for
        # each batch row, the probabilities of its listed table rows.
        ix = T.tile(T.arange(p_db_final.shape[0]),
                    (db_index_var.shape[1], 1)).transpose()
        sample_probs = p_db_final[ix, db_index_var]  # B x K
        if dialog_config.SUCCESS_MAX_RANK == 1:
            log_db_probs = T.log(sample_probs).sum(axis=1)
        else:
            # Running sum of the probabilities of the earlier picks, used to
            # renormalise each later pick.
            cum_probs, _ = theano.scan(
                fn=lambda x, prev: x + prev,
                outputs_info=T.zeros_like(sample_probs[:, 0]),
                sequences=sample_probs[:, :-1].transpose())
            cum_probs = T.clip(cum_probs.transpose(), 0., 1. - 1e-5)  # B x K-1
            log_db_probs = T.log(sample_probs).sum(
                axis=1) - T.log(1. - cum_probs).sum(axis=1)  # B
        log_db_probs = log_db_probs * db_index_switch

        # rl
        probs = L.get_output(self.network)  # BH x A
        probs = _smooth(probs)
        out_probs = T.reshape(probs, (turn_mask.shape[0], turn_mask.shape[1],
                                      self.out_size))  # B x H x A
        log_probs = T.log(out_probs)
        act_probs = (log_probs * act_mask).sum(axis=2)  # B x H
        ep_probs = (act_probs * turn_mask).sum(axis=1)  # B
        H_probs = -T.sum(T.sum(out_probs * log_probs, axis=2), axis=1)  # B
        self.act_loss = -T.mean(ep_probs * reward_var)
        self.db_loss = -T.mean(log_db_probs * reward_var)
        self.reg_loss = -T.mean(ment * H_probs)
        self.loss = self.act_loss + self.db_loss + self.reg_loss

        self.inps = [input_var, turn_mask, act_mask, reward_var, db_index_var,
                     db_index_switch, pol_in] + hid_in_vars
        self.obj_fn = theano.function(self.inps,
                                      self.loss,
                                      on_unused_input='warn')
        self.act_fn = theano.function(
            [input_var, turn_mask, pol_in] + hid_in_vars,
            [out_probs, p_db, pol_out] + pu_vars + phi_vars + hid_out_vars,
            on_unused_input='warn')
        self.debug_fn = theano.function(self.inps, [probs, p_db, self.loss],
                                        on_unused_input='warn')
        self._rl_train_fn(self.learning_rate)

        ## sl
        sl_loss = 0. + bt_loss - T.mean(ep_probs)

        # Choose which parameters the supervised step trains.
        if self.sl == 'e2e':
            sl_params = self.params
        elif self.sl == 'bel':
            sl_params = self.bt_params
        else:
            sl_params = self.pol_params
        # Plain rmsprop updates. A momentum-wrapped copy used to be built here
        # as well but was never passed to any compiled function, so it is no
        # longer created.
        sl_updates = lasagne.updates.rmsprop(sl_loss,
                                             sl_params,
                                             learning_rate=learning_rate_sl,
                                             epsilon=1e-4)

        sl_inps = [input_var, turn_mask, act_mask, pol_in
                   ] + p_targets + phi_targets + hid_in_vars
        self.sl_train_fn = theano.function(sl_inps,
                                           [sl_loss] + kl_loss + x_loss,
                                           updates=sl_updates,
                                           on_unused_input='warn')
        self.sl_obj_fn = theano.function(sl_inps,
                                         sl_loss,
                                         on_unused_input='warn')
コード例 #17
0
ファイル: note.py プロジェクト: mihaha/lasagne_seq2seq
def build_network():
    """Build the LSTM encoder/decoder and compile its Theano functions.

    The encoder LSTM reads ``x`` and returns only its final state
    (``only_return_final=True``). That state is passed through
    ``Repeat(..., SEQ_LENGTH)`` and concatenated on axis 2 with the decoder
    input ``y``; a decoder LSTM followed by a softmax layer with
    ``vocab_size`` units produces the output.

    The decoder input is part of the output graph, so every compiled
    function takes ``y`` as an explicit input. Previously ``predict`` and
    ``compute_cost`` omitted it, which makes ``theano.function`` fail with a
    missing-input error at compile time.

    Returns:
        A tuple ``(train, predict, compute_cost)`` of compiled functions:
        ``train(x, y, target)`` applies the adagrad updates and returns the
        cost, ``predict(x, y)`` returns the network output, and
        ``compute_cost(x, y, target)`` returns the cost without updating.
    """
    from lasagne.layers import InputLayer, LSTMLayer, ConcatLayer, ReshapeLayer, DenseLayer, get_output, get_all_params
    from lasagne.objectives import categorical_crossentropy
    print("Building network ...")

    # inputs ###############################################
    l_in_x = InputLayer(shape=(BATCH_SIZE, None, vocab_size))
    l_in_y = InputLayer(shape=(BATCH_SIZE, None, vocab_size))

    # encoder ###############################################
    l_enc = LSTMLayer(
        l_in_x, N_HIDDEN, grad_clipping=GRAD_CLIP,
        nonlinearity=lasagne.nonlinearities.tanh,
        only_return_final=True)

    # decoder ###############################################
    l_repeated_enc = Repeat(l_enc, SEQ_LENGTH)
    l_conc = ConcatLayer([l_in_y, l_repeated_enc], axis=2)

    l_dec = LSTMLayer(
        l_conc, N_HIDDEN, grad_clipping=GRAD_CLIP,
        nonlinearity=lasagne.nonlinearities.tanh)

    # output ###############################################
    # Flatten to 2-D for the dense softmax, then restore the batch axis.
    l_dec_long = ReshapeLayer(l_dec, shape=(-1, N_HIDDEN))

    l_dist = DenseLayer(
        l_dec_long,
        num_units=vocab_size,
        nonlinearity=lasagne.nonlinearities.softmax)

    l_out = ReshapeLayer(l_dist, shape=(BATCH_SIZE, -1, vocab_size))

    # compilations ###############################################
    target_values = T.btensor3('target_output')
    network_output = get_output(l_out)
    cost = categorical_crossentropy(network_output, target_values).mean()

    all_params = get_all_params(l_out, trainable=True)
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Both symbolic network inputs; the output depends on each of them.
    x_var = l_in_x.input_var
    y_var = l_in_y.input_var

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function(
        inputs=[x_var, y_var, target_values],
        outputs=cost,
        updates=updates,
        allow_input_downcast=True)

    compute_cost = theano.function(
        inputs=[x_var, y_var, target_values],
        outputs=cost,
        allow_input_downcast=True)

    predict = theano.function(
        inputs=[x_var, y_var],
        outputs=network_output,
        allow_input_downcast=True)

    return train, predict, compute_cost
コード例 #18
0
    def setup_train(self):
        """Build the autoencoder training graph and compile the Theano functions.

        The graph is built twice by ``_build``: once with
        ``deterministic_dropout=False`` for ``self.update_fun`` (which applies
        the Adam updates plus any updates produced while building the loss)
        and once with ``deterministic_dropout=True`` for ``self.eval_fun``
        (which applies no updates).

        ``self.loss_mode`` selects how the queue loss and the reconstruction
        loss are combined: ``"add"``, ``"priority"``, ``"cutoff"`` or
        ``"trigger"``.

        Sets:
            self.loss_info_keys: keys of the info dict, in the order their
                values are appended to the function outputs.
            self.update_fun: returns ``[train_loss] + info values`` and
                applies the updates.
            self.eval_fun: returns ``[eval_loss] + info values``.

        Raises:
            ValueError: if ``self.loss_mode`` is not one of the four modes.
        """
        # dimensions: (batch, time, 12)
        chord_types = T.btensor3()
        # dimensions: (batch, time)
        chord_roots = T.imatrix()
        # dimensions: (batch, time), one variable per encoding
        relative_posns = [T.imatrix() for _ in self.encodings]
        # dimensions: (batch, time, output_data), one variable per encoding
        encoded_melodies = [T.btensor3() for _ in self.encodings]
        # dimensions: (batch, time)
        correct_notes = T.imatrix()
        n_batch, n_time = chord_roots.shape

        def _build(det_dropout):
            # The same (batch, time) grid of timestep indices feeds every stack.
            timesteps = T.tile(T.arange(n_time), (n_batch, 1))

            # Encoder: sum the activations of all encoder stacks.
            all_activations = []
            for enc_lstmstack, encoded_melody, relative_pos in zip(
                    self.enc_lstmstacks, encoded_melodies, relative_posns):
                activations = enc_lstmstack.do_preprocess_scan(
                    timestep=timesteps,
                    relative_position=relative_pos,
                    cur_chord_type=chord_types,
                    cur_chord_root=chord_roots,
                    cur_input=encoded_melody,
                    deterministic_dropout=det_dropout)
                all_activations.append(activations)
            reduced_activations = functools.reduce((lambda x, y: x + y),
                                                   all_activations)
            queue_loss, feat_strengths, feat_vects, queue_info = self.qman.process(
                reduced_activations, extra_info=True)
            features = QueueManager.queue_transform(feat_strengths, feat_vects)

            # Decoder: multiply the output distributions of all decoder stacks.
            all_out_probs = []
            for encoding, dec_lstmstack, encoded_melody, relative_pos in zip(
                    self.encodings, self.dec_lstmstacks, encoded_melodies,
                    relative_posns):
                # Input at step t is the encoded output of step t-1; the first
                # step uses the encoding's initial form tiled over the batch.
                shifted_melody = T.concatenate(
                    [T.tile(encoding.initial_encoded_form(), (n_batch, 1, 1)),
                     encoded_melody[:, :-1, :]], 1)
                activations = dec_lstmstack.do_preprocess_scan(
                    timestep=timesteps,
                    relative_position=relative_pos,
                    cur_chord_type=chord_types,
                    cur_chord_root=chord_roots,
                    cur_feature=features,
                    last_output=shifted_melody,
                    deterministic_dropout=det_dropout)
                out_probs = encoding.decode_to_probs(activations, relative_pos,
                                                     self.bounds.lowbound,
                                                     self.bounds.highbound)
                all_out_probs.append(out_probs)

            reduced_out_probs = functools.reduce((lambda x, y: x * y),
                                                 all_out_probs)
            normsum = T.sum(reduced_out_probs, 2, keepdims=True)
            # Clamp the denominator to avoid dividing by zero.
            normsum = T.maximum(normsum, constants.EPSILON)
            norm_out_probs = reduced_out_probs / normsum
            reconstruction_loss, reconstruction_info = Encoding.compute_loss(
                norm_out_probs, correct_notes, extra_info=True)

            queue_surrogate_loss_parts = self.qman.surrogate_loss(
                reconstruction_loss, queue_info)

            updates = []
            full_info = queue_info.copy()
            full_info.update(reconstruction_info)
            full_info["queue_loss"] = queue_loss
            full_info["reconstruction_loss"] = reconstruction_loss

            float_n_batch = T.cast(n_batch, 'float32')
            # Compare by value: identity comparison (``is``) against a string
            # literal only works when both strings happen to be interned.
            loss_mode = self.loss_mode
            if loss_mode == "add":
                full_loss = queue_loss + reconstruction_loss
            elif loss_mode == "priority":
                curviness = np.array(self.loss_mode_params[0],
                                     np.float32) * float_n_batch
                # ln( e^x + e^y - 1 )
                # ln( C(e^x + e^y - 1) ) - ln(C)
                # ln( e^c(e^x + e^y - 1) ) - c
                # ln( e^(x+c) + e^(y+c) - e^c ) - c
                # ln( e^(x-c) + e^(y-c) - e^(-c) ) + c
                # Now let c = maximum(x,y), d = minimum(x,y). WOLOG replace x=c, y=d
                # ln( e^(c-c) + e^(d-c) - e^(-c) ) + c
                # ln( 1 + e^(d-c) - e^(-c) ) + c
                x = reconstruction_loss / curviness
                y = queue_loss / curviness
                c = T.maximum(x, y)
                d = T.minimum(x, y)
                full_loss = (T.log(1 + T.exp(d - c) - T.exp(-c)) +
                             c) * curviness
            elif loss_mode == "cutoff":
                # The queue loss only counts while the reconstruction loss is
                # below the per-batch cutoff.
                cutoff_val = np.array(self.loss_mode_params[0], np.float32)
                full_loss = T.switch(
                    reconstruction_loss < cutoff_val * float_n_batch,
                    reconstruction_loss + queue_loss, reconstruction_loss)
            elif loss_mode == "trigger":
                trigger_val = np.array(self.loss_mode_params[0], np.float32)
                trigger_speed = np.array(1.0 / self.loss_mode_params[1],
                                         np.float32)
                # Shared state: a flag that is OR-ed with the threshold test on
                # every update, and a queue-loss weight that grows by
                # trigger_speed (capped at 1.0) while the flag is set.
                trigger_is_on = theano.shared(np.array(0, np.int8))
                trigger_scale = theano.shared(np.array(0.0, np.float32))
                full_loss = reconstruction_loss + trigger_scale * queue_loss
                updates.append(
                    (trigger_is_on,
                     T.or_(trigger_is_on,
                           reconstruction_loss < trigger_val * float_n_batch)))
                updates.append(
                    (trigger_scale,
                     T.switch(trigger_is_on,
                              T.minimum(trigger_scale + trigger_speed,
                                        np.array(1.0, np.float32)),
                              np.array(0.0, np.float32))))
                full_info["trigger_scale"] = trigger_scale
            else:
                # Previously an unknown mode left full_loss unbound and failed
                # later with an unrelated-looking error.
                raise ValueError(
                    "Unknown loss_mode: {!r}".format(self.loss_mode))

            if queue_surrogate_loss_parts is not None:
                surrogate_loss, addtl_updates = queue_surrogate_loss_parts
                full_loss = full_loss + surrogate_loss
                updates.extend(addtl_updates)
                full_info["surrogate_loss"] = surrogate_loss

            return full_loss, full_info, updates

        def _mode():
            # A fresh NanGuardMode per compiled function, or the default mode.
            if self.nanguard:
                return NanGuardMode(nan_is_error=True, inf_is_error=True,
                                    big_is_error=True)
            return None

        train_loss, train_info, train_updates = _build(False)
        if self.train_decoder_only:
            params = list(itertools.chain(
                *(lstmstack.params for lstmstack in self.dec_lstmstacks)))
        else:
            params = self.params
        adam_updates = Adam(train_loss, params, lr=self.learning_rate_var)

        # Updates produced by the evaluation build are intentionally discarded.
        eval_loss, eval_info, _ = _build(True)

        self.loss_info_keys = list(train_info.keys())

        fun_inputs = ([chord_types, chord_roots, correct_notes]
                      + relative_posns + encoded_melodies)

        self.update_fun = theano.function(
            inputs=fun_inputs,
            outputs=[train_loss] + list(train_info.values()),
            updates=train_updates + adam_updates,
            allow_input_downcast=True,
            mode=_mode())

        self.eval_fun = theano.function(
            inputs=fun_inputs,
            outputs=[eval_loss] + list(eval_info.values()),
            allow_input_downcast=True,
            mode=_mode())
コード例 #19
0
    def setup_train(self):

        # dimensions: (batch, time, 12)
        chord_types = T.btensor3()
        # dimensions: (batch, time)
        chord_roots = T.imatrix()
        # dimensions: (batch, time)
        relative_posns = [T.imatrix() for _ in self.encodings]
        # dimesions: (batch, time, output_data)
        encoded_melodies = [T.btensor3() for _ in self.encodings]
        # dimesions: (batch, time)
        correct_notes = T.imatrix()
        n_batch, n_time = chord_roots.shape

        def _build(det_dropout):
            all_activations = []
            for encoding, enc_lstmstack, encoded_melody, relative_pos in zip(
                    self.encodings, self.enc_lstmstacks, encoded_melodies,
                    relative_posns):
                activations = enc_lstmstack.do_preprocess_scan(
                    timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                    relative_position=relative_pos,
                    cur_chord_type=chord_types,
                    cur_chord_root=chord_roots,
                    cur_input=encoded_melody,
                    deterministic_dropout=det_dropout)
                all_activations.append(activations)
            reduced_activations = functools.reduce((lambda x, y: x + y),
                                                   all_activations)
            queue_loss, feat_strengths, feat_vects, queue_info = self.qman.process(
                reduced_activations, extra_info=True)
            features = QueueManager.queue_transform(feat_strengths, feat_vects)

            all_out_probs = []
            for encoding, dec_lstmstack, encoded_melody, relative_pos in zip(
                    self.encodings, self.dec_lstmstacks, encoded_melodies,
                    relative_posns):
                activations = dec_lstmstack.do_preprocess_scan(
                    timestep=T.tile(T.arange(n_time), (n_batch, 1)),
                    relative_position=relative_pos,
                    cur_chord_type=chord_types,
                    cur_chord_root=chord_roots,
                    cur_feature=features,
                    last_output=T.concatenate([
                        T.tile(encoding.initial_encoded_form(),
                               (n_batch, 1, 1)), encoded_melody[:, :-1, :]
                    ], 1),
                    deterministic_dropout=det_dropout)
                out_probs = encoding.decode_to_probs(activations, relative_pos,
                                                     self.bounds.lowbound,
                                                     self.bounds.highbound)
                all_out_probs.append(out_probs)

            reduced_out_probs = functools.reduce((lambda x, y: x * y),
                                                 all_out_probs)
            normsum = T.sum(reduced_out_probs, 2, keepdims=True)
            normsum = T.maximum(normsum, constants.EPSILON)
            norm_out_probs = reduced_out_probs / normsum
            reconstruction_loss, reconstruction_info = Encoding.compute_loss(
                norm_out_probs, correct_notes, extra_info=True)

            queue_surrogate_loss_parts = self.qman.surrogate_loss(
                reconstruction_loss, queue_info)

            updates = []
            full_info = queue_info.copy()
            full_info.update(reconstruction_info)
            full_info["queue_loss"] = queue_loss
            full_info["reconstruction_loss"] = reconstruction_loss

            float_n_batch = T.cast(n_batch, 'float32')
            if self.loss_mode is "add":
                full_loss = queue_loss + reconstruction_loss
            elif self.loss_mode is "priority":
                curviness = np.array(self.loss_mode_params[0],
                                     np.float32) * float_n_batch
                # ln( e^x + e^y - 1 )
                # ln( C(e^x + e^y - 1) ) - ln(C)
                # ln( e^c(e^x + e^y - 1) ) - c
                # ln( e^(x+c) + e^(y+c) - e^c ) - c
                # ln( e^(x-c) + e^(y-c) - e^(-c) ) + c
                # Now let c = maximum(x,y), d = minimum(x,y). WOLOG replace x=c, y=d
                # ln( e^(c-c) + e^(d-c) - e^(-c) ) + c
                # ln( 1 + e^(d-c) - e^(-c) ) + c
                x = reconstruction_loss / curviness
                y = queue_loss / curviness
                c = T.maximum(x, y)
                d = T.minimum(x, y)
                full_loss = (T.log(1 + T.exp(d - c) - T.exp(-c)) +
                             c) * curviness
            elif self.loss_mode is "cutoff":
                cutoff_val = np.array(self.loss_mode_params[0], np.float32)
                full_loss = T.switch(
                    reconstruction_loss < cutoff_val * float_n_batch,
                    reconstruction_loss + queue_loss, reconstruction_loss)
            elif self.loss_mode is "trigger":
                trigger_val = np.array(self.loss_mode_params[0], np.float32)
                trigger_speed = np.array(1.0 / self.loss_mode_params[1],
                                         np.float32)
                trigger_is_on = theano.shared(np.array(0, np.int8))
                trigger_scale = theano.shared(np.array(0.0, np.float32))
                full_loss = reconstruction_loss + trigger_scale * queue_loss
                updates.append(
                    (trigger_is_on,
                     T.or_(trigger_is_on,
                           reconstruction_loss < trigger_val * float_n_batch)))
                updates.append((trigger_scale,
                                T.switch(
                                    trigger_is_on,
                                    T.minimum(trigger_scale + trigger_speed,
                                              np.array(1.0, np.float32)),
                                    np.array(0.0, np.float32))))
                full_info["trigger_scale"] = trigger_scale

            if queue_surrogate_loss_parts is not None:
                surrogate_loss, addtl_updates = queue_surrogate_loss_parts
                full_loss = full_loss + surrogate_loss
                updates.extend(addtl_updates)
                full_info["surrogate_loss"] = surrogate_loss

            return full_loss, full_info, updates

        train_loss, train_info, train_updates = _build(False)
        if self.train_decoder_only:
            params = list(
                itertools.chain(*(lstmstack.params
                                  for lstmstack in self.dec_lstmstacks)))
        else:
            params = self.params
        adam_updates = Adam(train_loss, params, lr=self.learning_rate_var)

        eval_loss, eval_info, _ = _build(True)

        self.loss_info_keys = list(train_info.keys())

        self.update_fun = theano.function(
            inputs=[chord_types, chord_roots, correct_notes] + relative_posns +
            encoded_melodies,
            outputs=[train_loss] + list(train_info.values()),
            updates=train_updates + adam_updates,
            allow_input_downcast=True,
            mode=(NanGuardMode(nan_is_error=True,
                               inf_is_error=True,
                               big_is_error=True) if self.nanguard else None))

        self.eval_fun = theano.function(
            inputs=[chord_types, chord_roots, correct_notes] + relative_posns +
            encoded_melodies,
            outputs=[eval_loss] + list(eval_info.values()),
            allow_input_downcast=True,
            mode=(NanGuardMode(nan_is_error=True,
                               inf_is_error=True,
                               big_is_error=True) if self.nanguard else None))
コード例 #20
0
def TestResNet4DistMatrix():
    """Smoke-test ``ResNet4DistMatrix`` by printing confusion matrices.

    Builds a ``ResNet4DistMatrix`` on symbolic inputs, compiles a Theano
    function that returns ``confusionMatrix(z)``, evaluates it on the first
    five entries of the dataset stored in ``seqDataset4HF.pkl`` and prints
    each confusion matrix, followed by the row-normalised sum of all five
    and the sums of its first three rows.
    """
    x = T.tensor3('x')
    y = T.tensor4('y')
    xmask = T.bmatrix('xmask')
    ymask = T.btensor3('ymask')
    selection = T.wtensor3('selection')

    # NOTE(review): cPickle only exists on Python 2; kept as in the original.
    import cPickle
    # A pickle is binary data, so open it in 'rb' mode; the context manager
    # closes the handle even when loading raises.
    with open('seqDataset4HF.pkl', 'rb') as fh:
        data = cPickle.load(fh)

    distancePredictor = ResNet4DistMatrix(rng=np.random.RandomState(),
                                          seqInput=x,
                                          matrixInput=y,
                                          n_in_seq=data[0][0].shape[2],
                                          n_in_matrix=data[1][0].shape[3],
                                          n_hiddens_seq=[3, 5],
                                          n_hiddens_matrix=[2],
                                          hwsz_seq=4,
                                          hwsz_matrix=4,
                                          mask_seq=xmask,
                                          mask_matrix=ymask)
    # Disabled: direct evaluation of the 1d / 2d outputs.
    # f = theano.function([x, y, xmask, ymask], distancePredictor.output_1d)
    # g = theano.function([x, y, xmask, ymask], distancePredictor.output_2d)

    # Synthetic inputs; only the disabled calls further down consume them.
    dataLen = 300
    batchSize = 60
    a = np.random.uniform(0, 1, (batchSize, dataLen, 20)).astype(np.float32)
    b = np.random.uniform(0, 1,
                          (batchSize, dataLen, dataLen, 3)).astype(np.float32)
    amask = np.zeros((batchSize, 0)).astype(np.int8)
    bmask = np.zeros((batchSize, 0, dataLen)).astype(np.int8)
    sel = np.ones((batchSize, dataLen, dataLen)).astype(np.int8)
    #print a
    #print b
    # Random labels in {0, 1, 2}: values are rounded, then capped at 2.
    c = np.random.uniform(0, 3, (batchSize, dataLen, dataLen)).round().astype(
        np.int8)
    np.putmask(c, c >= 2, 2)
    # Disabled: hand-placed labels.
    # c[0, 1, 13]=1
    # c[0, 2, 15]=1
    # c[0, 4, 16]=1
    #
    # c[0, 1, 27]=1
    # c[0, 2, 28]=1
    # c[0, 4, 29]=1
    #
    # c[1, 0, 13]=2
    # c[1, 1, 15]=2
    # c[1, 3, 16]=2
    #
    # c[2, 0, 23]=2
    # c[2, 1, 25]=2
    # c[2, 3, 26]=2
    #sel = c

    #out1d = f(a, b, amask, bmask)
    #out2d = g(a, b, amask, bmask)

    #print out1d
    #print out2d

    z = T.btensor3('z')
    # loss / errs / accs are built but not compiled; only confM is evaluated.
    loss = distancePredictor.loss(z, selection)
    errs = distancePredictor.ErrorsByRange(z)
    accs = distancePredictor.TopAccuracyByRange(z)
    confM = distancePredictor.confusionMatrix(z)

    # 'selection' may not feed confM, hence on_unused_input='ignore'.
    h = theano.function([x, y, xmask, ymask, selection, z],
                        confM,
                        on_unused_input='ignore')
    #l, e, accu = h(a, b, amask, bmask, sel, c)

    # Evaluate the first five dataset entries.
    cms = []
    for i in range(5):
        cm = h(data[0][i], data[1][i], data[2][i], data[3][i], data[4][i],
               data[5][i])
        print(cm)
        cms.append(cm)

    # Multiplying by 1. promotes the summed counts to float for the division.
    sumofcms = np.sum(cms, axis=0) * 1.

    # Normalise each leading-axis slice so that it sums to one.
    for i in range(sumofcms.shape[0]):
        sumofcms[i] /= np.sum(sumofcms[i])

    confusions = sumofcms
    print(confusions)
    print(np.sum(confusions[0]))
    print(np.sum(confusions[1]))
    print(np.sum(confusions[2]))
    """
コード例 #21
0
ファイル: testing.py プロジェクト: ishaansharma/DCNMT
def main(config, test_stream, testing_model):
    """Build the encoder-decoder graph and run the 'before_training' hooks.

    Creates the symbolic source/target inputs, builds a
    ``BidirectionalEncoder`` and a ``Decoder`` from ``config``, wraps the
    decoder cost in a ``Model`` and collects the extensions:

    * ``LoadNMT(testing_model)`` when ``config['reload']`` is truthy;
    * a ``BleuTester`` over ``test_stream`` when ``config['bleu_script']``
      is not ``None``.

    A ``MainLoop`` is created without algorithm or data stream, every
    extension is pointed at it, and its ``'before_training'`` extensions
    are run.
    """
    logger.info('Creating theano variables')
    # Source-side symbolic inputs.
    src_chars = tensor.lmatrix('source_char_seq')
    src_samples = tensor.btensor3('source_sample_matrix')
    src_aux = tensor.bmatrix('source_char_aux')
    src_word_mask = tensor.bmatrix('source_word_mask')
    # Target-side symbolic inputs.
    trg_chars = tensor.lmatrix('target_char_seq')
    trg_aux = tensor.bmatrix('target_char_aux')
    trg_char_mask = tensor.bmatrix('target_char_mask')
    trg_samples = tensor.btensor3('target_sample_matrix')
    trg_word_mask = tensor.bmatrix('target_word_mask')
    trg_resamples = tensor.btensor3('target_resample_matrix')
    trg_prev_chars = tensor.lmatrix('target_prev_char_seq')
    trg_prev_aux = tensor.bmatrix('target_prev_char_aux')
    bos_idx = test_stream.trg_bos
    space_idx = test_stream.space_idx['target']

    logger.info('Building RNN encoder-decoder')
    encoder_keys = ('src_vocab_size', 'enc_embed', 'src_dgru_nhids',
                    'enc_nhids', 'src_dgru_depth', 'bidir_encoder_depth')
    encoder = BidirectionalEncoder(*[config[key] for key in encoder_keys])

    decoder = Decoder(
        config['trg_vocab_size'],
        config['dec_embed'],
        config['trg_dgru_nhids'],
        config['trg_igru_nhids'],
        config['dec_nhids'],
        config['enc_nhids'] * 2,
        config['transition_depth'],
        config['trg_igru_depth'],
        config['trg_dgru_depth'],
        space_idx,
        bos_idx,
    )

    representation = encoder.apply(src_chars, src_samples, src_aux,
                                   src_word_mask)
    cost = decoder.cost(representation, src_word_mask, trg_chars,
                        trg_samples, trg_resamples, trg_aux, trg_char_mask,
                        trg_word_mask, trg_prev_chars, trg_prev_aux)

    logger.info("Building model")
    training_model = Model(cost)

    logger.info("Initializing extensions")
    extensions = []
    # Optionally restore parameters from the supplied testing model.
    if config['reload']:
        extensions.append(LoadNMT(testing_model))

    # The sampling graph is only needed when a BLEU script is configured.
    if config['bleu_script'] is not None:
        logger.info("Building sampling model")
        generated = decoder.generate(representation, source_word_mask=None) \
            if False else decoder.generate(representation, src_word_mask)
        search_model = Model(generated)
        outputs_filter = VariableFilter(bricks=[decoder.sequence_generator],
                                        name="outputs")
        # generated[config['transition_depth']] is next_outputs
        next_outputs = generated[config['transition_depth']]
        _, samples = outputs_filter(ComputationGraph(next_outputs))

        logger.info("Building bleu tester")
        bleu_tester = BleuTester(src_chars,
                                 src_samples,
                                 src_aux,
                                 src_word_mask,
                                 samples=samples,
                                 config=config,
                                 model=search_model,
                                 data_stream=test_stream,
                                 testing_model=testing_model,
                                 normalize=config['normalized_bleu'])
        extensions.append(bleu_tester)

    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=None,
                         data_stream=None,
                         extensions=extensions)

    # Give every extension a back-reference before firing the hook.
    for ext in main_loop.extensions:
        ext.main_loop = main_loop
    main_loop._run_extensions('before_training')
コード例 #22
0
def build_network():
    """Build the LSTM encoder-decoder and compile its Theano functions.

    The encoder LSTM reads ``l_in_x`` and returns only its final state. That
    state is repeated ``SEQ_LENGTH`` times, concatenated with ``l_in_y``
    along the feature axis and fed to the decoder LSTM, whose outputs go
    through a softmax dense layer of ``vocab_size`` units.

    Returns:
        A ``(train, predict, compute_cost)`` tuple of compiled functions:

        * ``train(x, y, target)`` returns the mean categorical cross-entropy
          and applies the Adagrad updates;
        * ``predict(x, y)`` returns the network output;
        * ``compute_cost(x, y, target)`` returns the cost without updating.

    Both the cost and the network output depend on the decoder input
    ``l_in_y`` (through the ``ConcatLayer``), so every compiled function has
    to list ``l_in_y.input_var`` among its inputs. Leaving it out, as
    ``predict`` and ``compute_cost`` used to, makes ``theano.function`` fail
    with a missing-input error at compile time.
    """
    from lasagne.layers import (InputLayer, LSTMLayer, ConcatLayer,
                                ReshapeLayer, DenseLayer, get_output,
                                get_all_params)
    from lasagne.objectives import categorical_crossentropy
    print("Building network ...")

    # inputs ###############################################
    l_in_x = InputLayer(shape=(BATCH_SIZE, None, vocab_size))
    l_in_y = InputLayer(shape=(BATCH_SIZE, None, vocab_size))

    # encoder ###############################################
    l_enc = LSTMLayer(l_in_x,
                      N_HIDDEN,
                      grad_clipping=GRAD_CLIP,
                      nonlinearity=lasagne.nonlinearities.tanh,
                      only_return_final=True)

    # decoder ###############################################
    l_repeated_enc = Repeat(l_enc, SEQ_LENGTH)
    l_conc = ConcatLayer([l_in_y, l_repeated_enc], axis=2)

    l_dec = LSTMLayer(l_conc,
                      N_HIDDEN,
                      grad_clipping=GRAD_CLIP,
                      nonlinearity=lasagne.nonlinearities.tanh)

    # output ###############################################
    # Flatten to 2-D so the dense layer is applied to every time step, then
    # restore the (batch, time, vocab) layout.
    l_dec_long = ReshapeLayer(l_dec, shape=(-1, N_HIDDEN))

    l_dist = DenseLayer(l_dec_long,
                        num_units=vocab_size,
                        nonlinearity=lasagne.nonlinearities.softmax)

    l_out = ReshapeLayer(l_dist, shape=(BATCH_SIZE, -1, vocab_size))

    # print(lasagne.layers.get_output_shape(l_out))

    # compilations ###############################################
    target_values = T.btensor3('target_output')
    network_output = get_output(l_out)
    cost = categorical_crossentropy(network_output, target_values).mean()

    all_params = get_all_params(l_out, trainable=True)
    print("Computing updates ...")
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    network_inputs = [l_in_x.input_var, l_in_y.input_var]

    train = theano.function(inputs=network_inputs + [target_values],
                            outputs=cost,
                            updates=updates,
                            allow_input_downcast=True)

    compute_cost = theano.function(inputs=network_inputs + [target_values],
                                   outputs=cost,
                                   allow_input_downcast=True)

    predict = theano.function(inputs=network_inputs,
                              outputs=network_output,
                              allow_input_downcast=True)

    return train, predict, compute_cost
コード例 #23
0
ファイル: charlstm.py プロジェクト: berangerd/charlstm
    def model_setup(self, mfile=None, num_units1=128, num_units2=128,
                          lrate=2e-3, drate=0.95, eps=1e-8, bptt_maxdepth=50,
                          l1=0, l2=0, char_dim=None):
        """Initialize the 2-layer LSTM model for learning or for the
        generation of sequences.

        Parameters are either loaded from ``mfile`` or drawn at random. In
        both cases ``self.p`` ends up holding Theano shared variables and
        ``self.theano_predict`` is compiled. When ``mfile`` is given the
        method returns right after that; otherwise it also compiles
        ``self.theano_check``, ``self.theano_train_rmsprop`` and
        ``self.theano_train_sgd``.

        Args:
            mfile: if not None, passed to ``self.load_params`` to obtain the
                initial parameter arrays; layer sizes are then read from the
                shapes of ``b1`` and ``b2``.
            num_units1: number of units of the first LSTM layer.
            num_units2: number of units of the second LSTM layer.
            lrate: learning rate (initial value of ``self.dyn_lrate``).
            drate: decay rate for rmsprop.
            eps: epsilon parameter for rmsprop.
            bptt_maxdepth: passed to ``theano.scan`` as ``truncate_gradient``.
            l1: L1 regularization parameter (applied when > 0).
            l2: L2 regularization parameter (applied when > 0).
            char_dim: size of the character dimension; when None (and no
                ``mfile``) it is taken from ``len(self.uchar)``.

        Raises:
            Exception: if neither ``mfile`` nor ``char_dim`` is given and
                ``self.uchar`` is falsy.
        """

        # the default parameters are identical to Andrej Karpathy's
        # (see https://github.com/karpathy/char-rnn)

        # 2-layer LSTM parameters
        # (placeholders; replaced by theano shared variables further down)
        self.p = {'U1': None, 'W1': None, 'b1': None,
                  'U2': None, 'W2': None, 'b2': None,
                  'V': None, 'c': None}
        # learning parameters
        self.lp = {'lrate': lrate, # learning rate
                   'drate': drate, # decay rate for rmsprop
                   'eps': eps, # epsilon parameter for rmsprop
                   'bptt_maxdepth': bptt_maxdepth, # backpropagation cutoff
                   'l1': l1, # L1 regularization parameter
                   'l2': l2 # L2 regularization parameter
                  }

        if mfile is not None: # loading parameters from an npz file
            np_init = self.load_params(mfile)
            # the layer sizes follow the loaded biases, overriding the
            # num_units1 / num_units2 arguments
            num_units1 = np_init['b1'].shape[1]
            num_units2 = np_init['b2'].shape[1]
        else:
            if char_dim is None:
                if self.uchar:
                    char_dim = len(self.uchar)
                else:
                    raise Exception('prepare_input() should be run before ' +
                                    'model_setup() unless mfile is provided')

            # initialize small random weights
            # (uniform range of each matrix is sqrt(1 / fan-in))
            r_char_dim = np.sqrt(1./(char_dim))
            r_units1 = np.sqrt(1./(num_units1))
            r_units2 = np.sqrt(1./(num_units2))

            def uniform(rng, shape):
                # samples uniformly in [-rng, rng), cast to floatX
                return np.random.uniform(-rng, rng,
                                         shape).astype(theano.config.floatX)

            # NOTE(review): despite its name this helper is a copy of
            # uniform() (no normal distribution involved) and it is not
            # called anywhere in this method.
            def randn(rng, shape):
                return np.random.uniform(-rng, rng,
                                         shape).astype(theano.config.floatX)

            def bias_hack(num_units):
                # biases of the four gates; row 0 (forget gate) starts at 1
                b = np.zeros((4, num_units))
                b[0] = 1. # forget gate hack
                          # helps the network remember information
                return b.astype(theano.config.floatX)

            def zeros(shape):
                return np.zeros(shape).astype(theano.config.floatX)

            # NOTE(review): ones() is not called anywhere in this method.
            def ones(shape):
                return np.ones(shape).astype(theano.config.floatX)

            # parameters for the gates
            # [0]: forget
            # [1]: input
            # [2]: output
            # [3]: cell state update
            np_init = {}

            # first layer
            np_init['U1'] = uniform(r_char_dim, (4, num_units1, char_dim))
            np_init['W1'] = uniform(r_units1, (4, num_units1, num_units1))
            np_init['b1'] = bias_hack(num_units1)

            # second layer
            np_init['U2'] = uniform(r_units1, (4, num_units2, num_units1))
            np_init['W2'] = uniform(r_units2, (4, num_units2, num_units2))
            np_init['b2'] = bias_hack(num_units2)

            # parameters for the last layer (cell output -> network output)
            np_init['V'] = uniform(r_units2, (char_dim, num_units2))
            np_init['c'] = zeros(char_dim)

            # dynamical learning rate (in case the user wants to modify it
            # during the learning process)
            # NOTE: self.dyn_lrate is only created on this (no mfile) path
            if theano.config.floatX == 'float32':
                dyn_lrate_init = np.float32(self.lp['lrate'])
            else:
                dyn_lrate_init = np.float64(self.lp['lrate'])
            self.dyn_lrate = theano.shared(dyn_lrate_init, name='dyn_lrate')

            # parameters for rmsprop (running average of gradients)
            msq_g = {}
            for param in self.p:
                msq_g[param] = theano.shared(zeros(np_init[param].shape),
                                             name='msq_g'+param)

        # wrap every parameter array in a theano shared variable
        for param in self.p:
            self.p[param] = theano.shared(np_init[param], name=param)

        # symbolic inputs: batched runs get one extra dimension
        if self.batch_size > 1:
            x = T.imatrix('x')
            y = T.btensor3('y')
        else:
            x = T.ivector('x')
            y = T.bmatrix('y')

        def forward_prop(x, ht1m1, Ct1m1, ht2m1, Ct2m1,
                         U1, W1, b1, U2, W2, b2, V, c):
            # defines each time step of the RNN model
            # x: current slice of the input sequence; h*m1 / C*m1: hidden and
            # cell states of layers 1 and 2 from the previous step; the
            # remaining arguments are the shared parameters (non_sequences)

            if self.batch_size > 1: # transform into column vectors
                col_b1 = b1.dimshuffle((0,1,'x'))
                col_b2 = b2.dimshuffle((0,1,'x'))
                col_c = c.dimshuffle((0,'x'))
            else:
                col_b1 = b1
                col_b2 = b2
                col_c = c

            # layer 1
            # U1[i][:,x] picks the columns of U1[i] indexed by x
            gates1 = []
            for i in xrange(3): # forget, input and output gates
                gates1.append(T.nnet.sigmoid(U1[i][:,x] +
                                             W1[i].dot(ht1m1) +
                                             col_b1[i]))
            tentative_Ct1 = T.tanh(U1[3][:,x] + W1[3].dot(ht1m1) + col_b1[3])

            Ct1 = Ct1m1 * gates1[0] + tentative_Ct1 * gates1[1]
            ht1 = gates1[2] * T.tanh(Ct1)

            # layer 2
            gates2 = []
            for i in xrange(3): # forget, input and output gates
                gates2.append(T.nnet.sigmoid(U2[i].dot(ht1) +
                                             W2[i].dot(ht2m1) +
                                             col_b2[i]))
            tentative_Ct2 = T.tanh(U2[3].dot(ht1) + W2[3].dot(ht2m1) +
                                   col_b2[3])

            Ct2 = Ct2m1 * gates2[0] + tentative_Ct2 * gates2[1]
            ht2 = gates2[2] * T.tanh(Ct2)

            # final layer
            o = T.nnet.softmax((V.dot(ht2) + col_c).T)

            return [o, ht1, Ct1, ht2, Ct2]

        # shapes of the initial (all-zero) hidden and cell states
        if self.batch_size > 1:
            ht1_Ct1_size = (num_units1, self.batch_size)
            ht2_Ct2_size = (num_units2, self.batch_size)
        else:
            ht1_Ct1_size = num_units1
            ht2_Ct2_size = num_units2

        # unroll forward_prop over x; `updates` returned here is rebound to a
        # plain dict further down and never passed to theano.function
        [o, ht1, Ct1, ht2, Ct2], updates = theano.scan(
            fn=forward_prop,
            sequences=x,
            outputs_info=[None,
                            T.zeros(ht1_Ct1_size),
                            T.zeros(ht1_Ct1_size),
                            T.zeros(ht2_Ct2_size),
                            T.zeros(ht2_Ct2_size)
                           ],
            non_sequences=[self.p['U1'], self.p['W1'], self.p['b1'],
                           self.p['U2'], self.p['W2'], self.p['b2'],
                           self.p['V'], self.p['c']],
            truncate_gradient=self.lp['bptt_maxdepth'],
            strict=True)

        # o is a (seq_len, batch_size, char_dim) tensor---even if batch_size=1
        prediction = T.argmax(o, axis=2)

        # returns the output distribution and its argmax for an input x
        self.theano_predict = theano.function(
            inputs=[x],
            outputs=[o, prediction],
        )

        if mfile is not None: # not here for learning; we can stop here
            return

        # compute the cross-entropy loss
        xent = (-y*T.log(o)).sum(axis=2) # (string_len, batch_size) matrix
        cost = T.mean(xent)

        # regularization using L1 and/or L2 norms
        reg_cost = cost

        # cast into theano.config.floatX is a trick to avoid float64 below
        tot_shape = (xent.shape[0] * xent.shape[1]).astype(theano.config.floatX)

        for param in self.p:
            if l1 > 0: # L1 regularization
                reg_cost += l1 * T.sum(abs(self.p[param])) / tot_shape
            if l2 > 0: # L2 regularization
                reg_cost += l2 * T.sum(self.p[param] ** 2) / tot_shape

        # gradients of the regularized cost w.r.t. every parameter
        g = {}
        for param in self.p:
            g[param] = T.grad(reg_cost, self.p[param])

        # for rmsprop
        new_msq_g = {}
        updates = {} # per-parameter rmsprop step (rebinds the scan result)
        rmsprop_updates = []
        sgd_updates = []
        ratios = {}
        for param in self.p:
            # running average of the squared gradient
            new_msq_g[param] = (self.lp['drate'] * msq_g[param] +
                               (1. - self.lp['drate']) * g[param]**2)

            updates[param] = (self.dyn_lrate * g[param] /
                             (T.sqrt(new_msq_g[param]) + self.lp['eps']))

            # update to parameter scale ratio
            ratios[param] = (T.flatten(updates[param]).norm(2) /
                             T.flatten(self.p[param]).norm(2))

            sgd_updates.append((self.p[param],
                                self.p[param] - self.dyn_lrate * g[param]))

            rmsprop_updates.append((self.p[param],
                                    self.p[param] - updates[param]))
            rmsprop_updates.append((msq_g[param], new_msq_g[param]))

            # todo: add possibility to clip gradients to some value

        # note: the reported cost is the unregularized cross-entropy
        f_out = [cost, prediction]

        # compute cost and prediction but do not update the weights
        self.theano_check = theano.function(
            inputs=[x, y],
            outputs=f_out,
        )

        # the training functions additionally return the rmsprop
        # update/parameter norm ratios, in this fixed order
        f_out.extend([ratios['U1'], ratios['W1'], ratios['b1'],
                      ratios['U2'], ratios['W2'], ratios['b2'],
                      ratios['V'], ratios['c']])

        # mini-batch training with rmsprop
        self.theano_train_rmsprop = theano.function(
            inputs=[x, y],
            outputs=f_out,
            updates=rmsprop_updates
        )

        # mini-batch training with stochastic gradient descent
        self.theano_train_sgd = theano.function(
            inputs=[x, y],
            outputs=f_out,
            updates=sgd_updates
        )
コード例 #24
0
    def setup_generate(self):
        """Compile ``self.generate_fun``, the seeded free-running generator.

        Creates the symbolic inputs ``self.generate_seed_input`` and
        ``self.steps_to_simulate``, runs the time model over every seed step
        but the last, and then scans ``self.steps_to_simulate`` further
        steps in which each step's output is converted back into the next
        step's input. The outputs of that second scan are stored in
        ``self.predicted_output`` and returned by ``self.generate_fun``.
        """
        print('{:25}'.format("Setup Generate"), end='', flush=True)

        self.generate_seed_input = T.btensor3()
        self.steps_to_simulate = T.iscalar()

        def fixed_dropout_masks():
            # With dropout enabled every layer gets the constant
            # (1 - dropout) in place of a mask, except the first layer,
            # which gets None. Without dropout no masks are passed.
            if self.dropout > 0:
                masks = [1 - self.dropout for _ in self.time_model.layers]
                masks[0] = None
                return masks
            return []

        def step_time_seed(in_data, *hiddens):
            # One time-model step over a seed frame.
            return self.time_model.forward(in_data,
                                           prev_hiddens=hiddens,
                                           dropout=fixed_dropout_masks())

        # Warm up the time model on all seed frames except the last one.
        time_inputs = self.generate_seed_input[0:-1]
        n_time, n_note, n_ipn = time_inputs.shape

        time_outputs_info_seed = [
            initial_state_with_taps(layer, n_note)
            for layer in self.time_model.layers
        ]
        time_result, _ = theano.scan(fn=step_time_seed,
                                     sequences=[time_inputs],
                                     outputs_info=time_outputs_info_seed)

        last_layer = get_last_layer(time_result)
        n_hidden = last_layer.shape[2]

        def step_time(*states):
            # states = time-model hiddens..., previous input, time counter
            *hiddens, in_data, time = states

            new_states = self.time_model.forward(
                in_data,
                prev_hiddens=hiddens,
                dropout=fixed_dropout_masks())
            time_final = get_last_layer(new_states)

            # The note-axis scan starts from an all-zero int8 vector.
            start_note_values = theano.tensor.alloc(np.array(0, dtype=np.int8),
                                                    self.output_size)
            note_outputs_info = [
                initial_state_with_taps(layer)
                for layer in self.pitch_model.layers
            ]
            note_outputs_info.append(
                dict(initial=start_note_values, taps=[-1]))

            notes_result, updates = theano.scan(fn=self._predict_step_note,
                                                sequences=[time_final],
                                                outputs_info=note_outputs_info)
            output = get_last_layer(notes_result)

            # Feed the generated output back in as the next step's input.
            next_time = time + 1
            to_input_form = OutputFormToInputFormOp(self.data_manager)
            next_input = to_input_form(output, next_time)

            step_outputs = ensure_list(new_states) + [
                next_input, next_time, output
            ]
            return step_outputs, updates

        # Recurrent outputs: hiddens, next input (seeded with the last seed
        # frame), time counter (seeded with n_time); the final output is not
        # fed back, hence None.
        time_outputs_info = time_outputs_info_seed + [
            dict(initial=self.generate_seed_input[-1], taps=[-1]),
            dict(initial=n_time, taps=[-1]),
            None,
        ]

        time_result, updates = theano.scan(fn=step_time,
                                           outputs_info=time_outputs_info,
                                           n_steps=self.steps_to_simulate)

        self.predicted_output = time_result[-1]

        generate_inputs = [
            self.steps_to_simulate,
            self.conservativity,
            self.generate_seed_input,
        ]
        self.generate_fun = theano.function(inputs=generate_inputs,
                                            outputs=self.predicted_output,
                                            updates=updates,
                                            allow_input_downcast=True,
                                            on_unused_input='warn')

        print("Done")
コード例 #25
0
def BuildModel(modelSpecs, forTrain=True):
    """Create the symbolic inputs and the ``ResNet4DistMatrix`` predictor.

    Args:
        modelSpecs: model specification; ``modelSpecs['responses']`` is read
            when ``forTrain`` is truthy, and the whole object is forwarded
            to ``ResNet4DistMatrix`` and to the ``config`` helpers.
        forTrain: when truthy, the bounding box, the optional
            ``trainByRefLoss`` scalar and the label / weight variables are
            created as well.

    Returns:
        For training, the tuple ``(distancePredictor, x, y, xmask, ymask,
        xem, labelList, weightList, box, trainByRefLoss)``; otherwise
        ``(distancePredictor, x, y, xmask, ymask, xem)``.
    """
    rng = np.random.RandomState()

    # x holds the sequential features, y the matrix (pairwise) features;
    # xmask / ymask are their respective masks.
    x = T.tensor3('x')
    y = T.tensor4('y')
    xmask = T.bmatrix('xmask')
    ymask = T.btensor3('ymask')

    # Embedding input, only when the model specification asks for it.
    xem = T.tensor3('xem') if config.EmbeddingUsed(modelSpecs) else None

    # Bounding box for cropping a big protein distance matrix; the box
    # allows a crop at any position.
    box = T.ivector('boundingbox') if forTrain else None

    # trainByRefLoss can be either 1 or -1. When this variable exists, the
    # model is trained using both the reference loss and the loss of real
    # data.
    if forTrain and config.TrainByRefLoss(modelSpecs):
        trainByRefLoss = T.iscalar('trainByRefLoss')
    else:
        trainByRefLoss = None

    distancePredictor = ResNet4DistMatrix(rng,
                                          seqInput=x,
                                          matrixInput=y,
                                          mask_seq=xmask,
                                          mask_matrix=ymask,
                                          embedInput=xem,
                                          boundingbox=box,
                                          modelSpecs=modelSpecs)

    # labelList holds one label tensor per response, of shape
    # (batchSize, seqLen, seqLen) or
    # (batchSize, seqLen, seqLen, valueDims[response]).
    # It stays empty when the model is built for prediction.
    labelList = []
    if forTrain:
        for response in modelSpecs['responses']:
            labelType = Response2LabelType(response)
            rValDims = GetResponseValueDims(response)

            isDiscrete = labelType.startswith('Discrete')
            # A vector-valued response needs a 4-d tensor.
            isVector = rValDims > 1

            if isDiscrete:
                # wtensor is for 16bit integer
                makeLabel = T.wtensor4 if isVector else T.wtensor3
            else:
                makeLabel = T.tensor4 if isVector else T.tensor3
            labelList.append(makeLabel('Tlabel4' + response))

    # weightList holds one label-weight tensor per response, each of shape
    # (batchSize, seqLen, seqLen); it is empty without labels.
    weightList = []
    if labelList and config.UseSampleWeight(modelSpecs):
        for response in modelSpecs['responses']:
            weightList.append(T.tensor3('Tweight4' + response))

    # For prediction, both labelList and weightList are empty and are not
    # returned.
    if not forTrain:
        return distancePredictor, x, y, xmask, ymask, xem
    return (distancePredictor, x, y, xmask, ymask, xem, labelList,
            weightList, box, trainByRefLoss)