def main(save_to, num_batches, continue_=False):
    mlp = MLP([Tanh(), Identity()], [1, 10, 1],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0), seed=1)
    mlp.initialize()
    x = tensor.vector('numbers')
    y = tensor.vector('roots')
    cost = SquaredError().apply(y[:, None], mlp.apply(x[:, None]))
    cost.name = "cost"
    main_loop = MainLoop(
        GradientDescent(
            cost=cost, params=ComputationGraph(cost).parameters,
            step_rule=Scale(learning_rate=0.001)),
        get_data_stream(range(100)),
        model=Model(cost),
        extensions=([LoadFromDump(save_to)] if continue_ else []) +
        [Timing(),
         FinishAfter(after_n_batches=num_batches),
         DataStreamMonitoring(
             [cost], get_data_stream(range(100, 200)),
             prefix="test"),
         TrainingDataMonitoring([cost], after_epoch=True),
         Dump(save_to),
         Printing()])
    main_loop.run()
    return main_loop
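# The example above assumes a `get_data_stream` helper that yields batches
# with 'numbers' and 'roots' sources. A minimal sketch using Fuel follows;
# the batch size of 20 and the helper names are assumptions, not taken from
# the original.
import numpy
from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import Batch, Mapping


def _as_float_arrays(data):
    # Batch delivers lists of examples; convert each source to an array.
    return tuple(numpy.asarray(d, dtype='float32') for d in data)


def get_data_stream(iterable):
    # Serve (number, sqrt(number)) pairs in batches of 20.
    numbers = numpy.asarray(list(iterable), dtype='float32')
    dataset = IterableDataset({'numbers': numbers,
                               'roots': numpy.sqrt(numbers)})
    return Mapping(Batch(DataStream(dataset), ConstantScheme(20)),
                   _as_float_arrays)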
def apply(self, input_, target):
    x_to_h = Linear(name='x_to_h',
                    input_dim=self.dims[0],
                    output_dim=self.dims[1] * 4)
    pre_rnn = x_to_h.apply(input_)
    pre_rnn.name = 'pre_rnn'
    rnn = LSTM(activation=Tanh(),
               dim=self.dims[1], name=self.name)
    h, _ = rnn.apply(pre_rnn)
    h.name = 'h'
    h_to_y = Linear(name='h_to_y',
                    input_dim=self.dims[1],
                    output_dim=self.dims[2])
    y_hat = h_to_y.apply(h)
    y_hat.name = 'y_hat'

    cost = SquaredError().apply(target, y_hat)
    cost.name = 'MSE'

    self.outputs = {}
    self.outputs['y_hat'] = y_hat
    self.outputs['cost'] = cost
    self.outputs['pre_rnn'] = pre_rnn
    self.outputs['h'] = h

    # Initialization
    for brick in (rnn, x_to_h, h_to_y):
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0)
        brick.initialize()
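# Usage sketch for the `apply` method above. The enclosing class is not
# shown in the example; `SequenceRegressor` and its constructor are
# hypothetical stand-ins that only provide the `dims` and `name`
# attributes the method relies on.
import theano.tensor as tensor

x = tensor.tensor3('x')  # (time, batch, features)
y = tensor.tensor3('y')  # (time, batch, targets)
model = SequenceRegressor(dims=[6, 50, 1], name='lstm')  # hypothetical
model.apply(x, y)
cost = model.outputs['cost']  # ready for ComputationGraph(cost)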
def get_costs(presoft, args):

    if has_indices(args.dataset):
        # Targets: (Time X Batch)
        y = tensor.lmatrix('targets')
        y_mask = tensor.ones_like(y, dtype=floatX)
        y_mask = tensor.set_subtensor(
            y_mask[:args.context, :],
            tensor.zeros_like(y_mask[:args.context, :], dtype=floatX))

        time, batch, feat = presoft.shape
        cross_entropy = Softmax().categorical_cross_entropy(
            (y.flatten() * y_mask.reshape((batch * time, ))),
            (presoft.reshape((batch * time, feat)) *
             y_mask.reshape((batch * time, 1))))

        # renormalization
        renormalized_cross_entropy = cross_entropy * (
            tensor.sum(tensor.ones_like(y_mask)) / tensor.sum(y_mask))

        # BPC: Bits Per Character
        unregularized_cost = renormalized_cross_entropy / tensor.log(2)
        unregularized_cost.name = "cross_entropy"

    else:
        # Targets: (Time X Batch X Features)
        y = tensor.tensor3('targets', dtype=floatX)
        y_mask = tensor.ones_like(y[:, :, 0], dtype=floatX)
        y_mask = tensor.set_subtensor(
            y_mask[:args.context, :],
            tensor.zeros_like(y_mask[:args.context, :], dtype=floatX))
        if args.used_inputs is not None:
            y_mask = tensor.set_subtensor(
                y_mask[:args.used_inputs, :],
                tensor.zeros_like(y_mask[:args.used_inputs, :],
                                  dtype=floatX))

        # SquaredError does not work on 3D tensors, so flatten the time and
        # batch axes first
        target = (y * y_mask.dimshuffle(0, 1, 'x'))
        values = (presoft[:-1, :, :] * y_mask.dimshuffle(0, 1, 'x'))
        target = target.reshape(
            (target.shape[0] * target.shape[1], target.shape[2]))
        values = values.reshape(
            (values.shape[0] * values.shape[1], values.shape[2]))
        unregularized_cost = SquaredError().apply(target, values)

        # renormalization
        unregularized_cost = unregularized_cost * (
            tensor.sum(tensor.ones_like(y_mask)) / tensor.sum(y_mask))
        unregularized_cost.name = "mean_squared_error"

    # TODO: add regularisation for the cost
    # tensor.log(1) equals zero; it is added only to create a distinct
    # variable so the two costs can be monitored separately
    cost = unregularized_cost + tensor.log(1)
    cost.name = "regularized_cost"
    return cost, unregularized_cost
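# Why the renormalization factor above: the mask zeroes out the first
# `args.context` timesteps, so the summed cost only covers sum(y_mask) of
# the time * batch positions; multiplying by total / kept restores a
# per-position average. A toy numeric check (shapes are assumptions):
import numpy
mask = numpy.ones((10, 4), dtype='float32')  # (time, batch)
mask[:3, :] = 0.0                            # context = 3
rescale = mask.size / mask.sum()             # 40 / 28, about 1.43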
def train(self):
    x = self.sharedBatch['x']
    x.name = 'x_myinput'
    xmini = self.sharedBatch['xmini']
    xmini.name = 'xmini_myinput'
    y = self.sharedBatch['y']
    y.name = 'y_myinput'

    # we need to provide data for the LSTM layer of size 4 * lstm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(self.input_dimx, self.dim, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    xmini_to_h = Linear(self.input_dimxmini, self.mini_dim,
                        name='xmini_to_h',
                        weights_init=IsotropicGaussian(),
                        biases_init=Constant(0.0))

    rnnwmini = RNNwMini(dim=self.dim, mini_dim=self.mini_dim,
                        summary_dim=self.summary_dim)

    h_to_o = Linear(self.summary_dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    xmini_transform = xmini_to_h.apply(xmini)
    h = rnnwmini.apply(x=x_transform, xmini=xmini_transform)

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    # y_hat = Logistic().apply(y_hat)

    cost = SquaredError().apply(y, y_hat)
    cost.name = 'cost'

    rnnwmini.initialize()
    x_to_h.initialize()
    xmini_to_h.initialize()
    h_to_o.initialize()

    self.f = theano.function(inputs=[], outputs=y_hat)
    # print("self.f === ")
    # print(self.f())
    # print(self.f().shape)
    # print("====")

    self.cg = ComputationGraph(cost)
    m = Model(cost)

    algorithm = GradientDescent(cost=cost,
                                parameters=self.cg.parameters,
                                step_rule=RMSProp(learning_rate=0.01),
                                on_unused_sources='ignore')

    valid_monitor = DataStreamMonitoringShared(
        variables=[cost],
        data_stream=self.stream_valid_int,
        prefix="valid",
        sharedBatch=self.sharedBatch,
        sharedData=self.sharedData)
    train_monitor = TrainingDataMonitoring(variables=[cost],
                                           prefix="train",
                                           after_epoch=True)

    sharedVarMonitor = SwitchSharedReferences(self.sharedBatch,
                                              self.sharedData)
    tBest = self.track_best('valid_cost', self.cg)
    self.tracker = tBest[0]
    extensions = [sharedVarMonitor, valid_monitor] + tBest

    if self.debug:
        extensions.append(Printing())

    self.algorithm = algorithm
    self.extensions = extensions
    self.model = m
    self.mainloop = MainLoop(self.algorithm, self.stream_train_int,
                             extensions=self.extensions, model=self.model)
    self.main_loop(True)
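# `track_best` above is project-specific. A plausible sketch of such a
# method built from standard Blocks extensions; the checkpoint file name
# and the `min` criterion are assumptions, and `cg` is unused here.
from blocks.extensions.predicates import OnLogRecord
from blocks.extensions.saveload import Checkpoint
from blocks.extensions.training import TrackTheBest


def track_best(self, channel, cg):
    # Track the best value of `channel` and checkpoint when it improves.
    tracker = TrackTheBest(channel, choose_best=min)
    checkpoint = Checkpoint('best_model.pkl', after_training=False)
    checkpoint.add_condition(
        ['after_epoch'],
        predicate=OnLogRecord(channel + '_best_so_far'))
    return [tracker, checkpoint]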
lstm.biases_init = Constant(0.)
lstm.initialize()

# ComputationGraph(encode.apply(x)).get_theano_function()(features_test)[0].shape
# ComputationGraph(lstm.apply(encoded)).get_theano_function()(features_test)
# ComputationGraph(decode.apply(hiddens[-1])).get_theano_function()(features_test)[0].shape
# ComputationGraph(SquaredError().apply(y, y_hat.flatten())).get_theano_function()(features_test, targets_test)[0].shape

encoded = encode.apply(x)
# hiddens = lstm.apply(encoded, gates.apply(x))
hiddens = lstm.apply(encoded)
y_hat = decode.apply(hiddens[-1])
cost = SquaredError().apply(y, y_hat)
cost.name = 'cost'
# ipdb.set_trace()

# ComputationGraph(y_hat).get_theano_function()(features_test)[0].shape
# ComputationGraph(cost).get_theano_function()(features_test, targets_test)[0].shape

cg = ComputationGraph(cost)
# cg = ComputationGraph(hiddens).get_theano_function()
# ipdb.set_trace()
algorithm = GradientDescent(cost=cost, params=cg.parameters,
                            step_rule=CompositeRule([StepClipping(5.0),
                                                     Scale(0.01)]))
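# Assumed setup preceding the excerpt above: `encode`, `lstm` and `decode`
# are bricks, and `x`/`y` are the symbolic inputs. The dimensions and
# names here are guesses for illustration only.
import theano.tensor as tensor
from blocks.bricks import Linear
from blocks.bricks.recurrent import LSTM
from blocks.initialization import Constant, IsotropicGaussian

input_dim, lstm_dim = 6, 50  # assumed sizes
x = tensor.tensor3('features')  # (time, batch, input_dim)
y = tensor.matrix('targets')
encode = Linear(name='encode', input_dim=input_dim,
                output_dim=4 * lstm_dim)  # LSTM expects 4 * dim inputs
decode = Linear(name='decode', input_dim=lstm_dim, output_dim=1)
lstm = LSTM(dim=lstm_dim, name='lstm',
            weights_init=IsotropicGaussian(0.01))
for brick in (encode, decode):
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0.)
    brick.initialize()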
# MODEL SETUP
textRNN = TextRNN(dim_in=VECTOR_SIZE, dim_hidden=HIDDEN_UNITS,
                  dim_out=VECTOR_SIZE)
output = textRNN.run(inputs=x)
# get_states_and_output = T.function([x, x_mask], [output])

# COST SETUP
# y_hat = np.float32(np.ones((3,1)))
labels = np.float32([data[1] for data in dataset])
inputs_data = np.float32([data[0] for data in dataset])
test_labels = np.float32([data[1] for data in test_dataset])
test_inputs_data = np.float32([data[0] for data in test_dataset])

cost = SquaredError().apply(y, output)
cost.name = 'MSE_with_regularization'
cg = ComputationGraph(cost)

# inputs = VariableFilter(roles=[INPUT], bricks=[SimpleRecurrent])(cg.variables)
# inputs = [inputs[0]]
# cg_dropout = apply_dropout(cg, inputs, 0.5)
# fprop_dropout = T.function([cg_dropout.inputs], [cg_dropout.outputs[0]])
# dropped_out = VariableFilter(roles=[DROPOUT])(cg.variables)
# inputs_referenced = [var.tag.replacement_of for var in dropped_out]
# set(inputs) == set(inputs_referenced)

get_states_and_output = T.function([x], [output])

# W = VariableFilter(roles=[WEIGHT])(cg.variables)
# W = W
# cost = cost + 0.005 * (W ** 2).sum()
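# The excerpt uses `x`, `y` and the alias `T` without defining them; from
# `T.function(...)` the alias is presumably `import theano as T`. A sketch
# of the assumed setup (shapes are guesses):
import theano as T
import theano.tensor as tensor

x = tensor.tensor3('x')  # (time, batch, VECTOR_SIZE) input sequences
y = tensor.matrix('y')   # targets compared against `output`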
def train_lstm(train, test, input_dim, hidden_dimension, columns, epochs,
               save_file, execution_name, batch_size, plot):
    stream_train = build_stream(train, batch_size, columns)
    stream_test = build_stream(test, batch_size, columns)

    # The train stream will return (TimeSequence, BatchSize, Dimensions)
    # for the features, and (TimeSequence, BatchSize, 1) for the targets
    x = T.tensor3('x')
    y = T.tensor3('y')
    y = y.reshape((y.shape[1], y.shape[0], y.shape[2]))

    # input_dim = 6
    # output_dim = 1
    linear_lstm = LinearLSTM(input_dim, 1, hidden_dimension,
                             # print_intermediate=True,
                             print_attrs=['__str__', 'shape'])

    y_hat = linear_lstm.apply(x)
    linear_lstm.initialize()

    c_test = AbsolutePercentageError().apply(y, y_hat)
    c_test.name = 'mape'
    c = SquaredError().apply(y, y_hat)
    c.name = 'cost'

    cg = ComputationGraph(c_test)

    def one_perc_min(current_value, best_value):
        if (1 - best_value / current_value) > 0.01:
            return best_value
        else:
            return current_value

    extensions = []
    extensions.append(DataStreamMonitoring(variables=[c, c_test],
                                           data_stream=stream_test,
                                           prefix='test',
                                           after_epoch=False,
                                           every_n_epochs=100))
    extensions.append(TrainingDataMonitoring(variables=[c_test],
                                             prefix='train',
                                             after_epoch=True))
    extensions.append(FinishAfter(after_n_epochs=epochs))
    # extensions.append(Printing())
    # extensions.append(ProgressBar())
    extensions.append(TrackTheBest('test_mape', choose_best=one_perc_min))
    extensions.append(TrackTheBest('test_cost', choose_best=one_perc_min))
    extensions.append(FinishIfNoImprovementAfter('test_cost_best_so_far',
                                                 epochs=500))

    # Save only parameters, not the whole main loop, and only when
    # best_test_cost is updated
    checkpoint = Checkpoint(save_file, save_main_loop=False,
                            after_training=False)
    checkpoint.add_condition(['after_epoch'],
                             predicate=OnLogRecord('test_cost_best_so_far'))
    extensions.append(checkpoint)

    if BOKEH_AVAILABLE and plot:
        extensions.append(Plot(execution_name,
                               channels=[[
                                   # 'train_cost',
                                   'test_cost']]))

    step_rule = Adam()
    algorithm = GradientDescent(cost=c_test, parameters=cg.parameters,
                                step_rule=step_rule)

    main_loop = MainLoop(algorithm, stream_train, model=Model(c_test),
                         extensions=extensions)
    main_loop.run()

    test_mape = 0
    if main_loop.log.status.get('best_test_mape', None) is None:
        with open(save_file, 'rb') as f:
            parameters = load_parameters(f)
        model = main_loop.model
        model.set_parameter_values(parameters)
        ev = DatasetEvaluator([c_test])
        test_mape = ev.evaluate(stream_test)['mape']
    else:
        test_mape = main_loop.log.status['best_test_mape']

    return test_mape, main_loop.log.status['epochs_done']
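# `AbsolutePercentageError` is not a stock Blocks cost brick. A minimal
# sketch of how such a brick could be written (an assumption about the
# project's own implementation, not a copy of it); it assumes nonzero
# targets.
import theano.tensor as tensor
from blocks.bricks.base import application
from blocks.bricks.cost import Cost


class AbsolutePercentageError(Cost):
    @application(outputs=["cost"])
    def apply(self, y, y_hat):
        # Mean absolute deviation relative to the target, as a percentage.
        return 100. * tensor.mean(abs((y - y_hat) / y))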
hidden_dims = [int(dim) for dim in args.dim.split(",")]
if args.batchnorm:
    network = BatchNormalizedMLP
else:
    network = MLP
autoencoder = network(
    activations=[Tanh() for _ in xrange(len(hidden_dims))] + [Identity()],
    dims=[input_dim] + hidden_dims + [input_dim],
    weights_init=Uniform(width=0.02),
    biases_init=Constant(0))
autoencoder.initialize()

hopefully_states_again = autoencoder.apply(states)
cost = SquaredError().apply(hopefully_states_again, states)
cost.name = "squared_error"
cost_model = Model(cost)
algorithm = GradientDescent(cost=cost, parameters=cost_model.parameters,
                            step_rule=Adam())

# handle data
data = H5PYDataset(args.file, which_sets=("train",), load_in_memory=True)

# trash data for testing
"""
dataraw = numpy.zeros((10000, 512), dtype="float32")
for row in xrange(dataraw.shape[0]):
    dataraw[row] = numpy.random.rand(512)
data = OrderedDict()
data["act_seqs"] = dataraw
data = IndexableDataset(data)
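# The excerpt is truncated inside the commented-out test-data section.
# Wiring the real H5PYDataset into a MainLoop could look like this; the
# batch size, iteration scheme and extension list are assumptions.
from blocks.extensions import FinishAfter, Printing
from blocks.main_loop import MainLoop
from fuel.schemes import ShuffledScheme
from fuel.streams import DataStream

stream = DataStream(data, iteration_scheme=ShuffledScheme(
    examples=data.num_examples, batch_size=100))
main_loop = MainLoop(algorithm, stream, model=cost_model,
                     extensions=[FinishAfter(after_n_epochs=10),
                                 Printing()])
main_loop.run()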