0))

# Construct the network
net.layer_opts['filter_shape'] = (3, 1, 8, 8)
net.content['l1'] = ConvLayer(net, net.content['input'])

net.layer_opts['filter_shape'] = (3, 3, 1, 1)
net.content['l2'] = ConvLayer(net, net.content['l1'])

net.layer_opts['softmax_norm_dim'] = 1
net.content['l3'] = SoftmaxLayer(net, net.content['l2'])

net.content['cost'] = CategoricalCrossEntropy(net, net.content['l3'])

# Print the network architecture
net.simpleprint()

# Initialize learning rate for each updatable layer
net.InitLR(0.5)

# Create params list, grad list, momentum list for the theano function to update
trainer.InitParams(net)
trainer.opts['validation'] = False
trainer.opts['test_emp'] = False

# Update rule
train_update_rule = trainer.InitUpdateRule(net)
net.InitTrainFunction(train_update_rule, input, expected_output, ['l3'])

main_loop = SGDRMainLoop(net)
main_loop.run(net, trainer)
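
# SGDRMainLoop above presumably implements SGDR (stochastic gradient descent
# with warm restarts, Loshchilov & Hutter 2017), cycling the learning rate
# between reset_opts['min_lr'] and reset_opts['max_lr']. A minimal sketch of
# that schedule under the usual cosine-annealing assumption; this is
# illustrative, not the actual SGDRMainLoop API:
def _sgdr_lr_sketch(min_lr, max_lr, t_cur, t_i):
    """Illustrative SGDR-style learning rate, t_cur epochs into a restart
    period of length t_i epochs."""
    return min_lr + 0.5 * (max_lr - min_lr) * (1 + np.cos(np.pi * float(t_cur) / t_i))
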
def train_Attend_224():
    trained_path = '../../data/trained_model/'
    cap_data_path = "../../data/mscoco/MSCOCO_processed/MSCOCO_224_capdata_train_%d.h5"
    img_data_path = "../../data/mscoco/MSCOCO_processed/MSCOCO_224_imgdata_train_%d.h5"
    val_cap_data_path = "../../data/mscoco/MSCOCO_processed/MSCOCO_224_capdata_val_%d.h5"
    val_img_data_path = "../../data/mscoco/MSCOCO_processed/MSCOCO_224_imgdata_val_%d.h5"
    fourth_cv_mv = "../../data/mscoco/MSCOCO_processed/4thconvo_meanvar.dat"
    [relu_mean, relu_std] = LoadList(fourth_cv_mv)
    relu_mean = theano.shared(relu_mean.astype(theano.config.floatX))
    relu_std = theano.shared(relu_std.astype(theano.config.floatX))

    # LSTM params
    n_word = 1004
    max_len = 40

    memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
    # print('Memory: %.2f avail before putting train data to shared' % (memory[0]/1024./1024/1024))

    # Create net
    net = ShowTellNet()
    net = LoadVGG_Attend(net)
    net.name = "ShowAttendTellCOCO_Re14e-5_deep_out_context_dim_512"
    # net.name = "ShowAttendTellBugFind"
    snapshot_list = glob.glob(trained_path + net.name + '*.dat')
    num_big_epoch = 5000
    big_batch_size = np.asarray([2000], dtype=theano.config.floatX)
    if (len(snapshot_list) == 0):
        # Trainer params
        trainer = Trainer()
        trainer.opts['batch_size'] = 20
        trainer.opts['save'] = False
        trainer.opts['save_freq'] = 2
        # trainer.opts['num_sample'] = num_sample
        # trainer.opts['num_val_sample'] = num_val_sample
        trainer.opts['validation'] = False
        trainer.opts['num_epoch'] = 1
        trainer.opts['dzdw_norm_thres'] = 1
        trainer.opts['dzdb_norm_thres'] = 0.01

        net.layer_opts['updatable'] = True

        # Learning rate params
        net.net_opts['l1_learning_rate'] = np.asarray(0.005, theano.config.floatX)
        net.reset_opts['min_lr'] = np.asarray(0.005, dtype=theano.config.floatX)
        net.reset_opts['max_lr'] = net.net_opts['l1_learning_rate']

        # Constructing the LSTM_ATTEND network from image_feature_region step by step:
        # step 1: reshape net.content['relu5_3'] to a (N, 196, 512) tensor - image_feature_region
        # step 2: use the (N, 196, 512) image_feature_region tensor as input to compute h0, c0 - the initial state/memory of LSTM_ATTEND
        # step 3: construct LSTM_ATTEND from h0, c0 (kwargs) and the (N, 196, 512) image_feature_region tensor
        # step 4: construct DeepOutLayer from the h_t, z_t outputs of the LSTM_ATTEND layer
        # step 5: use the DeepOutLayer output to compute the output vector (instead of h_t);
        #         the negative log likelihood is then calculated by the SoftmaxLogLoss layer
        # pdb.set_trace()
        feature_shape = net.content['relu5_3'].output.shape
        new_shape = (feature_shape[0], feature_shape[1], T.prod(feature_shape[2:]))
        # pdb.set_trace()
        # net.content['relu5_3_norm'] = NormLayer(net, net.content['relu5_3'], relu_mean, relu_std)
        net.content['4th_convol_feature_region'] = ReshapeLayer(
            net, net.content['relu5_3'], new_shape)

        # Adding dropout to VGG output
        net.content['4th_convol_feature_region'] = DropOut(
            net, net.content['4th_convol_feature_region'], 0.2)

        net.layer_opts['num_region'] = 196
        net.content['average_feature_region'] = AverageLayer(
            net, net.content['4th_convol_feature_region'], 2)
        # Done
        # avg_out = net.content['average_feature_region'].output.eval({net.input[0]:X.eval()})

        net.layer_opts['num_lstm_node'] = 512
        input_shape_h0 = (1, 512)
        output_shape_h0 = (1, net.layer_opts['num_lstm_node'])
        n_hidden_h0 = 512

        # GENERATING H0
        # net.content['h0_initial'] = MLPLayer(net, net.content['average_feature_region'],
        #     input_shape=input_shape_h0, output_shape=output_shape_h0, n_hidden=n_hidden_h0)
        net.layer_opts['num_fc_node'] = n_hidden_h0
        net.content['h0_hidden_layer'] = FCLayer(
            net, net.content['average_feature_region'], input_shape_h0, T.tanh)
        net.layer_opts['num_fc_node'] = output_shape_h0[1]
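        # The FC pair built here follows the initial-state scheme of Show, Attend
        # and Tell: h0 and c0 are each predicted by a small MLP fed the annotation
        # vectors averaged over the regions, h0 = f_init_h(mean_i a_i) and
        # c0 = f_init_c(mean_i a_i). A minimal numpy sketch under that reading
        # (the weights are illustrative stand-ins for the FCLayer parameters):
        def _init_state_sketch(a, W_hid, b_hid, W_out, b_out):
            """Sketch: initial LSTM state from the mean annotation vector a
            of shape (num_region, feat_dim)."""
            a_mean = a.mean(axis=0)                          # average over 196 regions
            hidden = np.tanh(np.dot(a_mean, W_hid) + b_hid)  # hidden FC with tanh
            return np.dot(hidden, W_out) + b_out             # output FC -> h0 (or c0)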
        hidden_shape = (input_shape_h0[1], n_hidden_h0)
        net.content['h0_initial'] = FCLayer(net, net.content['h0_hidden_layer'],
                                            hidden_shape)
        out_shape = net.content['h0_initial'].output.shape
        net.content['h0_initial'].output = net.content[
            'h0_initial'].output.reshape((-1, out_shape[0], out_shape[1]))
        # h0_init_out = net.content['h0_initial'].output.eval({net.input[0]: X.eval()})

        # GENERATING C0
        # net.content['c0_initial'] = MLPLayer(net, net.content['average_feature_region'],
        #     input_shape=input_shape_h0, output_shape=output_shape_h0, n_hidden=n_hidden_h0)
        net.layer_opts['num_fc_node'] = n_hidden_h0
        net.content['c0_hidden_layer'] = FCLayer(
            net, net.content['average_feature_region'], input_shape_h0, T.tanh)
        net.layer_opts['num_fc_node'] = output_shape_h0[1]
        net.content['c0_initial'] = FCLayer(net, net.content['c0_hidden_layer'],
                                            hidden_shape)
        out_shape = net.content['c0_initial'].output.shape
        net.content['c0_initial'].output = net.content[
            'c0_initial'].output.reshape((-1, out_shape[0], out_shape[1]))

        # Word Embedding Layer
        net.layer_opts['num_emb'] = 400
        net.content['we'] = WordEmbLayer(
            net, net.content['input_sen'],
            (trainer.opts['batch_size'], max_len - 1, n_word, 1))
        we_shape = net.content['we'].output.shape
        net.content['we'].output = net.content['we'].output.reshape(
            (we_shape[0], we_shape[1], we_shape[2], -1))  # -1: infer the trailing dim
        net.content['we_dropout'] = DropOut(net, net.content['we'], 0.1)

        net.layer_opts['num_lstm_node'] = 512
        # net.layer_opts['context_dim'] = 512
        net.layer_opts['num_dimension_feature'] = 512
        net.layer_opts['num_region'] = 196
        net.content['4th_convol_feature_region'].output = T.transpose(
            net.content['4th_convol_feature_region'].output, (0, 2, 1))
        # X = np.zeros((2,3,224,224), dtype=np.float32)
        # Y = np.zeros((2,max_len,n_word,1), dtype=np.float32)
        # im_f_feature = net.content['4th_convol_feature_region'].output.eval({
        #     net.input[0]: X
        # })
        # we_out = net.content['we'].output.eval({net.input[1]: Y})
        # pdb.set_trace()
        net.content['lstm_attend'] = LSTM_Attend(
            net, net.content['we_dropout'],
            (trainer.opts['batch_size'], max_len - 1, net.layer_opts['num_emb'], 1),
            net.content['4th_convol_feature_region'].output,
            initial_h0=net.content['h0_initial'].output,
            initial_c0=net.content['c0_initial'].output)
        # we_out=we_out, f_region=f_region)

        # The same size as the word embedding layer
        net.layer_opts['num_deep_out_node'] = 400
        net.layer_opts["n_word"] = n_word
        net.content['deep_out_layer'] = DeepOutputLayer(
            net, net.content['we_dropout'], net.content['lstm_attend'])

        net.layer_opts['num_affine_node'] = n_word
        net.layer_opts['l2_term'] = 0.000014
        net.content['l2'] = L2WeightDecay(net, net.content['deep_out_layer'])

        net.layer_opts['softmax_norm_dim'] = 2
        net.content['smloss'] = SoftmaxLogLoss(net, net.content['deep_out_layer'])
        net.content['cost'] = AggregateSumLoss(
            [net.content['l2'], net.content['smloss']])

        net.InitLR(0.2)
        memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
        print('Memory: %.2f avail before initialize params' %
              (memory[0] / 1024. / 1024 / 1024))
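        # Inside LSTM_Attend the context vector z_t is, per Show, Attend and
        # Tell's soft attention, a convex combination of the 196 annotation
        # vectors: raw scores e_ti = f_att(a_i, h_{t-1}) are softmax-normalized
        # into alpha_ti and z_t = sum_i alpha_ti * a_i. A minimal numpy sketch
        # of that step (f_att and the shapes are assumptions about the layer's
        # internals):
        def _soft_attention_sketch(a, scores):
            """Sketch: soft attention context over regions. a: (num_region,
            feat_dim) annotations, scores: (num_region,) raw scores e_ti."""
            scores = scores - scores.max()                 # numerical stability
            alpha = np.exp(scores) / np.exp(scores).sum()  # softmax over regions
            return np.dot(alpha, a)                        # z_t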
        trainer.InitParams(net)
        print("Done init params")
        train_update_rule = trainer.InitUpdateRule(net)
        print("Done init update rule")
        additional_output = ['deep_out_layer', 'l2']
        # net.InitValFunction([val_X, val_Y[:,:-1,:,:]], val_Y[:,1:,:,:],
        #                     additional_output, val_weight, net.content['lstm_attend'].output_z)
        e = 0
        last_big_e = 0
    else:
        snapshot_list = sorted(snapshot_list)
        print('Loading latest snapshot at %s' % snapshot_list[-1])
        e = 0
        [net, trainer, last_big_e] = LoadList(snapshot_list[-1])
        net.layer_opts['l2_term'] = 0.000014
        net.content['l2'] = L2WeightDecay(net, net.content['deep_out_layer'])
        net.content['cost'] = AggregateSumLoss(
            [net.content['l2'], net.content['smloss']])
        net.InitLR(0.2)
        memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
        print('Memory: %.2f avail before initialize params' %
              (memory[0] / 1024. / 1024 / 1024))
        trainer.InitParams(net)
        print("Done init params")
        train_update_rule = trainer.InitUpdateRule(net)
        print("Done init update rule")
        additional_output = ['deep_out_layer', 'l2']

    for big_e in range(last_big_e + 1, num_big_epoch):
        # Load train data; list(...) so the h5 file order can be shuffled in place
        h_list = list(range(11))
        np.random.shuffle(h_list)
        for h in h_list:
            # break
            # if (not ('train_X' in locals())):
            train_X = LoadH5(img_data_path % h)
            dict_key = list(train_X.keys())[0]
            train_X = train_X[dict_key]
            num_sample = train_X.shape[0]

            # train_Y has the shape of (num_sample, 5, max_len, n_word, 1)
            train_Y = LoadH5(cap_data_path % h)
            dict_key = list(train_Y.keys())[0]
            train_Y = train_Y[dict_key]
            Y_shape = train_Y.shape

            # For debugging
            # train_X = train_X[0:100,:,:,:]
            # train_Y = train_Y[0:100,:,:,:,:]
            # num_sample = 100

            # train_Y = train_Y.reshape(5*num_sample, Y_shape[2], Y_shape[3], 1)
            # random_caption_idx = net.net_opts['rng'].randint(0,5,num_sample) + np.asarray([i*5 for i in range(num_sample)])
            # Each image has 5 captions; pick one at random
            # train_Y = train_Y[random_caption_idx, :, :, :]
            train_Y = train_Y[:, 0, :, :, :]
            train_Y = train_Y.astype(theano.config.floatX)

            # Create loss weights from train_Y: padding positions (word index 0) get weight 0
            train_weight = np.copy(train_Y)
            train_weight = train_weight[:, 1:, :, :]
            weight_shape = train_weight.shape
            train_weight = (train_weight[:, :, 0, 0] == 0).reshape(
                weight_shape[0], weight_shape[1], 1, 1)
            train_weight = np.repeat(train_weight, weight_shape[2], 2)
            train_weight = np.repeat(train_weight, weight_shape[3], 3)
            train_weight = train_weight.astype(theano.config.floatX)

            num_big_batch_iteration = np.ceil(
                np.asarray(num_sample, dtype=theano.config.floatX) / big_batch_size)
            # range() needs an int; np.ceil returns a float
            for j in range(0, int(num_big_batch_iteration)):
                big_batch_range = np.arange(j * big_batch_size,
                                            (j + 1) * big_batch_size)
                if ((j + 1) * big_batch_size > num_sample):
                    big_batch_range = np.arange(j * big_batch_size, num_sample)
                trainer.opts['num_sample'] = big_batch_range.shape[0]
                big_batch_range = np.asarray(big_batch_range, dtype=np.uint32)
                np.random.shuffle(big_batch_range)
                memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
                print('Memory: %.2f avail before putting train data to shared' %
                      (memory[0] / 1024. / 1024 / 1024))
                train_Xj = theano.shared(train_X[big_batch_range, :, :, :])
                train_Yj = theano.shared(train_Y[big_batch_range, :, :, :])

                hash_weight = np.asarray([1.3**t for t in range(max_len)])
                hash_value = np.sum(
                    np.argmax(train_Yj[0, :, :, 0].eval(), axis=1) * hash_weight)
                print(hash_value)
                # pdb.set_trace()
                train_weightj = theano.shared(
                    train_weight[big_batch_range, :, :, :])
                memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
                print('Memory: %.2f avail after' %
                      (memory[0] / 1024. / 1024 / 1024))
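                # The hash_value printed above is a cheap fingerprint of the first
                # staged caption: argmax over the one-hot axis recovers the word
                # indices, and weighting position t by 1.3**t makes the sum
                # order-sensitive, so equal fingerprints across runs indicate the
                # same data and shuffle order.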
                # val_Xtest = train_Xj.eval()[0:2,:,:,:]
                # val_Ytest = train_Yj.eval()[0:2,:-1,:,:]
                # z_m1_dummy = np.zeros((1, 2, net.content['lstm_attend'].Z_shape[0]), dtype=theano.config.floatX)
                # pdb.set_trace()
                # relu5_3norm = net.content['relu5_3_norm'].output.eval({net.input[0]: val_Xtest})
                # relu5_3 = net.content['relu5_3'].output.eval({net.input[0]: val_Xtest})
                # h_out = net.content['lstm_attend'].output.eval({
                #     net.input[0]: val_Xtest,
                #     net.input[1]: val_Ytest,
                #     # net.content['lstm_attend'].z_m1_sym
                # })
                # z_out = net.content['lstm_attend'].output_z.eval({
                #     net.input[0]: val_Xtest,
                #     net.input[1]: val_Ytest,
                #     # net.content['lstm_attend'].z_m1_sym
                # })
                # c_out = net.content['lstm_attend'].output_c.eval({
                #     net.input[0]: val_Xtest,
                #     net.input[1]: val_Ytest,
                #     # net.content['lstm_attend'].z_m1_sym
                # })
                # deep_out0 = net.content['deep_out_layer'].output.eval({
                #     net.input[0]: val_Xtest,
                #     net.input[1]: val_Ytest,
                #     net.content['lstm_attend'].z_m1_sym: z_m1_dummy
                # })
                # fourth_cv_out = net.content['4th_convol_feature_region'].output.eval({
                #     net.input[0]: val_Xtest,
                # })
                # avg_feature = net.content['average_feature_region'].output.eval({
                #     net.input[0]: val_Xtest,
                # })
                #
                # h0_init = net.content['h0_initial'].output.eval({
                #     net.input[0]: val_Xtest
                # })
                # img_out = net.content['lstm_attend'].img_out.eval({
                #     net.input[0]: val_Xtest,
                # })
                # pdb.set_trace()
                net.InitTrainFunction(train_update_rule,
                                      [train_Xj, train_Yj[:, :-1, :, :]],
                                      train_Yj[:, 1:, :, :],
                                      additional_output, train_weightj)
                print("Done init train function")
                print("start training")
                trainer.opts['validation'] = False
                trainer.opts['train'] = True
                main_loop = SGDRMainLoop(net, trained_path)
                main_loop.run(net, trainer, e)

                # Free the staged chunk and the compiled function before the next one
                del train_Xj
                del train_Yj
                del train_weightj
                del net.train_function
                train_Xj = None
                train_Yj = None
                train_weightj = None
                net.train_function = None
                print('Finished iteration %d, h5 %d, of big epoch %d' %
                      (j, h, big_e))

        # Plot the recent and full training loss curves
        plt.figure()
        plt.plot(trainer.all_i[-1000::5])
        plt.savefig('SAT14e-5_all_i_last1000.png')
        plt.close()
        plt.figure()
        plt.plot(trainer.all_i)
        plt.savefig('SAT14e-5_all_i.png')
        plt.close()

        if (big_e % trainer.opts['save_freq'] == 0):
            net1 = net.NNCopy()
            SaveList([net1, trainer, big_e],
                     '../../data/trained_model/%s_e-%05d.dat' % (net.name, big_e))

        # Validation frequency is the same as the save frequency
        if (big_e % trainer.opts['save_freq'] == 0):
            for h in range(2):  # Max is 6
                val_X = LoadH5(val_img_data_path % h)
                dict_key = list(val_X.keys())[0]
                val_X = val_X[dict_key]
                num_val_sample = val_X.shape[0]

                # val_Y has the shape of (num_val_sample, 5, max_len, n_word, 1)
                val_Y = LoadH5(val_cap_data_path % h)
                dict_key = list(val_Y.keys())[0]
                val_Y = val_Y[dict_key]
                Y_shape = val_Y.shape
                val_Y = val_Y.reshape(5 * num_val_sample, Y_shape[2], Y_shape[3], 1)
                # Each image has 5 captions; pick one at random
                random_caption_idx = net.net_opts['rng'].randint(
                    0, 5, num_val_sample) + np.asarray(
                        [i * 5 for i in range(num_val_sample)])
                val_Y = val_Y[random_caption_idx, :, :, :]
                val_Y = val_Y.astype(theano.config.floatX)

                # Create loss weights from val_Y
                val_weight = np.copy(val_Y)
                val_weight = val_weight[:, 1:, :, :]
                weight_shape = val_weight.shape
                val_weight = (val_weight[:, :, 0, 0] == 0).reshape(
                    weight_shape[0], weight_shape[1], 1, 1)
                val_weight = np.repeat(val_weight, weight_shape[2], 2)
                val_weight = np.repeat(val_weight, weight_shape[3], 3)
                val_weight = val_weight.astype(theano.config.floatX)

                num_big_batch_iteration = np.ceil(
                    np.asarray(num_val_sample, dtype=theano.config.floatX) /
                    big_batch_size)
                for j in range(0, int(num_big_batch_iteration)):
                    big_batch_range = np.arange(j * big_batch_size,
                                                (j + 1) * big_batch_size)
                    if ((j + 1) * big_batch_size > num_val_sample):
                        big_batch_range = np.arange(j * big_batch_size,
                                                    num_val_sample)
                    trainer.opts['num_val_sample'] = big_batch_range.shape[0]
                    big_batch_range = np.asarray(big_batch_range, dtype=np.uint32)
                    memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
                    print('Memory: %.2f avail before putting val data to shared' %
                          (memory[0] / 1024. / 1024 / 1024))
                    val_Xj = theano.shared(val_X[big_batch_range, :, :, :])
                    val_Yj = theano.shared(val_Y[big_batch_range, :, :, :])

                    hash_weight = np.asarray([1.3**t for t in range(max_len)])
                    hash_value = np.sum(
                        np.argmax(val_Yj[0, :, :, 0].eval(), axis=1) * hash_weight)
                    print(hash_value)
                    val_weightj = theano.shared(
                        val_weight[big_batch_range, :, :, :])
                    memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
                    print('Memory: %.2f avail after' %
                          (memory[0] / 1024. / 1024 / 1024))

                    net.InitValFunction([val_Xj, val_Yj[:, :-1, :, :]],
                                        val_Yj[:, 1:, :, :],
                                        additional_output, val_weightj)
                    print("Done init val function")
                    print("start validating")
                    trainer.opts['validation'] = True
                    trainer.opts['train'] = False
                    main_loop = SGDRMainLoop(net, trained_path)
                    main_loop.run(net, trainer, e)

                    # Free the staged validation chunk and the compiled function
                    del val_Xj
                    del val_Yj
                    del val_weightj
                    del net.val_function
                    val_Xj = None
                    val_Yj = None
                    val_weightj = None
                    net.val_function = None
                    print('Finished validating at iteration %d, h5 %d, of big epoch %d' %
                          (j, h, big_e))
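
# Both loops above feed captions teacher-forced: Y[:, :-1] is the input
# sentence, Y[:, 1:] is the next-word target, and the weight tensor zeroes the
# loss at padding steps. A numpy sketch of that preparation, assuming (as the
# train_weight construction above does) that word index 0 marks padding:
def _caption_batch_sketch(Y):
    """Sketch: build teacher-forcing inputs/targets and padding loss-weights
    from one-hot captions Y of shape (num_sample, max_len, n_word, 1)."""
    inputs, targets = Y[:, :-1], Y[:, 1:]      # predict word t+1 from words <= t
    mask = (targets[:, :, 0, 0] == 0)          # True where the target is a real word
    mask = mask[:, :, None, None].astype(Y.dtype)
    mask = np.repeat(np.repeat(mask, Y.shape[2], 2), Y.shape[3], 3)
    return inputs, targets, mask
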
# NOTE: this second train_Attend_224 (the Flickr variant) shadows the MSCOCO
# version defined above; only the definition below is visible at call time.
def train_Attend_224():
    trained_path = '../../data/trained_model/'

    # LSTM params
    n_word = 2000
    max_len = 40

    train_X, train_Y, train_weight, val_X, val_Y, val_weight = CreateDataFlick224(
        n_word)
    pdb.set_trace()

    # Create net
    net = ShowTellNet()
    net.name = "ShowAttendTell"
    snapshot_list = glob.glob(trained_path + net.name + '*.dat')

    X = train_X[0:2, :, :, :]
    Y = train_Y[0:2, :, :, :]
    input_Y = train_Y[:, :-1, :, :]
    expected_Y = train_Y[:, 1:, :, :]
    weight = train_weight[0:2, :, :, :]

    num_sample = 6000
    num_big_epoch = 100
    big_batch_size = np.asarray([2000], dtype=theano.config.floatX)
    num_big_batch_iteration = np.ceil(
        np.asarray(num_sample, dtype=theano.config.floatX) / big_batch_size)

    if (len(snapshot_list) == 0):
        # Trainer params
        trainer = Trainer()
        trainer.opts['batch_size'] = 20
        trainer.opts['save'] = False
        trainer.opts['save_freq'] = 20
        trainer.opts['num_sample'] = 2000
        trainer.opts['num_val_sample'] = 1000
        trainer.opts['validation'] = False
        trainer.opts['num_epoch'] = 1
        trainer.opts['dzdw_norm_thres'] = 1
        trainer.opts['dzdb_norm_thres'] = 0.01

        net = LoadVGG_Attend(net)
        net.layer_opts['updatable'] = True

        # Learning rate params
        net.net_opts['l1_learning_rate'] = np.asarray(0.005, theano.config.floatX)
        net.reset_opts['min_lr'] = np.asarray(0.005, dtype=theano.config.floatX)
        net.reset_opts['max_lr'] = net.net_opts['l1_learning_rate']

        # Constructing the LSTM_ATTEND network from image_feature_region step by step:
        # step 1: reshape net.content['relu5_3'] to a (N, 196, 512) tensor - image_feature_region
        # step 2: use the (N, 196, 512) image_feature_region tensor as input to compute h0, c0 - the initial state/memory of LSTM_ATTEND
        # step 3: construct LSTM_ATTEND from h0, c0 (kwargs) and the (N, 196, 512) image_feature_region tensor
        # step 4: construct DeepOutLayer from the h_t, z_t outputs of the LSTM_ATTEND layer
        # step 5: use the DeepOutLayer output to compute the output vector (instead of h_t);
        #         the negative log likelihood is then calculated by the SoftmaxLogLoss layer
        feature_shape = net.content['relu5_3'].output.shape
        new_shape = (feature_shape[0], feature_shape[1], T.prod(feature_shape[2:]))
        net.content['4th_convol_feature_region'] = ReshapeLayer(
            net, net.content['relu5_3'], new_shape)
        # net.content['pool4'].output.reshape()
        # Done
        # pdb.set_trace()
        # convol_out = net.content['4th_convol_feature_region'].output.eval({net.input[0]: X.eval()})
        # pdb.set_trace()
        net.layer_opts['num_region'] = 196
        # pdb.set_trace()
        net.content['average_feature_region'] = AverageLayer(
            net, net.content['4th_convol_feature_region'], 2)
        # Done
        # avg_out = net.content['average_feature_region'].output.eval({net.input[0]:X.eval()})

        net.layer_opts['num_lstm_node'] = 512
        input_shape_h0 = (1, 512)
        output_shape_h0 = (1, net.layer_opts['num_lstm_node'])
        n_hidden_h0 = 512

        # GENERATING H0
        # net.content['h0_initial'] = MLPLayer(net, net.content['average_feature_region'],
        #     input_shape=input_shape_h0, output_shape=output_shape_h0, n_hidden=n_hidden_h0)
        net.layer_opts['num_fc_node'] = n_hidden_h0
        net.content['h0_hidden_layer'] = FCLayer(
            net, net.content['average_feature_region'], input_shape_h0, T.tanh)
        net.layer_opts['num_fc_node'] = output_shape_h0[1]
        hidden_shape = (input_shape_h0[1], n_hidden_h0)
        net.content['h0_initial'] = FCLayer(net, net.content['h0_hidden_layer'],
                                            hidden_shape)
        out_shape = net.content['h0_initial'].output.shape
        net.content['h0_initial'].output = net.content[
            'h0_initial'].output.reshape((-1, out_shape[0], out_shape[1]))
        # pdb.set_trace()
        # h0_init_out = net.content['h0_initial'].output.eval({net.input[0]: X.eval()})
        # pdb.set_trace()

        # GENERATING C0
        # net.content['c0_initial'] = MLPLayer(net, net.content['average_feature_region'],
        #     input_shape=input_shape_h0, output_shape=output_shape_h0, n_hidden=n_hidden_h0)
        net.layer_opts['num_fc_node'] = n_hidden_h0
        net.content['c0_hidden_layer'] = FCLayer(
            net, net.content['average_feature_region'], input_shape_h0, T.tanh)
        net.layer_opts['num_fc_node'] = output_shape_h0[1]
        net.content['c0_initial'] = FCLayer(net, net.content['c0_hidden_layer'],
                                            hidden_shape)
        out_shape = net.content['c0_initial'].output.shape
        net.content['c0_initial'].output = net.content[
            'c0_initial'].output.reshape((-1, out_shape[0], out_shape[1]))

        # Word Embedding Layer
        net.layer_opts['num_emb'] = 512
        net.content['we'] = WordEmbLayer(
            net, net.content['input_sen'],
            (trainer.opts['batch_size'], max_len - 1, n_word, 1))
        # pdb.set_trace()
        # we_out = net.content['we'].output.eval({net.input[1]: Y.eval()})
        # pdb.set_trace()

        net.layer_opts['num_lstm_node'] = 512
        # net.layer_opts['context_dim'] = 1024
        net.layer_opts['num_dimension_feature'] = 512
        net.layer_opts['num_region'] = 196
        net.content['4th_convol_feature_region'].output = T.transpose(
            net.content['4th_convol_feature_region'].output, (0, 2, 1))
        net.content['lstm_attend'] = LSTM_Attend(
            net, net.content['we'],
            (trainer.opts['batch_size'], max_len - 1, net.layer_opts['num_emb'], 1),
            net.content['4th_convol_feature_region'].output,
            initial_h0=net.content['h0_initial'].output,
            initial_c0=net.content['c0_initial'].output)
        # pdb.set_trace()
        # lstm_out = net.content['lstm_attend'].output.eval({net.input[0]: X.eval(),
        #     net.input[1]: Y.eval(),
        #     net.content['lstm_attend'].z_m1_sym: np.zeros((1, 2, net.layer_opts['num_dimension_feature']), dtype=theano.config.floatX)})
        # print(lstm_out[0].shape)
        # print(lstm_out[1].shape)
        # print(lstm_out[2].shape)
        # pdb.set_trace()

        net.layer_opts['num_deep_out_node'] = 512  # 300
        net.layer_opts["n_word"] = n_word
        net.content['deep_out_layer'] = DeepOutputLayer(
            net, net.content['we'], net.content['lstm_attend'])
        # net.layer_opts['num_affine_node'] = n_word
        # net.content['deep_out_layer'] = AffineLayer(net, net.content['lstm_attend'],
        #     (trainer.opts['batch_size'],
        #      max_len - 1,
        #      net.layer_opts['num_lstm_node'],
        #      1))
        # pdb.set_trace()
        # deep_out = net.content['deep_out_layer'].output.eval({net.input[0]: X.eval(),
        #     net.input[1]: Y.eval(),
        #     net.content['lstm_attend'].z_m1_sym: np.zeros((1, 2, net.layer_opts['num_dimension_feature']), dtype=theano.config.floatX)})

        net.layer_opts['l2_term'] = 0.125
        net.content['l2'] = L2WeightDecay(net, net.content['deep_out_layer'])
        net.layer_opts['softmax_norm_dim'] = 2
        net.content['smloss'] = SoftmaxLogLoss(net, net.content['deep_out_layer'])
        net.content['cost'] = AggregateSumLoss(
            [net.content['l2'], net.content['smloss']])
        # pdb.set_trace()
        # print(X.eval().shape)
        # print(Y.eval().shape)
        # print(weight.eval().shape)
        # logloss_out = net.content['cost'].output.eval({net.input[0]: X.eval(),
        #     net.input[1]: input_Y.eval(),
        #     net.output[0]: expected_Y.eval(),
        #     net.weight[0]: weight.eval(),
        #     net.content['lstm_attend'].z_m1_sym: np.zeros((1, 2, net.layer_opts['num_dimension_feature']), dtype=theano.config.floatX)})
        # print("Done creating layer")
        # pdb.set_trace()

        net.InitLR(0.2)
        trainer.InitParams(net)
        print("Done init params")
        train_update_rule = trainer.InitUpdateRule(net)
        print("Done init update rule")
        additional_output = ['input_sen', 'deep_out_layer', 'we', 'lstm_attend']
        # net.InitValFunction([val_X, val_Y[:,:-1,:,:]], val_Y[:,1:,:,:],
        #                     additional_output, val_weight, net.content['lstm_attend'].output_z)
        e = 0
        last_big_e = 0
    else:
        snapshot_list = sorted(snapshot_list)
        print('Loading latest snapshot at %s' % snapshot_list[-1])
        # Load the snapshot so the loop below has net, trainer and last_big_e,
        # as in the MSCOCO variant above
        [net, trainer, last_big_e] = LoadList(snapshot_list[-1])
        e = 0
        trainer.InitParams(net)
        train_update_rule = trainer.InitUpdateRule(net)
        additional_output = ['input_sen', 'deep_out_layer', 'we', 'lstm_attend']

    for big_e in range(last_big_e, num_big_epoch):
        for j in range(0, int(num_big_batch_iteration)):
            big_batch_range = np.arange(j * big_batch_size,
                                        (j + 1) * big_batch_size)
            if ((j + 1) * big_batch_size > num_sample):
                big_batch_range = np.arange(j * big_batch_size, num_sample)
            trainer.opts['num_sample'] = big_batch_range.shape[0]
            big_batch_range = np.asarray(big_batch_range, dtype=np.uint32)
            memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
            print('Memory: %.2f avail before putting train data to shared' %
                  (memory[0] / 1024. / 1024 / 1024))
            train_Xj = theano.shared(train_X[big_batch_range, :, :, :])
            train_Yj = theano.shared(train_Y[big_batch_range, :, :, :])
            train_weightj = theano.shared(train_weight[big_batch_range, :, :, :])
            memory = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray.mem_info()
            print('Memory: %.2f avail after' % (memory[0] / 1024. / 1024 / 1024))

            net.InitTrainFunction(train_update_rule,
                                  [train_Xj, train_Yj[:, :-1, :, :]],
                                  train_Yj[:, 1:, :, :],
                                  additional_output, train_weightj,
                                  net.weight[0])
            print("Done init train function")
            # net.InitValFunction([val_X, val_Y[:,:-1,:,:]], val_Y[:,1:,:,:], additional_output, val_weight)
            # print("Done init val function")
            print("start training")
            trainer.opts['validation'] = False
            trainer.opts['train'] = True
            main_loop = SGDRMainLoop(net, trained_path)
            main_loop.run(net, trainer, e)

            # Release the staged chunk and the compiled function
            train_Xj = None
            train_Yj = None
            train_weightj = None
            net.train_function = None
            print('Finished iteration %d of big epoch %d' % (j, big_e))
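
# The 'big batch' pattern above stages one chunk of the dataset at a time into
# theano.shared so it lives on the GPU, runs the compiled train function over
# it, then drops the shared variables to free device memory. A hedged sketch of
# that cycle (stage_and_train is an illustrative stand-in for the
# InitTrainFunction + SGDRMainLoop.run sequence, not an actual API):
def _big_batch_staging_sketch(train_X, big_batch_size, stage_and_train):
    """Sketch: chunked GPU staging with theano.shared."""
    num_sample = train_X.shape[0]
    num_chunk = int(np.ceil(float(num_sample) / big_batch_size))
    for j in range(num_chunk):
        idx = np.arange(j * big_batch_size,
                        min((j + 1) * big_batch_size, num_sample),
                        dtype=np.uint32)
        Xj = theano.shared(train_X[idx])  # copy this chunk to the device
        stage_and_train(Xj)               # compile and run on the staged chunk
        Xj = None                         # release the chunk before the next one
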
def train():
    trained_path = '../../data/trained_model/'

    # LSTM params
    n_word = 2000
    max_len = 40

    # Create net
    net = ShowTellNet()
    net.name = 'ShowTellCheck'
    # net.name = 'abc'

    # Find latest snapshot
    snapshot_list = glob.glob(trained_path + net.name + '*.dat')
    if (len(snapshot_list) == 0):
        train_X, train_Y, train_weight, val_X, val_Y, val_weight = CreateData(
            n_word)
        # train_X = theano.shared(train_X.eval()[0:200,:,:,:])
        # train_Y = theano.shared(train_Y.eval()[0:200,:,:,:])

        # Trainer params
        trainer = Trainer()
        trainer.opts['batch_size'] = 32
        trainer.opts['save'] = False
        trainer.opts['save_freq'] = 20
        trainer.opts['num_sample'] = 200
        trainer.opts['num_val_sample'] = 1000
        trainer.opts['validation'] = False
        trainer.opts['num_epoch'] = 10000
        trainer.opts['dzdw_norm_thres'] = 1
        trainer.opts['dzdb_norm_thres'] = 0.01

        # Load VGG
        net = LoadVGG(net)
        net.layer_opts['updatable'] = True

        # Learning rate params
        net.net_opts['l1_learning_rate'] = np.asarray(0.005, theano.config.floatX)
        net.reset_opts['min_lr'] = np.asarray(0.005, dtype=theano.config.floatX)
        net.reset_opts['max_lr'] = net.net_opts['l1_learning_rate']

        # Construct the network
        net.layer_opts['num_fc_node'] = 512
        # net.layer_opts['num_fc_node'] = 128
        # net.content['fc6'] = FCLayer(net, net.content['pool5'], (1, 512, 2, 2))
        net.content['fc6'] = FCLayer(net, net.content['pool5'], (1, 512, 4, 4))
        net.content['fc6_swap'] = SwapDim(net, net.content['fc6'], 1, 2)

        net.layer_opts['num_emb'] = 512
        # net.layer_opts['num_emb'] = 128
        net.content['we'] = WordEmbLayer(
            net, net.content['input_sen'],
            (trainer.opts['batch_size'], max_len - 1, n_word, 1))
        net.content['cat'] = Concat(net, net.content['fc6_swap'],
                                    net.content['we'], 1)

        net.layer_opts['num_lstm_node'] = n_word
        net.content['lstm'] = LSTM(net, net.content['cat'],
                                   (trainer.opts['batch_size'], max_len - 1,
                                    net.layer_opts['num_emb'], 1))

        ################
        # TESTING LSTM #
        ################
        # h_dummy = np.zeros((1, 1, net.layer_opts['num_lstm_node']), dtype=theano.config.floatX)
        # c_dummy = np.zeros((1, 1, net.layer_opts['num_lstm_node']), dtype=theano.config.floatX)
        # h_dummy2 = np.zeros((1, 2, net.layer_opts['num_lstm_node']), dtype=theano.config.floatX)
        # c_dummy2 = np.zeros((1, 2, net.layer_opts['num_lstm_node']), dtype=theano.config.floatX)
        h_dummy5 = np.zeros((1, 5, net.layer_opts['num_lstm_node']),
                            dtype=theano.config.floatX)
        c_dummy5 = np.zeros((1, 5, net.layer_opts['num_lstm_node']),
                            dtype=theano.config.floatX)
        # cat = net.content['cat'].output.eval({net.input[0]:X , net.input[1]: Y})
        # cat = np.reshape(cat, (2, 41, 128))
        # cat0 = np.reshape(cat[1,0,:], (1,1,128))
        # cat1 = np.reshape(cat[1,1,:], (1,1,128))
        # cat2 = np.reshape(cat[1,2,:], (1,1,128))
        #
        # x0 = cat[0,0,:].reshape(1,1,128)
        # x1 = cat[0,1,:].reshape(1,1,128)
        # x2 = cat[0,2,:].reshape(1,1,128)
        # Wi = net.content['lstm'].W['i'].eval()
        # Wf = net.content['lstm'].W['f'].eval()
        # Wc = net.content['lstm'].W['c'].eval()
        # Wo = net.content['lstm'].W['o'].eval()
        #
        # Ui = net.content['lstm'].U['i'].eval()
        # Uf = net.content['lstm'].U['f'].eval()
        # Uc = net.content['lstm'].U['c'].eval()
        # Uo = net.content['lstm'].U['o'].eval()
        #
        # bi = net.content['lstm'].b['i'].eval()
        # bf = net.content['lstm'].b['f'].eval()
        # bc = net.content['lstm'].b['c'].eval()
        # bo = net.content['lstm'].b['o'].eval()
        # hm1 = h_dummy
        # cm1 = c_dummy
        #
        # # First iteration
        # i0 = npsigmoid(np.dot(x0, Wi) + np.dot(hm1, Ui) + bi)
        # f0 = npsigmoid(np.dot(x0, Wf) + np.dot(hm1, Uf) + bf)
        # o0 = npsigmoid(np.dot(x0, Wo) + np.dot(hm1, Uo) + bo)
        # c0 = f0*cm1 + i0*np.tanh(np.dot(x0, Wc) + np.dot(hm1, Uc) + bc)
        # h0 = o0*c0
        #
        # # 2nd iteration
        # i1 = npsigmoid(np.dot(x1, Wi) + np.dot(h0, Ui) + bi)
        # f1 = npsigmoid(np.dot(x1, Wf) + np.dot(h0, Uf) + bf)
        # o1 = npsigmoid(np.dot(x1, Wo) + np.dot(h0, Uo) + bo)
        # c1 = f1 * c0 + i1 * np.tanh(np.dot(x1, Wc) + np.dot(h0, Uc) + bc)
        # h1 = o1 * c1
        #
        # i2 = npsigmoid(np.dot(x2, Wi) + np.dot(h1, Ui) + bi)
        # f2 = npsigmoid(np.dot(x2, Wf) + np.dot(h1, Uf) + bf)
        # o2 = npsigmoid(np.dot(x2, Wo) + np.dot(h1, Uo) + bo)
        # c2 = f2 * c1 + i2 * np.tanh(np.dot(x2, Wc) + np.dot(h1, Uc) + bc)
        # h3 = o2 * c2
        # bp = 1
        #
        # h1, c1 = onestep(cat0, h_dummy, c_dummy, net.content['lstm'].W['i'], net.content['lstm'].W['f'],
        #                  net.content['lstm'].W['c'], net.content['lstm'].W['o'],
        #                  net.content['lstm'].U['i'], net.content['lstm'].U['f'], net.content['lstm'].U['c'],
        #                  net.content['lstm'].U['o'],
        #                  net.content['lstm'].b['i'], net.content['lstm'].b['f'], net.content['lstm'].b['c'],
        #                  net.content['lstm'].b['o'])
        #
        # h1 = h1.eval()
        # c1 = c1.eval()
        #
        # h2, c2 = onestep(cat1, h1, c1, net.content['lstm'].W['i'], net.content['lstm'].W['f'],
        #                  net.content['lstm'].W['c'], net.content['lstm'].W['o'],
        #                  net.content['lstm'].U['i'], net.content['lstm'].U['f'], net.content['lstm'].U['c'],
        #                  net.content['lstm'].U['o'],
        #                  net.content['lstm'].b['i'], net.content['lstm'].b['f'], net.content['lstm'].b['c'],
        #                  net.content['lstm'].b['o'])
        #
        # h2 = h2.eval()
        # c2 = c2.eval()
        #
        # h3, c3 = onestep(cat2, h2, c2, net.content['lstm'].W['i'], net.content['lstm'].W['f'],
        #                  net.content['lstm'].W['c'], net.content['lstm'].W['o'],
        #                  net.content['lstm'].U['i'], net.content['lstm'].U['f'], net.content['lstm'].U['c'],
        #                  net.content['lstm'].U['o'],
        #                  net.content['lstm'].b['i'], net.content['lstm'].b['f'], net.content['lstm'].b['c'],
        #                  net.content['lstm'].b['o'])
        #
        # h3 = h3.eval()
        # c3 = c3.eval()
        #
        # lstm = net.content['lstm'].output.eval({net.input[0]:X, net.input[1]:Y,
        #     net.content['lstm'].h_m1_sym: h_dummy2,
        #     net.content['lstm'].c_m1_sym: c_dummy2})

        # Remove the first 'word' because it is just the concatenated image prior
        # knowledge and has nothing to do with the actual sentence
        net.content['lstm_r'] = LSTMRemove(net, net.content['lstm'], 1)
        # a = net.content['lstm_r'].output.eval({net.input[1]: train_Y[0:5,0:-1,:,:].eval(),
        #     net.input[0]: train_X[0:5,:,:,:].eval(),
        #     net.content['lstm'].h_m1_sym: h_dummy5,
        #     net.content['lstm'].c_m1_sym: c_dummy5
        # })
        # print('lstm_r shape:')
        # print(a.shape)

        net.layer_opts['softmax_norm_dim'] = 2
        net.content['softmax'] = SoftmaxLayer(net, net.content['lstm_r'])
        net.content['cost'] = CategoricalCrossEntropy(net, net.content['softmax'])

        net.InitLR(0.2)
        trainer.InitParams(net)
        train_update_rule = trainer.InitUpdateRule(net)
        additional_output = ['input_sen', 'lstm_r', 'softmax']
        net.InitTrainFunction(train_update_rule,
                              [train_X, train_Y[:, :-1, :, :]],
                              train_Y[:, 1:, :, :],
                              additional_output, train_weight)
        net.InitValFunction([val_X, val_Y[:, :-1, :, :]], val_Y[:, 1:, :, :],
                            additional_output, val_weight)
        e = 0
    else:
        snapshot_list = sorted(snapshot_list)
        print('Loading latest snapshot at %s' % snapshot_list[-1])
        net, trainer, e = LoadList(snapshot_list[-1])
        trainer.opts['save_freq'] = 10
        print('Finished loading snapshot')
        train_X, train_Y, train_weight, val_X, val_Y, val_weight = CreateData(
            n_word)

        net.net_opts['l1_learning_rate'] = np.asarray(0.00008, theano.config.floatX)
        net.reset_opts['min_lr'] = np.asarray(0.00008, dtype=theano.config.floatX)
        net.reset_opts['max_lr'] = net.net_opts['l1_learning_rate']
        net.InitLR(1000)
        trainer.InitParams(net)
        train_update_rule = trainer.InitUpdateRule(net)
        additional_output = ['input_sen', 'lstm_r', 'softmax']
        net.InitTrainFunction(train_update_rule,
                              [train_X, train_Y[:, :-1, :, :]],
                              train_Y[:, 1:, :, :],
                              additional_output, train_weight)
        net.InitValFunction([val_X, val_Y[:, :-1, :, :]], val_Y[:, 1:, :, :],
                            additional_output, val_weight)

    main_loop = SGDRMainLoop(net, trained_path)
    main_loop.run(net, trainer, e)
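
# The commented-out blocks above hand-check the LSTM recurrence one gate at a
# time. For reference, a runnable numpy sketch of a single step; note the
# standard cell emits h = o * tanh(c), while the manual checks above use
# h = o * c, matching an implementation without the output tanh:
def _lstm_step_sketch(x, h_prev, c_prev, W, U, b):
    """Sketch: one LSTM step. W, U, b are dicts keyed 'i', 'f', 'c', 'o',
    mirroring the parameter layout of the LSTM layer used above."""
    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))
    i = sigmoid(np.dot(x, W['i']) + np.dot(h_prev, U['i']) + b['i'])  # input gate
    f = sigmoid(np.dot(x, W['f']) + np.dot(h_prev, U['f']) + b['f'])  # forget gate
    o = sigmoid(np.dot(x, W['o']) + np.dot(h_prev, U['o']) + b['o'])  # output gate
    c = f * c_prev + i * np.tanh(np.dot(x, W['c']) + np.dot(h_prev, U['c']) + b['c'])
    h = o * np.tanh(c)
    return h, c
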
# NOTE: this train() (the synthetic lstm_test experiment) shadows the ShowTell
# version defined above; only the definition below is visible at call time.
def train():
    # theano.config.optimizer = 'fast_compile'
    trainer = Trainer()

    # Setting training params
    trainer.opts['batch_size'] = 100
    trainer.opts['save'] = True
    trainer.opts['save_freq'] = 100
    trainer.opts['num_sample'] = 300000
    trainer.opts['num_epoch'] = 5000
    trainer.opts['train_sentence_length'] = 11
    trainer.opts['test_setence_length'] = 15
    trainer.opts['num_val_sample'] = 1
    trainer.opts['num_test_sample'] = 1

    # Generate data
    num_class = 16
    np.random.seed(13111991)
    x_dim = 32
    train_X, valid_X, test_X, train_Y, valid_Y, test_Y = CreateData(
        x_dim, num_class, trainer)

    # Create a CNN for debugging by fixing a set of real input
    # net = ConvNeuralNet(train_X[1:16,:,:,:].eval())
    # Create a CNN
    net = ShowTellNet()
    net.name = 'lstm_test'
    trained_path = '../../data/trained_model/'
    # trained_path = '/home/kien/data/trained_model/'
    snapshot_list = glob.glob(trained_path + net.name + '*.dat')
    e = -1
    if (len(snapshot_list) == 0):
        net.net_opts['l1_learning_rate'] = np.asarray(0.0001,
                                                      dtype=theano.config.floatX)
        net.reset_opts['min_lr'] = np.asarray(0.00001, dtype=theano.config.floatX)
        net.reset_opts['max_lr'] = net.net_opts['l1_learning_rate']

        net.layer_opts['num_fc_node'] = 32
        net.content['img_emb'] = FCLayer(
            net, net.content['input_img'],
            (1, trainer.opts['train_sentence_length'], x_dim, 1))
        net.content['img_emb_swap'] = SwapDim(net, net.content['img_emb'], 1, 2)

        # Construct the network
        net.layer_opts['num_emb'] = 32
        net.content['word_emb'] = WordEmbLayer(
            net, net.content['input_sen'],
            (trainer.opts['batch_size'],
             trainer.opts['train_sentence_length'] - 1, num_class, 1))
        net.content['cat'] = Concat(net, net.content['img_emb_swap'],
                                    net.content['word_emb'], 1)

        net.layer_opts['num_lstm_node'] = num_class
        net.content['lstm'] = LSTM(net, net.content['cat'],
                                   (trainer.opts['batch_size'],
                                    trainer.opts['train_sentence_length'] - 1,
                                    net.layer_opts['num_emb'], 1))
        net.content['lstm_r'] = LSTMRemove(net, net.content['lstm'], 0)

        #################### DEBUG #######################
        # X = np.reshape(train_X[0:2, :, :, :].eval(), (2, 10, x_dim, 1))
        # Y = np.reshape(train_Y[0:2, :, :, :].eval(), (2, 10, num_class, 1))
        # h_dummy = np.zeros((1, 1, net.layer_opts['num_lstm_node']), dtype=theano.config.floatX)
        # c_dummy = np.zeros((1, 1, net.layer_opts['num_lstm_node']), dtype=theano.config.floatX)
        h_dummy5 = np.zeros((1, 5, net.layer_opts['num_lstm_node']),
                            dtype=theano.config.floatX)
        c_dummy5 = np.zeros((1, 5, net.layer_opts['num_lstm_node']),
                            dtype=theano.config.floatX)
        # cat = net.content['cat'].output.eval({net.input[0]:X , net.input[1]: Y})
        # cat = np.reshape(cat, (2, 11, x_dim))
        # cat0 = np.reshape(cat[1,0,:], (1,1,x_dim))
        # cat1 = np.reshape(cat[1,1,:], (1,1,x_dim))
        # cat2 = np.reshape(cat[1,2,:], (1,1,x_dim))
        #
        # x0 = cat[0,0,:].reshape(1,1,x_dim)
        # x1 = cat[0,1,:].reshape(1,1,x_dim)
        # x2 = cat[0,2,:].reshape(1,1,x_dim)
        # x3 = cat[0,3,:].reshape(1,1,x_dim)
        # Wi = net.content['lstm'].W['i'].eval()
        # Wf = net.content['lstm'].W['f'].eval()
        # Wc = net.content['lstm'].W['c'].eval()
        # Wo = net.content['lstm'].W['o'].eval()
        #
        # Ui = net.content['lstm'].U['i'].eval()
        # Uf = net.content['lstm'].U['f'].eval()
        # Uc = net.content['lstm'].U['c'].eval()
        # Uo = net.content['lstm'].U['o'].eval()
        #
        # bi = net.content['lstm'].b['i'].eval()
        # bf = net.content['lstm'].b['f'].eval()
        # bc = net.content['lstm'].b['c'].eval()
        # bo = net.content['lstm'].b['o'].eval()
        #
        # hm1 = h_dummy
        # cm1 = c_dummy
        #
        # # First iteration
        # i0 = npsigmoid(np.dot(x0, Wi) + np.dot(hm1, Ui) + bi)
        # f0 = npsigmoid(np.dot(x0, Wf) + np.dot(hm1, Uf) + bf)
        # o0 = npsigmoid(np.dot(x0, Wo) + np.dot(hm1, Uo) + bo)
        # c0 = f0*cm1 + i0*np.tanh(np.dot(x0, Wc) + np.dot(hm1, Uc) + bc)
        # h0 = o0*c0
        #
        # # 2nd iteration
        # i1 = npsigmoid(np.dot(x1, Wi) + np.dot(h0, Ui) + bi)
        # f1 = npsigmoid(np.dot(x1, Wf) + np.dot(h0, Uf) + bf)
        # o1 = npsigmoid(np.dot(x1, Wo) + np.dot(h0, Uo) + bo)
        # c1 = f1 * c0 + i1 * np.tanh(np.dot(x1, Wc) + np.dot(h0, Uc) + bc)
        # h1 = o1 * c1
        #
        # # 3rd iteration
        # i2 = npsigmoid(np.dot(x2, Wi) + np.dot(h1, Ui) + bi)
        # f2 = npsigmoid(np.dot(x2, Wf) + np.dot(h1, Uf) + bf)
        # o2 = npsigmoid(np.dot(x2, Wo) + np.dot(h1, Uo) + bo)
        # c2 = f2 * c1 + i2 * np.tanh(np.dot(x2, Wc) + np.dot(h1, Uc) + bc)
        # h2 = o2 * c2
        #
        # # 4th iteration
        # i3 = npsigmoid(np.dot(x3, Wi) + np.dot(h2, Ui) + bi)
        # f3 = npsigmoid(np.dot(x3, Wf) + np.dot(h2, Uf) + bf)
        # o3 = npsigmoid(np.dot(x3, Wo) + np.dot(h2, Uo) + bo)
        # c3 = f3 * c2 + i3 * np.tanh(np.dot(x3, Wc) + np.dot(h2, Uc) + bc)
        # h3 = o3 * c3
        # bp = 1
        #
        # lstm = net.content['lstm'].output.eval({net.input[0]:X, net.input[1]:Y,
        #     net.content['lstm'].h_m1_sym: h_dummy2,
        #     net.content['lstm'].c_m1_sym: c_dummy2})
        #################### END DEBUG ####################

        net.layer_opts['softmax_norm_dim'] = 2
        net.content['softmax'] = SoftmaxLayer(net, net.content['lstm_r'])
        net.content['cost'] = CategoricalCrossEntropy(net, net.content['softmax'])
        # net.simpleprint()

        net.InitLR(0.01)
        # Create params list, grad list, momentum list for the theano function to update
        trainer.InitParams(net)
        # Update rule
        train_update_rule = trainer.InitUpdateRule(net)
        additional_output = ['input_img', 'word_emb', 'softmax']
        # Clip train_Y before
        net.InitTrainFunction(train_update_rule,
                              [train_X, train_Y[:, :-1, :, :]],
                              train_Y[:, 1:, :, :], additional_output)
        net.InitValFunction([valid_X, valid_Y[:, :-1, :, :]],
                            valid_Y[:, 1:, :, :], additional_output)
    else:
        snapshot_list = sorted(snapshot_list)
        print('Loading latest snapshot at %s' % snapshot_list[-1])
        net, trainer, e = LoadList(snapshot_list[-1])
        # trainer = Trainer()
        # Setting training params
        # trainer.opts['batch_size'] = 100
        # trainer.opts['save'] = True
        # trainer.opts['save_freq'] = 50
        # trainer.opts['num_sample'] = 1000
        # trainer.opts['num_epoch'] = 5000
        # trainer.opts['train_sentence_length'] = 10
        # trainer.opts['test_setence_length'] = 15
        # trainer.opts['num_val_sample'] = 1
        # trainer.opts['num_test_sample'] = 1

        net.net_opts['l1_learning_rate'] = np.asarray(0.0001,
                                                      dtype=theano.config.floatX)
        net.reset_opts['min_lr'] = np.asarray(0.00001, dtype=theano.config.floatX)
        net.reset_opts['max_lr'] = net.net_opts['l1_learning_rate']
        net.InitLR(100)
        trainer.InitParams(net)
        # Create params list, grad list, momentum list for the theano function to update
        train_update_rule = trainer.InitUpdateRule(net)
        additional_output = ['input_img', 'word_emb', 'softmax']
        ###########################
        # net = ShowTellNet()
        # net.name = 'lstm_test'
        #
        # net.net_opts['l1_learning_rate'] = np.asarray(0.0001, dtype=theano.config.floatX)
        # net.reset_opts['min_lr'] = np.asarray(0.00001, dtype=theano.config.floatX)
        # net.reset_opts['max_lr'] = net.net_opts['l1_learning_rate']
        #
        # net.layer_opts['num_fc_node'] = 16
        # net.content['img_emb'] = FCLayer(net, net.content['input_img'], (1, 10, x_dim, 1))
        # net.content['img_emb_swap'] = SwapDim(net, net.content['img_emb'], 1, 2)
        # # Construct the network
        #
        # net.layer_opts['num_emb'] = 16
        # net.content['word_emb'] = WordEmbLayer(net, net.content['input_sen'],
        #     (trainer.opts['batch_size'], trainer.opts['train_sentence_length'],
        #      num_class, 1))
        #
        # net.content['cat'] = Concat(net, net.content['img_emb_swap'], net.content['word_emb'], 1)
        #
        # net.layer_opts['num_lstm_node'] = num_class
        # net.content['lstm'] = LSTM(net, net.content['cat'],
        #     (trainer.opts['batch_size'], trainer.opts['train_sentence_length'],
        #      net.layer_opts['num_emb'], 1))
        #
        # net.content['lstm_r'] = LSTMRemove(net, net.content['lstm'], 0, 1)
        #
        # net.layer_opts['softmax_norm_dim'] = 2
        # net.content['softmax'] = SoftmaxLayer(net, net.content['lstm_r'])
        #
        # net.content['cost'] = CategoricalCrossEntropy(net, net.content['softmax'])
        # net.InitLR(100)
        # trainer.InitParams(net)
        # train_update_rule = trainer.InitUpdateRule(net)
        # additional_output = ['input_img', 'word_emb', 'softmax']
        ###########################

        # Create params list, grad list, momentum list for the theano function to update
        # net.train_function = theano.function(
        #     [net.index],
        #     outputs=[net.content['cost'].output] + [net.output[0][net.index, :, :, :]],
        #     updates=None,
        #     givens={
        #         net.input[0]: train_X[net.index, :, :, :],
        #         net.input[1]: train_Y[net.index, :, :, :],
        #         net.output[0]: train_X[net.index, :, :, :],
        #         net.content['lstm'].h_m1_sym: T.zeros((1, net.index.shape[0], net.content['lstm'].W_shape[1]),
        #                                               dtype=theano.config.floatX),
        #         net.content['lstm'].c_m1_sym: T.zeros((1, net.index.shape[0], net.content['lstm'].W_shape[1]),
        #                                               dtype=theano.config.floatX)
        #     }
        # )
        net.InitTrainFunction(train_update_rule, [train_X, train_Y], train_Y,
                              additional_output)
        net.InitValFunction([valid_X, valid_Y], valid_Y, additional_output)

    main_loop = SGDRMainLoop(net, trained_path)
    main_loop.run(net, trainer, e)
    a = 2  # dummy last statement (e.g. to park a debugger breakpoint after the run)