def train_loop_fn(data_loader, model, optimizer, scheduler=None):
    """Run one training epoch and return the mean per-batch loss.

    Moves each batch to config.DEVICE (when set), does a standard
    forward/backward/step, and steps the scheduler once per batch when
    one is supplied.
    """
    model.train()
    device = config.DEVICE
    epoch_losses = []
    for batch in data_loader:
        ids = batch['ids']
        mask = batch['mask']
        token_type_ids = batch['token_type_ids']
        targets = batch['targets']
        if device:
            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)
        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())
        # per-batch scheduler step (warmup-style schedules expect this)
        if scheduler:
            scheduler.step()
    return np.mean(epoch_losses)
def prepare_model(self):
    """Assemble the TF1 graph: placeholders, item embedding, a GRU
    encoder, and a TOP1 or cross-entropy prediction head with Adam."""
    # --- placeholders ---
    self.rnn_x = tf.placeholder(tf.int32, [None, None], name='input')
    self.rnn_y = tf.placeholder(tf.int64, [None, self.num_items], name='output')
    self.mask = tf.placeholder(tf.float32, [None, None], name='mask')
    self.keep_prob_input = tf.placeholder(tf.float32, name='keep_prob_input')
    self.keep_prob_ho = tf.placeholder(tf.float32, name='keep_prob_ho')
    self.batch_var_length = tf.placeholder(tf.int32, name="variable_length")

    # --- trainable parameters (graph names preserved) ---
    item_embedding = tf.get_variable(
        'Wemb', [self.num_items, self.embedding_size],
        initializer=self.embed_init)
    output_weight = tf.get_variable(
        'W_output', [1 * self.rnn_hidden_size, self.num_items],
        initializer=self.weight_init)
    output_bias = tf.get_variable(
        'b_output', [1, self.num_items], initializer=self.bias_init)

    # --- embed the session items and encode with a GRU ---
    embedded = tf.nn.embedding_lookup(item_embedding, self.rnn_x)
    embedded = tf.nn.dropout(embedded, self.keep_prob_input)
    cell = tf.contrib.rnn.GRUCell(num_units=self.rnn_hidden_size)
    outputs, states = tf.nn.dynamic_rnn(
        cell, embedded, sequence_length=self.batch_var_length,
        dtype=tf.float32)
    self.outputs = outputs
    self.last_hidden = states  # 512 x 100

    # --- prediction head + loss ---
    # num_items x 2*100
    if self.loss_type == "TOP1":
        head_in = tf.nn.dropout(self.last_hidden, self.keep_prob_ho)
        scores = tf.matmul(head_in, output_weight) + output_bias
        self.pred = tf.nn.tanh(scores)
        self.cost = loss_fn(self.rnn_y, self.pred, self.loss_type)
    elif self.loss_type == "CE":
        head_in = tf.nn.dropout(self.last_hidden, self.keep_prob_ho)
        scores = tf.matmul(head_in, output_weight) + output_bias
        self.pred = tf.nn.softmax(scores)
        self.cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=scores, labels=self.rnn_y))
    self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.cost)
def main(argv=None):
    """Sweep saved checkpoints (transformer/lstm/rnn x training depth)
    over a range of test depths on the subject-verb-agreement task and
    pickle the accuracy records to disk."""
    print("Running on {}".format(device))
    parser = argparse.ArgumentParser(
        description="Train a transformer for a copy task"
    )
    add_optimizer_arguments(parser)
    add_transformer_arguments(parser)
    add_auxiliary_arguments(parser)
    args = parser.parse_args(argv)
    print("args:\n-----\n", args)
    data_points = []
    data_points_acc = []  # NOTE(review): never appended to below — appears unused
    n_of_each_model = 10  # saved checkpoints per (model type, depth)
    n_trials = 8          # evaluation sequences per test depth
    for model_type in [
            'transformer', 'lstm', 'rnn', ]:  #add back transformers and rnn
        for max_trained_depth in range(1, 12):
            for ii in range(n_of_each_model):
                print(f'dep{max_trained_depth}_ii_{ii}')
                if model_type == "transformer":
                    d_model = 16
                    model = SequencePredictorRecurrentTransformer(
                        d_model=d_model,
                        n_classes=5,
                        sequence_length=args.sequence_length,
                        attention_type=args.attention_type,
                        n_layers=args.n_layers,
                        n_heads=args.n_heads,
                        d_query=d_model,  # used to be d_query
                        dropout=args.dropout,
                        softmax_temp=None,
                        attention_dropout=args.attention_dropout,
                    )
                else:
                    d_model = 8
                    model = SequencePredictorRNN(
                        d_model=d_model,
                        n_classes=5,
                        n_layers=args.n_layers,
                        dropout=args.dropout,
                        rnn_type=model_type
                    )
                print(f"Created model:\n{model}")
                model.to(device)
                # load the checkpoint for this (model type, depth, index)
                model.load_state_dict(torch.load(
                    f"models_from_colab/agreement_models/model_{model_type}_depth_{max_trained_depth}_num_{ii}.zip",
                    map_location=device)['model_state'])
                for test_depth in range(1, 21):  # was 1, 32
                    stack_size = test_depth  # Change this value to test longer / shorter sequences
                    n_correct = 0
                    for i_trial in range(n_trials):
                        x, y, m = SubjectVerbAgreement.get_seq(stack_size)
                        model.eval()
                        yhat = model(x.unsqueeze(1))
                        # loss_fn here returns (loss, accuracy) — see usage
                        loss, acc = loss_fn(y.unsqueeze(1), yhat,
                                            m.unsqueeze(1))
                        n_correct += acc
                    data_points.append({'model_type': model_type,
                                        'max_trained_depth': max_trained_depth,
                                        'test_depth': test_depth,
                                        'accuracy': n_correct / n_trials})
    print("data points")
    print(data_points)
    with open("data_points_pr_acc_r.txt", "wb") as fp:
        pickle.dump(data_points, fp)
# NOTE(review): the triple-quote below appears to open a commented-out
# section whose closing quote is not in view — confirm against the file.
"""
def eval_loop_fn(data_loader, model):
    """Run a full evaluation pass over `data_loader`.

    Returns:
        tuple: (stacked model outputs, stacked targets, mean batch loss)
        — the first two as numpy arrays, the loss as a float.
    """
    model.eval()
    fin_targets = []
    fin_outputs = []
    losses = []
    device = config.DEVICE
    # Fix: evaluation previously ran with autograd enabled, building an
    # unused computation graph for every batch.  no_grad() disables
    # gradient tracking; outputs are numerically identical.
    with torch.no_grad():
        for batch_idx, data in enumerate(data_loader):
            ids = data['ids']
            mask = data['mask']
            token_type_ids = data['token_type_ids']
            targets = data['targets']
            if device:
                ids = ids.to(device, dtype=torch.long)
                mask = mask.to(device, dtype=torch.long)
                token_type_ids = token_type_ids.to(device, dtype=torch.long)
                targets = targets.to(device, dtype=torch.float)
            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())
            fin_targets.append(targets.cpu().detach().numpy())
            fin_outputs.append(outputs.cpu().detach().numpy())
    return np.vstack(fin_outputs), np.vstack(fin_targets), np.mean(losses)
def train(model, iterator, optimizer):
    """Train for one epoch; returns (mean loss, mean accuracy, mean F1)."""
    model.train()
    total_loss = 0.0
    total_acc = 0.0
    total_f1 = 0.0
    for words, labels, lens in iterator:
        words, labels = words.to(device), labels.to(device)
        optimizer.zero_grad()
        # NOTE(review): `hidden` is not defined in this function —
        # presumably module-level state; confirm.
        pred = model(words.long(), hidden)
        loss = loss_fn(pred, labels)
        # binary accuracy and F1 for this batch
        acc, f1 = accuracy(pred, labels)
        # backpropagate and update the weights
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_acc += acc
        total_f1 += f1
    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches, total_f1 / n_batches
def forward(self, ids, attention_mask, type_ids=None, label=None):
    """Run BERT -> dropout -> linear head.

    Returns the loss when `label` is provided, otherwise the raw head
    output (no sigmoid — left to the loss / callers).
    """
    bert_out = self.bert(ids, attention_mask)
    # index 1 — assumes self.bert returns a (sequence, pooled)-style
    # pair; TODO confirm against the model class.
    pooled = self.dropout(bert_out[1])
    logits = self.l0(pooled)
    # output = torch.sigmoid(output)
    if label is None:
        return logits
    return loss_fn(logits, label)
def train(dataset, data_loader, model, optimizer):
    """Train for one epoch on the grapheme task.

    Returns:
        tuple: (mean loss over batches, macro recall over the epoch).
    """
    model.train()
    final_loss = 0
    counter = 0
    final_outputs = []
    final_targets = []
    for bi, d in tqdm(enumerate(data_loader),
                      total=int(len(dataset) / data_loader.batch_size)):
        counter = counter + 1
        image = d["image"]
        grapheme_root = d["grapheme_root"]
        vowel_diacritic = d["vowel_diacritic"]
        consonant_diacritic = d["consonant_diacritic"]
        DEVICE = "cuda"  # NOTE(review): hard-coded device — consider config
        image = image.to(DEVICE, dtype=torch.float)
        grapheme_root = grapheme_root.to(DEVICE, dtype=torch.long)
        vowel_diacritic = vowel_diacritic.to(DEVICE, dtype=torch.long)
        consonant_diacritic = consonant_diacritic.to(DEVICE, dtype=torch.long)
        optimizer.zero_grad()
        outputs = model(image)
        targets = (grapheme_root, vowel_diacritic, consonant_diacritic)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        # Fix: accumulating the live loss tensor (and storing outputs with
        # grad history) kept every batch's autograd graph alive for the
        # whole epoch — a growing memory leak.  .item()/.detach() free it.
        final_loss += loss.item()
        o1, o2, o3 = outputs
        t1, t2, t3 = targets
        final_outputs.append(torch.cat((o1, o2, o3), dim=1).detach())
        final_targets.append(torch.stack((t1, t2, t3), dim=1))
        #if bi % 10 == 0:
        #    break
    final_outputs = torch.cat(final_outputs)
    final_targets = torch.cat(final_targets)
    print("=================Train=================")
    macro_recall_score = macro_recall(final_outputs, final_targets)
    return final_loss / counter, macro_recall_score
def test(model, iterator):
    """Evaluate `model`; returns (mean loss, mean accuracy, mean F1).

    Mirrors `train` but without gradient updates.
    """
    # Fix: the sibling `train` calls model.train() but this function never
    # switched back, so dropout/batch-norm stayed in training mode during
    # evaluation, skewing the reported metrics.
    model.eval()
    running_loss = 0.0
    running_acc = 0.0
    running_f1 = 0.0
    with torch.no_grad():
        for words, labels, lens in iterator:
            words, labels = words.to(device), labels.to(device)
            # NOTE(review): `hidden` is not defined locally — presumably
            # module-level state; confirm.
            pred = model(words.long(), hidden)
            loss = loss_fn(pred, labels)
            # compute the binary accuracy
            (acc, f1) = accuracy(pred, labels)
            running_loss += loss.item()
            running_acc += acc
            running_f1 += f1
    return running_loss / len(iterator), running_acc / len(
        iterator), running_f1 / len(iterator)
def evaluate(dataset, data_loader, model):
    """Validation pass; returns (mean loss, macro recall)."""
    with torch.no_grad():
        model.eval()
        loss_sum = 0
        n_batches = 0
        all_outputs = []
        all_targets = []
        expected_batches = int(len(dataset) / data_loader.batch_size)
        for bi, batch in tqdm(enumerate(data_loader), total=expected_batches):
            n_batches = n_batches + 1
            DEVICE = "cuda"
            image = batch["image"].to(DEVICE, dtype=torch.float)
            grapheme_root = batch["grapheme_root"].to(DEVICE, dtype=torch.long)
            vowel_diacritic = batch["vowel_diacritic"].to(DEVICE,
                                                          dtype=torch.long)
            consonant_diacritic = batch["consonant_diacritic"].to(
                DEVICE, dtype=torch.long)
            outputs = model(image)
            targets = (grapheme_root, vowel_diacritic, consonant_diacritic)
            loss_sum += loss_fn(outputs, targets)
            o1, o2, o3 = outputs
            t1, t2, t3 = targets
            #print(t1.shape)
            all_outputs.append(torch.cat((o1, o2, o3), dim=1))
            all_targets.append(torch.stack((t1, t2, t3), dim=1))
        all_outputs = torch.cat(all_outputs)
        all_targets = torch.cat(all_targets)
        print("=================VALID============")
        return loss_sum / n_batches, macro_recall(all_outputs, all_targets)
def build(self):
    """Build the TF1 CNN graph for pitch/rhythm prediction.

    Creates placeholders, 1-D conv front-ends, fully-connected heads,
    losses and training ops, then calls self.execute().  Two modes via
    self.model_type: "combine" (shared trunk) or "separate".
    """
    # p = pitch, r = rhythm
    # feature sizes inferred from the training data (batch x time x feat)
    self.n_p_inputs = len(self.X_tr_p[0][0])
    self.n_p_outputs = len(self.y_tr_p[0][0])
    self.n_r_inputs = len(self.X_tr_r[0][0])
    self.n_r_outputs = len(self.y_tr_r[0][0])
    self.X_p = tf.placeholder(tf.float32, [None, None, self.n_p_inputs],
                              name="X_p")
    self.y_p = tf.placeholder(tf.float32, [None, None, self.n_p_outputs],
                              name="y_p")
    self.X_r = tf.placeholder(tf.float32, [None, None, self.n_r_inputs],
                              name="X_r")
    self.y_r = tf.placeholder(tf.float32, [None, None, self.n_r_outputs],
                              name="Y_r")
    # CNN pitch
    network_p = tf.layers.conv1d(inputs=self.X_p, filters=12, kernel_size=8,
                                 padding='valid', activation=tf.nn.relu,
                                 name='conv_p_1')
    # CNN rhythm
    network_r = tf.layers.conv1d(inputs=self.X_r, filters=12, kernel_size=8,
                                 padding='valid', activation=tf.nn.relu,
                                 name='conv_r_1')
    # batch normalization parameters
    self.is_training = tf.placeholder(tf.bool, shape=(), name="is_training")
    bn_params = {
        "is_training": self.is_training,
        "decay": 0.99,
        "updates_collections": None,
        "scale": True
    }
    # output layers use batch-norm without the extra scale parameter
    bn_params_out = {
        "is_training": self.is_training,
        "decay": 0.99,
        "updates_collections": None
    }
    if self.model_type == "combine":
        # one trunk over concatenated pitch+rhythm conv features
        combined = tf.concat([network_p, network_r], axis=2)
        #full connected combined
        n_hidden_comb1 = 128
        n_hidden_comb2 = 128
        keep_prob = 0.7
        # 24 = 12 conv filters per stream x 2 streams
        stacked_combined = tf.reshape(combined, [-1, 24],
                                      name='stacked_outs_p')
        fc1 = fully_connected(stacked_combined, num_outputs=n_hidden_comb1,
                              activation_fn=tf.nn.elu,
                              normalizer_fn=batch_norm,
                              normalizer_params=bn_params, scope='comb_1')
        drop1 = tf.contrib.layers.dropout(fc1, keep_prob,
                                          is_training=self.is_training)
        fc2 = fully_connected(drop1, num_outputs=n_hidden_comb2,
                              activation_fn=tf.nn.elu,
                              normalizer_fn=batch_norm,
                              normalizer_params=bn_params, scope='comb_2')
        # NOTE(review): drop2 is applied to fc1, not fc2 — fc2 is computed
        # but never used.  Looks like a copy/paste bug; confirm intent.
        drop2 = tf.contrib.layers.dropout(fc1, keep_prob,
                                          is_training=self.is_training)
        outs_p = drop2
        outs_r = drop2
    elif self.model_type == "separate":
        # pitch
        n_neurons_p1 = 256
        n_neurons_p2 = 256
        keep_prob_p = 0.7
        cnn_outs_p = tf.reshape(network_p, [-1, 12], name='cnn_outs_p')
        fc1_p = fully_connected(cnn_outs_p, num_outputs=n_neurons_p1,
                                activation_fn=tf.nn.elu,
                                normalizer_fn=batch_norm,
                                normalizer_params=bn_params, scope='fc1_p1')
        drop_p = tf.contrib.layers.dropout(fc1_p, keep_prob_p,
                                           is_training=self.is_training)
        fc2_p = fully_connected(drop_p, num_outputs=n_neurons_p2,
                                activation_fn=tf.nn.elu,
                                normalizer_fn=batch_norm,
                                normalizer_params=bn_params, scope='fc2_p1')
        outs_p = tf.contrib.layers.dropout(fc2_p, keep_prob_p,
                                           is_training=self.is_training)
        # rhythm
        n_neurons_r1 = 256
        n_neurons_r2 = 256
        keep_prob_r = 0.7
        cnn_outs_r = tf.reshape(network_r, [-1, 12], name='cnn_outs_r')
        fc1_r = fully_connected(cnn_outs_r, num_outputs=n_neurons_r1,
                                activation_fn=tf.nn.elu,
                                normalizer_fn=batch_norm,
                                normalizer_params=bn_params, scope='fc1_r1')
        drop_r = tf.contrib.layers.dropout(fc1_r, keep_prob_r,
                                           is_training=self.is_training)
        fc2_r = fully_connected(drop_r, num_outputs=n_neurons_r2,
                                activation_fn=tf.nn.elu,
                                normalizer_fn=batch_norm,
                                normalizer_params=bn_params, scope='fc2_r1')
        outs_r = tf.contrib.layers.dropout(fc2_r, keep_prob_r,
                                           is_training=self.is_training)
    # fully connected pitch
    n_hidden_1_p = 48
    n_hidden_2_p = 32
    keep_prob_p = 0.6
    stacked_logits_p1 = fully_connected(outs_p, num_outputs=n_hidden_1_p,
                                        activation_fn=tf.nn.elu,
                                        normalizer_fn=batch_norm,
                                        normalizer_params=bn_params,
                                        scope='dense_p1')
    p_drop1 = tf.contrib.layers.dropout(stacked_logits_p1, keep_prob_p,
                                        is_training=self.is_training)
    stacked_logits_p2 = fully_connected(p_drop1, num_outputs=n_hidden_2_p,
                                        activation_fn=tf.nn.elu,
                                        normalizer_fn=batch_norm,
                                        normalizer_params=bn_params,
                                        scope='dense_p2')
    p_drop2 = tf.contrib.layers.dropout(stacked_logits_p2, keep_prob_p,
                                        is_training=self.is_training)
    # linear output layer (no activation)
    stacked_logits_p3 = fully_connected(p_drop2,
                                        num_outputs=self.n_p_outputs,
                                        activation_fn=None,
                                        normalizer_fn=batch_norm,
                                        normalizer_params=bn_params_out,
                                        scope='dense_p_out')
    # restore the (batch, time, outputs) shape
    self.logits_p = tf.reshape(
        stacked_logits_p3, [-1, tf.shape(self.y_p)[1], self.n_p_outputs],
        name='logits_p')
    # fully connected rhythm
    n_hidden_1_r = 48
    n_hidden_2_r = 32
    keep_prob_r = 0.6
    # separate rhythm
    stacked_logits_r1 = fully_connected(outs_r, n_hidden_1_r,
                                        activation_fn=tf.nn.elu,
                                        normalizer_fn=batch_norm,
                                        normalizer_params=bn_params,
                                        scope='dense_r1')
    r_drop1 = tf.contrib.layers.dropout(stacked_logits_r1, keep_prob_r,
                                        is_training=self.is_training)
    stacked_logits_r2 = fully_connected(r_drop1, n_hidden_2_r,
                                        activation_fn=tf.nn.elu,
                                        normalizer_fn=batch_norm,
                                        normalizer_params=bn_params,
                                        scope='dense_r2')
    r_drop2 = tf.contrib.layers.dropout(stacked_logits_r2, keep_prob_r,
                                        is_training=self.is_training)
    stacked_logits_r3 = fully_connected(r_drop2, self.n_r_outputs,
                                        activation_fn=None,
                                        normalizer_fn=batch_norm,
                                        normalizer_params=bn_params_out,
                                        scope='dense_r_out')
    self.logits_r = tf.reshape(
        stacked_logits_r3, [-1, tf.shape(self.y_r)[1], self.n_r_outputs],
        name='logits_r')
    # loss params
    learn_rate = 0.02
    clip = 5  # gradient-norm clip value
    # loss
    self.loss_r = loss_fn(self.logits_r, self.y_r)
    self.loss_p = loss_fn(self.logits_p, self.y_p)
    self.total_loss = tf.add(self.loss_r, self.loss_p)
    # training op
    if self.model_type == "combine":
        optimizer = tf.train.AdamOptimizer(learning_rate=learn_rate)
        gradients = optimizer.compute_gradients(self.total_loss)
        # clip each gradient; pass None gradients through unchanged
        # (note: `grad != None` — PEP 8 prefers `is not None`)
        capped_gradients = [(tf.clip_by_norm(grad, clip), var)
                            if grad != None else (grad, var)
                            for grad, var in gradients]
        self.train_op = optimizer.apply_gradients(capped_gradients)
    elif self.model_type == "separate":
        optimizer = tf.train.AdamOptimizer(learning_rate=learn_rate)
        # rhythm
        gradients_r = optimizer.compute_gradients(self.loss_r)
        capped_gradients_r = [(tf.clip_by_norm(grad, clip), var)
                              if grad != None else (grad, var)
                              for grad, var in gradients_r]
        self.train_op_r = optimizer.apply_gradients(capped_gradients_r)
        # pitch
        gradients_p = optimizer.compute_gradients(self.loss_p)
        capped_gradients_p = [(tf.clip_by_norm(grad, clip), var)
                              if grad != None else (grad, var)
                              for grad, var in gradients_p]
        self.train_op_p = optimizer.apply_gradients(capped_gradients_p)
    # evaluation
    self.accuracy_r = accuracy_fn(self.logits_r, self.y_r)
    self.accuracy_p = accuracy_fn(self.logits_p, self.y_p)
    self.execute()
def build(self):
    """Build the TF1 LSTM graph for pitch/rhythm prediction.

    Same head structure as the CNN variant, but with a 2-layer peephole
    LSTM front-end; ends by calling self.execute().  Two modes via
    self.model_type: "combine" (one RNN over concatenated inputs) or
    "separate" (one RNN per stream).
    """
    # p = pitch, r = rhythm
    # feature sizes inferred from the training data (batch x time x feat)
    self.n_p_inputs = len(self.X_tr_p[0][0])
    self.n_p_outputs = len(self.y_tr_p[0][0])
    self.n_r_inputs = len(self.X_tr_r[0][0])
    self.n_r_outputs = len(self.y_tr_r[0][0])
    self.X_p = tf.placeholder(tf.float32, [None, None, self.n_p_inputs],
                              name="X_p")
    self.y_p = tf.placeholder(tf.float32, [None, None, self.n_p_outputs],
                              name="y_p")
    self.X_r = tf.placeholder(tf.float32, [None, None, self.n_r_inputs],
                              name="X_r")
    self.y_r = tf.placeholder(tf.float32, [None, None, self.n_r_outputs],
                              name="Y_r")
    if self.model_type == "combine":
        # concat pitch and rhythm
        combined = tf.concat([self.X_p, self.X_r], axis=2)
        # RNN
        nu_rnn = 64
        cells = []
        cells.append(
            tf.contrib.rnn.LSTMCell(nu_rnn, use_peepholes=True,
                                    activation=tf.tanh))
        cells.append(
            tf.contrib.rnn.LSTMCell(nu_rnn, use_peepholes=True,
                                    activation=tf.tanh))
        multi = tf.contrib.rnn.MultiRNNCell(cells)
        outs, _ = tf.nn.dynamic_rnn(multi, combined, dtype=tf.float32,
                                    swap_memory=True, scope="rhythm")
        # both heads read the shared RNN output
        outs_r = outs
        outs_p = outs
    elif self.model_type == "separate":
        # RNN
        nu_rnn = 64
        # RNN pitch
        cells_p = []
        cells_p.append(
            tf.contrib.rnn.LSTMCell(nu_rnn, use_peepholes=True,
                                    activation=tf.tanh))
        cells_p.append(
            tf.contrib.rnn.LSTMCell(nu_rnn, use_peepholes=True,
                                    activation=tf.tanh))
        multi_p = tf.contrib.rnn.MultiRNNCell(cells_p)
        outs_p, _ = tf.nn.dynamic_rnn(multi_p, self.X_p, dtype=tf.float32,
                                      swap_memory=True, scope="pitch")
        # RNN rhythm
        cells_r = []
        cells_r.append(
            tf.contrib.rnn.LSTMCell(nu_rnn, use_peepholes=True,
                                    activation=tf.tanh))
        cells_r.append(
            tf.contrib.rnn.LSTMCell(nu_rnn, use_peepholes=True,
                                    activation=tf.tanh))
        multi_r = tf.contrib.rnn.MultiRNNCell(cells_r)
        outs_r, _ = tf.nn.dynamic_rnn(multi_r, self.X_r, dtype=tf.float32,
                                      swap_memory=True, scope="rhythm")
    # batch normalization parameters
    self.is_training = tf.placeholder(tf.bool, shape=(), name="is_training")
    bn_params = {
        "is_training": self.is_training,
        "decay": 0.999,
        "updates_collections": None,
        "scale": True
    }
    # output layers use batch-norm without the extra scale parameter
    bn_params_out = {
        "is_training": self.is_training,
        "decay": 0.999,
        "updates_collections": None
    }
    # fully connected pitch
    n_hidden_1_p = 48
    n_hidden_2_p = 32
    keep_prob_p = 0.6
    # flatten (batch, time, nu_rnn) to (batch*time, nu_rnn) for the FC head
    stacked_outs_p = tf.reshape(outs_p, [-1, nu_rnn], name='stacked_outs_p')
    stacked_logits_p1 = fully_connected(stacked_outs_p,
                                        num_outputs=n_hidden_1_p,
                                        activation_fn=tf.nn.elu,
                                        normalizer_fn=batch_norm,
                                        normalizer_params=bn_params,
                                        scope='dense_p1')
    p_drop1 = tf.contrib.layers.dropout(stacked_logits_p1, keep_prob_p,
                                        is_training=self.is_training)
    stacked_logits_p2 = fully_connected(p_drop1, num_outputs=n_hidden_2_p,
                                        activation_fn=tf.nn.elu,
                                        normalizer_fn=batch_norm,
                                        normalizer_params=bn_params,
                                        scope='dense_p2')
    p_drop2 = tf.contrib.layers.dropout(stacked_logits_p2, keep_prob_p,
                                        is_training=self.is_training)
    stacked_logits_p3 = fully_connected(p_drop2,
                                        num_outputs=self.n_p_outputs,
                                        activation_fn=None,
                                        normalizer_fn=batch_norm,
                                        normalizer_params=bn_params_out,
                                        scope='dense_p_out')
    # restore the (batch, time, outputs) shape
    self.logits_p = tf.reshape(
        stacked_logits_p3, [-1, tf.shape(self.y_p)[1], self.n_p_outputs],
        name='logits_p')
    # fully connected rhythm
    n_hidden_1_r = 48
    n_hidden_2_r = 32
    keep_prob_r = 0.6
    # separate rhythm
    stacked_outs_r = tf.reshape(outs_r, [-1, nu_rnn], name='stacked_outs_r')
    stacked_logits_r1 = fully_connected(stacked_outs_r, n_hidden_1_r,
                                        activation_fn=tf.nn.elu,
                                        normalizer_fn=batch_norm,
                                        normalizer_params=bn_params,
                                        scope='dense_r1')
    r_drop1 = tf.contrib.layers.dropout(stacked_logits_r1, keep_prob_r,
                                        is_training=self.is_training)
    stacked_logits_r2 = fully_connected(r_drop1, n_hidden_2_r,
                                        activation_fn=tf.nn.elu,
                                        normalizer_fn=batch_norm,
                                        normalizer_params=bn_params,
                                        scope='dense_r2')
    r_drop2 = tf.contrib.layers.dropout(stacked_logits_r2, keep_prob_r,
                                        is_training=self.is_training)
    stacked_logits_r3 = fully_connected(r_drop2, self.n_r_outputs,
                                        activation_fn=None,
                                        normalizer_fn=batch_norm,
                                        normalizer_params=bn_params_out,
                                        scope='dense_r_out')
    self.logits_r = tf.reshape(
        stacked_logits_r3, [-1, tf.shape(self.y_r)[1], self.n_r_outputs],
        name='logits_r')
    # loss params
    learn_rate = 0.02
    clip = 5  # gradient-norm clip value
    # loss
    self.loss_r = loss_fn(self.logits_r, self.y_r)
    self.loss_p = loss_fn(self.logits_p, self.y_p)
    self.total_loss = tf.add(self.loss_r, self.loss_p)
    # training op
    if self.model_type == "combine":
        optimizer = tf.train.AdamOptimizer(learning_rate=learn_rate)
        gradients = optimizer.compute_gradients(self.total_loss)
        # clip each gradient; pass None gradients through unchanged
        # (note: `grad != None` — PEP 8 prefers `is not None`)
        capped_gradients = [(tf.clip_by_norm(grad, clip), var)
                            if grad != None else (grad, var)
                            for grad, var in gradients]
        self.train_op = optimizer.apply_gradients(capped_gradients)
    elif self.model_type == "separate":
        optimizer = tf.train.AdamOptimizer(learning_rate=learn_rate)
        # rhythm
        gradients_r = optimizer.compute_gradients(self.loss_r)
        capped_gradients_r = [(tf.clip_by_norm(grad, clip), var)
                              if grad != None else (grad, var)
                              for grad, var in gradients_r]
        self.train_op_r = optimizer.apply_gradients(capped_gradients_r)
        # pitch
        gradients_p = optimizer.compute_gradients(self.loss_p)
        capped_gradients_p = [(tf.clip_by_norm(grad, clip), var)
                              if grad != None else (grad, var)
                              for grad, var in gradients_p]
        self.train_op_p = optimizer.apply_gradients(capped_gradients_p)
    # evaluation
    self.accuracy_r = accuracy_fn(self.logits_r, self.y_r)
    self.accuracy_p = accuracy_fn(self.logits_p, self.y_p)
    self.execute()
def main(argv=None):
    """Load a trained model from `--continue_from`, probe its accuracy over
    increasing agreement depths, and visualize predictions and hidden
    states for one fixed depth."""
    print("Running on {}".format(device))
    parser = argparse.ArgumentParser(
        description="Train a transformer for a copy task"
    )
    add_optimizer_arguments(parser)
    add_transformer_arguments(parser)
    add_auxiliary_arguments(parser)
    args = parser.parse_args(argv)
    print("args:\n-----\n", args)
    if args.model_type == "transformer":
        model = SequencePredictorRecurrentTransformer(
            d_model=args.d_model,
            n_classes=5,
            sequence_length=args.sequence_length,
            attention_type=args.attention_type,
            n_layers=args.n_layers,
            n_heads=args.n_heads,
            d_query=args.d_model,  # used to be d_query
            dropout=args.dropout,
            softmax_temp=None,
            attention_dropout=args.attention_dropout,
        )
    else:
        model = SequencePredictorRNN(
            d_model=args.d_model,
            n_classes=5,
            n_layers=args.n_layers,
            dropout=args.dropout,
            rnn_type=args.model_type
        )
    print(f"Created model:\n{model}")
    model.to(device)
    print("Number of epochs model was trained on: ",
          torch.load(args.continue_from, map_location=device)['epoch'])
    model.load_state_dict(
        torch.load(args.continue_from, map_location=device)['model_state'])

    def format_preds(x, y, preds, mask):
        """Render input, target and prediction digit rows, with '?' where
        the mask is 0, plus an index ruler on top."""
        n = len(x)
        n_dig = math.floor(math.log10(n)) + 1
        nums = []
        # build the index ruler one digit-position row at a time
        for p_dig in range(n_dig):
            nums.append(
                "# |" +
                "".join([str((i//10**p_dig)%10) for i in range(n)]) + "\n")
        nums = "".join(nums[::-1])
        xs = "x |" + "".join([str(int(v)) for v in x]) + "\n"
        ys = "y |" + "".join(
            [elt if mask[i] == 1 else '?'
             for i, elt in enumerate([str(int(v)) for v in y])]) + "\n"
        yh = "yh|" + "".join(
            [elt if mask[i] == 1 else '?'
             for i, elt in enumerate([str(int(v)) for v in preds])]) + "\n"
        return nums + xs + ys + yh

    # sweep test depths, recording accuracy per depth
    acc_list = []
    max_acc = None
    for stack_size in range(1, 64):
        x, y, m = SubjectVerbAgreement.get_seq(stack_size)
        # print(x.shape, y.shape, m.shape)
        model.eval()
        yhat = model(x.unsqueeze(1))
        hdn = model.hidden_state  # batch x seq x hdn
        # loss_fn here returns (loss, accuracy) — see usage
        loss, acc = loss_fn(y.unsqueeze(1), yhat, m.unsqueeze(1))
        acc_list.append((stack_size, acc))
        if acc == 1:
            max_acc = stack_size
    print("Highest perfect score at depth:", max_acc)
    plot_hidden_state_2d(np.array(acc_list), pca=False)
    # detailed look at one fixed depth
    stack_size = 7  # Change this value to test longer / shorter sequences
    x, y, m = SubjectVerbAgreement.get_seq(stack_size)
    model.eval()
    yhat = model(x.unsqueeze(1))
    hdn = model.hidden_state  # batch x seq x hdn
    loss, acc = loss_fn(y.unsqueeze(1), yhat, m.unsqueeze(1))
    print("Model loss: ", loss)
    print("Model accuracy: ", acc)
    print(format_preds(x, y, torch.argmax(yhat, dim=2)[0], m))
    plot_hidden_state_2d(hdn[0].detach().cpu().numpy(), pca=True)
# NOTE(review): the triple-quote below appears to open a commented-out
# section whose closing quote is not in view — confirm against the file.
"""
# Iterations step = 0 best_mefssim_val = 0. torch.backends.cudnn.benchmark = True for epoch in range(num_epoch): ''' train ''' for i, img in enumerate(loader['train']): img = img.cuda() img = torch.rot90(img, int(torch.randint(4, [1])), [-1, -2]) #1. update net.train() net.zero_grad() optimizer.zero_grad() imgf = net(img) _ssim = loss_fn(imgf, img) _l1penalty = halo_fn(imgf) loss = _ssim + loss_weight * _l1penalty loss.backward() optimizer.step() loss_weight = min(loss_weight + 0.25, 10) # update loss weight #2. print information print("[%d,%d] MEFSSIM: %.4f, L1: %.4f, Loss: %.4f" % (epoch + 1, i + 1, _ssim.item(), _l1penalty.item(), loss.item())) #3. log the scalar values writer.add_scalar('loss', loss.item(), step) step += 1 ''' validation '''
def prepare_model(self):
    """Build the TF1 attention-GRU graph: placeholders, item embedding,
    GRU encoder, additive attention over the outputs, and one of four
    prediction heads (EMB / Trilinear / TOP1 / TOP1_variant) with Adam.
    """
    # --- placeholders ---
    self.rnn_x = tf.placeholder(tf.int32, [None, None], name='input')
    self.rnn_y = tf.placeholder(tf.int64, [None, self.num_items],
                                name='output')
    self.mask = tf.placeholder(tf.float32, [None, None], name='mask')
    self.keep_prob_input = tf.placeholder(tf.float32, name='keep_prob_input')
    self.keep_prob_ho = tf.placeholder(tf.float32, name='keep_prob_ho')
    self.batch_var_length = tf.placeholder(tf.int32, name="variable_length")
    # --- shared parameters ---
    Wemb = tf.get_variable('Wemb', [self.num_items, self.embedding_size],
                           initializer=self.embed_init)
    W_encoder = tf.get_variable(
        'W_encoder', [self.rnn_hidden_size, self.rnn_hidden_size],
        initializer=self.weight_init)
    W_decoder = tf.get_variable(
        'W_decoder', [self.rnn_hidden_size, self.rnn_hidden_size],
        initializer=self.weight_init)
    Bi_vector = tf.get_variable('Bi_vector', [1, self.rnn_hidden_size],
                                initializer=self.weight_init)
    # --- head-specific parameters (only the active branch's exist) ---
    if self.loss_type == 'EMB':
        bili = tf.get_variable(
            'bili', [self.embedding_size, 2 * self.rnn_hidden_size],
            initializer=self.weight_init)
    elif self.loss_type == "Trilinear":
        ws = tf.get_variable('ws',
                             [self.embedding_size, self.embedding_size],
                             initializer=self.weight_init)
        bs = tf.get_variable('bs', [self.embedding_size],
                             initializer=self.bias_init)
        wt = tf.get_variable('wt',
                             [self.embedding_size, self.embedding_size],
                             initializer=self.weight_init)
        bt = tf.get_variable('bt', [self.embedding_size],
                             initializer=self.bias_init)
    elif self.loss_type == "TOP1":
        W_top1 = tf.get_variable(
            'W_top1', [2 * self.rnn_hidden_size, self.num_items],
            initializer=self.weight_init)
        b_top1 = tf.get_variable('b_top1', [1, self.num_items],
                                 initializer=self.bias_init)
    elif self.loss_type == "TOP1_variant":
        bili = tf.get_variable(
            'bili', [self.embedding_size, 2 * self.rnn_hidden_size],
            initializer=self.weight_init)
        W_top1 = tf.get_variable(
            'W_top1', [2 * self.rnn_hidden_size, self.num_items],
            initializer=self.weight_init)
        b_top1 = tf.get_variable('b_top1', [1, self.num_items],
                                 initializer=self.bias_init)
    # --- embed the session and encode with a GRU ---
    emb = tf.nn.embedding_lookup(Wemb, self.rnn_x)
    emb = tf.nn.dropout(emb, self.keep_prob_input)
    custom_cell = tf.contrib.rnn.GRUCell(num_units=self.rnn_hidden_size)
    outputs, states = tf.nn.dynamic_rnn(
        custom_cell, emb, sequence_length=self.batch_var_length,
        dtype=tf.float32)
    self.outputs = outputs
    self.last_hidden = states  # 512 x 100
    # --- attention over the per-step outputs ---
    outputs = tf.transpose(outputs, perm=[1, 0, 2])  # 19x512x100
    squares = tf.map_fn(lambda x: compute_alpha(
        x, self.last_hidden, W_encoder, W_decoder, Bi_vector),
        outputs)  # 19x512
    # large negative offset masks out padded positions before softmax
    weight = tf.nn.softmax(tf.transpose(squares) +
                           100000000. * (self.mask - 1),
                           axis=1)  # batch_size * max_len
    attention_proj = tf.reduce_sum(
        outputs * tf.transpose(weight)[:, :, None], axis=0)
    # num_items x 2*100
    # --- prediction head and loss ---
    if self.loss_type == 'EMB':
        proj = tf.concat([attention_proj, states], 1)
        proj = tf.nn.dropout(proj, self.keep_prob_ho)
        ytem = tf.matmul(Wemb, bili)
        pred = tf.matmul(proj, tf.transpose(ytem))
        self.pred = tf.nn.softmax(pred)
        self.cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=pred, labels=self.rnn_y))
    elif self.loss_type == "Trilinear":
        hs = tf.nn.tanh(tf.matmul(attention_proj, ws) + bs)  # batch * hidden
        ht = tf.nn.tanh(tf.matmul(states, wt) + bt)  # batch * hidden
        pred = tf.nn.sigmoid(
            tf.matmul(tf.multiply(ht, hs),
                      tf.transpose(Wemb)))  # batch * n_item
        self.pred = tf.nn.softmax(pred)
        self.cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=pred, labels=self.rnn_y))
    elif self.loss_type == "TOP1":
        proj = tf.concat([attention_proj, states], 1)
        proj = tf.nn.dropout(proj, self.keep_prob_ho)
        pred = tf.matmul(proj, W_top1) + b_top1
        self.pred = tf.nn.tanh(pred)
        self.cost = loss_fn(self.rnn_y, self.pred, self.loss_type)
    elif self.loss_type == "TOP1_variant":
        proj = tf.concat([attention_proj, states], 1)
        proj = tf.nn.dropout(proj, self.keep_prob_ho)
        ytem = tf.matmul(Wemb, bili)
        pred = tf.matmul(proj, tf.transpose(ytem))
        self.pred = tf.nn.tanh(pred)
        self.cost = loss_fn(self.rnn_y, self.pred, self.loss_type)
    self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.cost)
def prepare_model(self):
    """Build the TF1 STAMP-style graph: item embeddings for the session
    prefix (x1) and last item (x2), masked-mean session vector, item-wise
    attention, and one of four heads (EMB / Trilinear / TOP1 /
    TOP1_variant) with Adam."""
    # --- placeholders ---
    self.rnn_x1 = tf.placeholder(tf.int32, [None, self.maxlen],
                                 name='input1')
    self.rnn_x2 = tf.placeholder(tf.int32, [None, 1], name='input2')
    self.rnn_y = tf.placeholder(tf.int64, [None, self.num_items],
                                name='output')
    self.mask_x1 = tf.placeholder(tf.float32, [None, self.maxlen],
                                  name='mask_x1')  # batch_size * maxlen
    self.mask_x2 = tf.placeholder(tf.float32, [None, 1], name='mask_x2')
    self.keep_prob_input = tf.placeholder(tf.float32,
                                          name='keep_prob_input')
    self.keep_prob_ho = tf.placeholder(tf.float32, name='keep_prob_ho')
    self.batch_var_length = tf.placeholder(tf.float32,
                                           name="variable_length")
    # --- attention parameters ---
    Wemb = tf.get_variable('Wemb', [self.num_items, self.embedding_size],
                           initializer=self.embed_init)
    w0 = tf.get_variable('w0', [self.embedding_size, 1],
                         initializer=self.weight_init)
    w1 = tf.get_variable('w1', [self.embedding_size, self.embedding_size],
                         initializer=self.weight_init)
    w2 = tf.get_variable('w2', [self.embedding_size, self.embedding_size],
                         initializer=self.weight_init)
    w3 = tf.get_variable('w3', [self.embedding_size, self.embedding_size],
                         initializer=self.weight_init)
    ba = tf.get_variable('ba', [self.embedding_size],
                         initializer=self.bias_init)
    # --- head-specific parameters (only the active branch's exist) ---
    if self.loss_type == 'EMB':
        bili = tf.get_variable(
            'bili', [self.embedding_size, 2 * self.rnn_hidden_size],
            initializer=self.weight_init)
    elif self.loss_type == "Trilinear":
        ws = tf.get_variable('ws',
                             [self.embedding_size, self.embedding_size],
                             initializer=self.weight_init)
        bs = tf.get_variable('bs', [self.embedding_size],
                             initializer=self.bias_init)
        wt = tf.get_variable('wt',
                             [self.embedding_size, self.embedding_size],
                             initializer=self.weight_init)
        bt = tf.get_variable('bt', [self.embedding_size],
                             initializer=self.bias_init)
    elif self.loss_type == "TOP1":
        W_top1 = tf.get_variable(
            'W_top1', [2 * self.rnn_hidden_size, self.num_items],
            initializer=self.weight_init)
        b_top1 = tf.get_variable('b_top1', [1, self.num_items],
                                 initializer=self.bias_init)
    elif self.loss_type == "TOP1_variant":
        bili = tf.get_variable(
            'bili', [self.embedding_size, 2 * self.rnn_hidden_size],
            initializer=self.weight_init)
        W_top1 = tf.get_variable(
            'W_top1', [2 * self.rnn_hidden_size, self.num_items],
            initializer=self.weight_init)
        b_top1 = tf.get_variable('b_top1', [1, self.num_items],
                                 initializer=self.bias_init)
    # --- embeddings and masked-mean session vector ms ---
    emb_x1 = tf.nn.embedding_lookup(
        Wemb, self.rnn_x1)  # xi (batch_size * maxlen * num_hidden)
    emb_x2 = tf.squeeze(tf.nn.embedding_lookup(Wemb, self.rnn_x2),
                        axis=1)  # xt (batch_size * num_hidden)
    tiled_mask = tf.tile(tf.expand_dims(self.mask_x1, 2),
                         [1, 1, self.rnn_hidden_size
                          ])  # xt (batch_size * maxlen * num_hidden)
    ms = tf.reduce_sum(tf.multiply(emb_x1, tiled_mask),
                       axis=1)  # batch_size * num_hidden
    tiled_var_length = tf.tile(
        tf.reshape(self.batch_var_length, [-1, 1]),
        [1, self.rnn_hidden_size])  # (batch_size * num_hidden)
    # mean = masked sum / true sequence length
    ms = tf.reshape(tf.div(ms, tiled_var_length),
                    [-1, self.rnn_hidden_size])  # batch_size * num_hidden
    # --- item-wise attention weights alpha ---
    outputs1 = tf.transpose(emb_x1,
                            perm=[1, 0, 2])  # maxlen * batch_size * num_hidden
    unnormalized_alpha = tf.map_fn(
        lambda x: compute_alpha_STAMP(x, emb_x2, ms, w0, w1, w2, w3, ba),
        outputs1)  # maxlen * batch_size
    unnormalized_alpha = tf.multiply(tf.transpose(unnormalized_alpha),
                                     self.mask_x1)  # batch_size * maxlen
    self.unnormalized_alpha = unnormalized_alpha
    # softmax normalisation deliberately disabled (see commented line)
    alpha = unnormalized_alpha  # batch_size * maxlen
    #alpha = tf.nn.softmax(unnormalized_alpha + 100000000. * (self.mask_x1 - 1), dim=1) # batch_size * max_len
    self.alpha = alpha
    tiled_alpha = tf.tile(
        tf.expand_dims(alpha, axis=2),
        [1, 1, self.rnn_hidden_size])  # batch_size * maxlen * hidden_size
    self.tiled_alpha = tiled_alpha
    # attention-weighted session vector
    ma = tf.reduce_sum(tf.multiply(emb_x1, tiled_alpha),
                       axis=1)  # batch * hidden
    # NOTE(review): ws/bs/wt/bt only exist when loss_type == "Trilinear";
    # every other loss_type would hit a NameError here — confirm.
    hs = tf.nn.tanh(tf.matmul(ma, ws) + bs)  # batch * hidden
    ht = tf.nn.tanh(tf.matmul(emb_x2, wt) + bt)  # batch * hidden
    # --- prediction head and loss ---
    if self.loss_type == 'EMB':
        proj = tf.concat([hs, ht], 1)
        proj = tf.nn.dropout(proj, self.keep_prob_ho)
        ytem = tf.matmul(Wemb, bili)
        pred = tf.matmul(proj, tf.transpose(ytem))
        self.pred = tf.nn.softmax(pred)
        self.cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=pred, labels=self.rnn_y))
    elif self.loss_type == "Trilinear":
        pred = tf.nn.sigmoid(
            tf.matmul(tf.multiply(ht, hs),
                      tf.transpose(Wemb)))  # batch * n_item
        self.pred = tf.nn.softmax(pred)
        self.cost = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=pred, labels=self.rnn_y))
    elif self.loss_type == "TOP1":
        proj = tf.concat([hs, ht], 1)
        proj = tf.nn.dropout(proj, self.keep_prob_ho)
        pred = tf.matmul(proj, W_top1) + b_top1
        self.pred = tf.nn.tanh(pred)
        self.cost = loss_fn(self.rnn_y, self.pred, self.loss_type)
    elif self.loss_type == "TOP1_variant":
        pred = tf.nn.sigmoid(
            tf.matmul(tf.multiply(ht, hs),
                      tf.transpose(Wemb)))  # batch * n_item
        self.pred = tf.nn.tanh(pred)
        self.cost = loss_fn(self.rnn_y, self.pred, self.loss_type)
    self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.cost)
def validate(ldr_dir, hdr_dir, gen_dir, logs_dir, img_height, img_width):
    """Run single-image HDR-CNN validation over a directory of LDR/HDR pairs.

    For each validation image: predicts an HDR reconstruction, accumulates the
    I/R and direct losses, and writes the alpha mask, predicted HDR, and a
    tone-mapped PNG into ``gen_dir``. Progress and averages go to ``logs_dir``.

    NOTE(review): relies on module-level ``model_dir`` and ``gamma`` defined
    elsewhere in the file — confirm they are set before calling.
    """
    X = tf.placeholder(tf.float32, [1, img_height, img_width, 3])
    Y = tf.placeholder(tf.float32, [1, img_height, img_width, 3])

    valid_ldr_path, _ = get_filepath(ldr_dir, '.png')
    valid_hdr_path, valid_hdr_name = get_filepath(hdr_dir, '.hdr')
    num_valid = len(valid_hdr_path)

    # Data loader: one (LDR, HDR) pair per step.
    dataset = tf.data.Dataset.from_tensor_slices((valid_ldr_path, valid_hdr_path))
    dataset = dataset.map(valid_parse, num_parallel_calls=4)
    dataset = dataset.batch(1)
    # FIX: renamed from `iter`, which shadowed the builtin.
    data_iter = dataset.make_one_shot_iterator()
    ldr_img, hdr_img, Hth = data_iter.get_next()

    alpha = alpha_msk(X)

    # Prediction
    with tf.name_scope('HDR_CNN'):
        hdr_nn = hdrcnn(X, is_training=False, reuse=False)
        hdr_final = get_final_hdr(X, hdr_nn)

    # Loss functions
    with tf.name_scope('Loss'):
        irloss, dirloss = loss_fn(X, hdr_nn, Y)

    saver = tf.train.Saver(tf.global_variables())

    with tf.Session() as sess:
        # Initialize variables, then restore trained weights.
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, model_dir)

        # Validation
        log = "\n========== Validation Begin ==========\n"
        write_logs(logs_dir, log, True)
        valid_start = time.time()
        avg_irloss = 0
        avg_dirloss = 0
        for f in valid_hdr_name:
            valid_img_start = time.time()
            ldr_image, hdr_image, Hth_val = sess.run([ldr_img, hdr_img, Hth])
            alpha_val, hdr_pred, irloss_val, dirloss_val = sess.run(
                [alpha, hdr_final, irloss, dirloss],
                feed_dict={
                    X: ldr_image,
                    Y: hdr_image
                })
            avg_irloss += irloss_val
            avg_dirloss += dirloss_val
            # assumes file names contain exactly one '_' — TODO confirm
            f1, _ = f.split("_")
            img_write(gen_dir, 'alpha_' + f1 + '_HDR.png', alpha_val, 'PNG-FI')
            # Gamma correction (Hth is a per-image scaling factor from the loader).
            hdr_pred_save = np.multiply(Hth_val, np.maximum(hdr_pred, 0.0))
            img_write(gen_dir, 'pred_' + f, hdr_pred_save, 'HDR-FI')
            # Tone mapping
            hdr_pred_gamma = np.power(np.maximum(hdr_pred, 0.0), gamma)
            ldr_tone = reinhard02(hdr_pred_gamma, a=0.18)
            img_write(gen_dir, 'tm_' + f1 + '_HDR.png', ldr_tone, 'PNG-FI')
            log = "Image {}, Time {:2.5f}, Shape = {}, I/R Loss = {:2.5f}, Direct Loss = {:2.5f}".format(
                f, time.time() - valid_img_start, hdr_pred.shape, irloss_val,
                dirloss_val)
            write_logs(logs_dir, log, False)
        log = "\nAverage I/R Loss = {:2.5f}, Average Direct Loss = {:2.5f}".format(
            avg_irloss / num_valid, avg_dirloss / num_valid)
        write_logs(logs_dir, log, False)
        log = "\nValidation Time: {:2.5f}".format(time.time() - valid_start)
        write_logs(logs_dir, log, False)
        log = "\n========== Validation End ==========\n"
        write_logs(logs_dir, log, False)
        # FIX: explicit sess.close() removed — the `with` block closes the session.
if __name__ == "__main__": EMBEDDING_DIM = 100 HIDDEN_DIM = 100 BATCH_SIZE = 256 vocab = build_vocab('data') word_vocab, label_vocab = vocab train_dataset = NERDataset('data', vocab, type='/train') train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=2, collate_fn=custom_collate, shuffle=True) sample_data, sample_target, sample_len = next(iter(train_loader)) sample_data = sample_data.long() model = RNN(EMBEDDING_DIM, HIDDEN_DIM, len(word_vocab), len(label_vocab)) hidden = model.init_hidden(BATCH_SIZE) with torch.no_grad(): tag_scores = model(sample_data, hidden) print(tag_scores.shape) loss = loss_fn(tag_scores, sample_target) print(loss.item()) acc, f1 = accuracy(tag_scores, sample_target) print(acc, f1)
def main(argv=None): print("Running on {}".format(device)) parser = argparse.ArgumentParser( description="Train a transformer for a copy task") add_optimizer_arguments(parser) add_transformer_arguments(parser) add_auxiliary_arguments(parser) args = parser.parse_args(argv) print("args:\n-----\n", args) data_points = [] for model_type in ['rnn', 'lstm', 'transformer']: for max_trained_depth in range(1, 12): for test_depth in range(1, 21): for ii in range(10): if model_type == "transformer": model = SequencePredictorRecurrentTransformer( d_model=16, n_classes=5, sequence_length=args.sequence_length, attention_type=args.attention_type, n_layers=args.n_layers, n_heads=args.n_heads, d_query=8, # used to be d_query dropout=args.dropout, softmax_temp=None, attention_dropout=args.attention_dropout, ) else: model = SequencePredictorRNN( d_model=8 if model_type == 'lstm' else 8, n_classes=5, n_layers=args.n_layers, dropout=args.dropout, rnn_type=model_type) print(f"Created model:\n{model}") model.to(device) model_name = "models_from_colab/agreement_models/model_" + model_type + "_depth_" + str( max_trained_depth) + "_num_" + str(ii) + ".zip" model.load_state_dict( torch.load(model_name, map_location=device)['model_state']) stack_size = test_depth x, y, m = SubjectVerbAgreement.get_seq(stack_size) model.eval() yhat = model(x.unsqueeze(1)) hdn = model.hidden_state # batch x seq x hdn loss, acc = loss_fn(y.unsqueeze(1), yhat, m.unsqueeze(1)) data_points.append({ 'model_type': model_type, 'max_trained_depth': max_trained_depth, 'test_depth': test_depth, 'accuracy': acc }) print("data points:") print(data_points) with open("data_points_sva.txt", "wb") as fp: pickle.dump(data_points, fp) """
def main():
    """Train a sentence classifier (MLP/BiLSTM/BiLSTMAttn/CNN) on pos/neg data.

    Builds the word vocabulary from a pretrained word2vec model, trains for
    ``--epochs`` epochs, and saves the state dict with the best validation
    accuracy to ``--save-path``.
    """
    # Training settings
    parser = ArgumentParser()
    parser.add_argument('-d', '--device', default=None, type=str,
                        help='indices of GPUs to enable (default: None)')
    parser.add_argument('-b', '--batch-size', type=int, default=1024,
                        help='number of batch size for training')
    parser.add_argument('-e', '--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--save-path', type=str, default='result/model.pth',
                        help='path to trained model to save')
    parser.add_argument('--model',
                        choices=['MLP', 'BiLSTM', 'BiLSTMAttn', 'CNN'],
                        default='MLP',
                        help='model name')
    parser.add_argument('--env', choices=['local', 'server'], default='server',
                        help='development environment')
    parser.add_argument('--word-dim', type=int, default=128,
                        help='the dimension of embedding')
    parser.add_argument(
        '--word-lim', type=int, default=None,
        help='If specified, input sequence length is limited from tail.')
    parser.add_argument('--lr', type=float, default=1e-3,
                        help='learning rate')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
    if args.device:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    device = torch.device('cuda:0' if torch.cuda.is_available()
                          and args.device is not None else 'cpu')

    # Pretrained word embeddings define the vocabulary.
    model_w2v = KeyedVectors.load_word2vec_format(W2V_MODEL_FILE[args.env],
                                                  binary=True)
    word_to_id = word2id(model_w2v)
    initial_embedding = load_word_embedding(model_w2v)

    # setup data_loader instances
    train_data_loader = PosNegDataLoader(TRAIN_FILE[args.env], word_to_id,
                                         args.word_lim, args.batch_size,
                                         shuffle=True, num_workers=2)
    valid_data_loader = PosNegDataLoader(VALID_FILE[args.env], word_to_id,
                                         args.word_lim, args.batch_size,
                                         shuffle=False, num_workers=2)

    # build model architecture
    if args.model == 'MLP':
        model = MLP(word_dim=args.word_dim, hidden_size=100,
                    vocab_size=len(word_to_id))
    elif args.model == 'BiLSTM':
        model = BiLSTM(word_dim=args.word_dim, hidden_size=100,
                       vocab_size=len(word_to_id))
    elif args.model == 'BiLSTMAttn':
        model = BiLSTMAttn(word_dim=args.word_dim, hidden_size=100,
                           vocab_size=len(word_to_id))
    elif args.model == 'CNN':
        model = CNN(word_dim=args.word_dim, word_lim=args.word_lim,
                    vocab_size=len(word_to_id))
    else:
        raise ValueError(
            f'model name should be "MLP", "BiLSTM", "BiLSTMAttn", or "CNN", but given {args.model}'
        )
    model.set_initial_embedding(initial_embedding)
    model.to(device)

    # build optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    best_valid_acc = -1
    for epoch in range(1, args.epochs + 1):
        print(f'*** epoch {epoch} ***')
        # train
        model.train()
        total_loss = 0
        total_correct = 0
        for batch_idx, (source, mask, target) in enumerate(train_data_loader):
            source = source.to(device)  # (b, len)
            mask = mask.to(device)  # (b, len)
            target = target.to(device)  # (b)
            # Forward pass
            output = model(source, mask)  # (b, 2)
            loss = loss_fn(output, target)
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            total_correct += metric_fn(output, target)
        print(f'train_loss={total_loss / train_data_loader.n_samples:.3f}',
              end=' ')
        print(
            f'train_accuracy={total_correct / train_data_loader.n_samples:.3f}'
        )

        # validation
        model.eval()
        with torch.no_grad():
            total_loss = 0
            total_correct = 0
            for batch_idx, (source, mask,
                            target) in enumerate(valid_data_loader):
                source = source.to(device)  # (b, len)
                mask = mask.to(device)  # (b, len)
                target = target.to(device)  # (b)
                output = model(source, mask)  # (b, 2)
                # FIX: accumulate a Python float via .item() — the original
                # summed tensors here, unlike the training loop above.
                total_loss += loss_fn(output, target).item()
                total_correct += metric_fn(output, target)
        valid_acc = total_correct / valid_data_loader.n_samples
        print(f'valid_loss={total_loss / valid_data_loader.n_samples:.3f}',
              end=' ')
        print(f'valid_accuracy={valid_acc:.3f}\n')

        # Keep only the best checkpoint by validation accuracy.
        if valid_acc > best_valid_acc:
            torch.save(model.state_dict(), args.save_path)
            best_valid_acc = valid_acc
model = VAE(device).to(device) if args.pretrained != 'None': model.load_state_dict(torch.load(args.pretrained)) optimizer = Adam(model.parameters(), lr) scheduler = ReduceLROnPlateau(optimizer, 'min', patience=10, eps=1e-4) clip_norm = args.clip_norm criterion = loss_fn num_epochs = args.num_epochs logdir = './logdir' for epoch in range(args.num_epochs): for idx, images in enumerate(train_loader): recon_images, mu, logvar = model(images.to(device)) #print(recon_images, mu, logvar) loss, bce, kld = loss_fn(recon_images, images.to(device), mu, logvar) optimizer.zero_grad() loss.backward() optimizer.step() to_print = "Epoch[{}/{}] Loss: {:.3f} {:.3f} {:.3f}".format( epoch + 1, args.num_epochs, loss.data / args.train_batch_size, bce.data / args.train_batch_size, kld.data / args.train_batch_size) print(to_print) torch.save(model.state_dict(), args.output_weights) '''# model runner runner = SupervisedRunner() # model training runner.train( model=model,