def evaluate_network_from_embs(self, wembs, renew=True):
    params = self.params
    if renew:
        dy.renew_cg()
    builders = params["builders"]
    W = params["W"]
    v = params["v"]
    lstms = [b.initial_state() for b in builders]
    # wembs = [dy.noise(we, 0.1) for we in wembs]

    # run the first BiLSTM layer to get the forward (f) and backward (b) states
    fw_lstm1 = lstms[0].transduce(wembs)
    bw_lstm1 = reversed(lstms[1].transduce(reversed(wembs)))
    inputs_to_2nd_layer = [dy.concatenate([f, b]) for f, b in zip(fw_lstm1, bw_lstm1)]

    fw_lstm2 = lstms[2].transduce(inputs_to_2nd_layer)
    bw_lstm2 = reversed(lstms[3].transduce(reversed(inputs_to_2nd_layer)))
    y = [dy.concatenate([f, b]) for f, b in zip(fw_lstm2, bw_lstm2)]

    tags_hat = [W * t + v for t in y]
    return tags_hat

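# evaluate_network_from_embs() returns raw, unnormalized tag scores. A hypothetical
# readout helper that picks the best tag index per position could look like the
# sketch below; `read_out_tags` is a made-up name and assumes each element of
# tags_hat is a DyNet expression holding one score vector (illustrative only,
# not part of the original model).
import numpy as np

def read_out_tags(tags_hat):
    """Return the index of the highest-scoring tag for each position."""
    return [int(np.argmax(t.npvalue())) for t in tags_hat]
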
def do_cpu():
    # `C` (the dynet module pinned to the CPU) and the parameter `cpW` are assumed
    # to be defined at module level; see the fuller do_cpu() variant further below.
    C.renew_cg()
    W = C.parameter(cpW)
    W = W * W * W * W * W * W * W
    z = C.squared_distance(W, W)
    z.value()
    z.backward()

def predict(self, task, inputs):
    preds = []
    for input in inputs:
        dn.renew_cg()
        out = self(task, input)
        preds.append(np.argmax(out.npvalue()))
    return preds

def build_representations_bi(self, sentence, training, prefix=[], do_not_renew=False):
    if not do_not_renew:
        dy.renew_cg(immediate_compute=True, check_validity=True)
    coded_sentence = self.vocabulary.code_sentence_cw(sentence, training)
    coded_prefix = self.vocabulary.code_sentence_cw(prefix, training)

    w_init_f = self.wrnn[F].initial_state()
    w_init_b = self.wrnn[B].initial_state()

    f_lstm_input = self.get_static_representations(coded_prefix + coded_sentence)
    b_lstm_input = self.get_static_representations(
        coded_prefix + list(reversed(coded_sentence)))

    contextual_embeddings = [
        w_init_f.transduce(f_lstm_input),
        list(reversed(w_init_b.transduce(b_lstm_input))),
    ]

    return (
        dy.concatenate([contextual_embeddings[F][-1], contextual_embeddings[B][0]]),
        [dy.concatenate(list(fb)) for fb in zip(*contextual_embeddings)],
    )

def predict(self, X_test, x_y_vectors=None):
    """
    Predict the classification of the test set
    """
    model = self.model
    model_parameters = self.model_parameters
    builder = self.builder
    test_pred = []

    # Predict MINIBATCH_SIZE instances together, sharing one computation graph
    for chunk in xrange(0, len(X_test), MINIBATCH_SIZE):
        dy.renew_cg()
        path_cache = {}
        test_pred.extend([
            np.argmax(
                process_one_instance(
                    builder, model, model_parameters, path_set, path_cache, self.update,
                    dropout=0.0,
                    x_y_vectors=x_y_vectors[chunk + i] if x_y_vectors is not None else None,
                    num_hidden_layers=self.num_hidden_layers).npvalue())
            for i, path_set in enumerate(X_test[chunk:chunk + MINIBATCH_SIZE])
        ])

    return test_pred

def predict_with_score(self, X_test, x_y_vectors=None):
    """
    Predict the classification of the test set
    """
    model = self.model
    builder = self.builder
    dy.renew_cg()
    path_cache = {}
    test_pred = [
        process_one_instance(
            builder, model, path_set, path_cache, self.update,
            dropout=0.0,
            x_y_vectors=x_y_vectors[i] if x_y_vectors is not None else None,
            num_hidden_layers=self.num_hidden_layers).npvalue()
        for i, path_set in enumerate(X_test)
    ]
    return [(np.argmax(vec), vec[np.argmax(vec)]) for vec in test_pred]

def epoch_train(self, examples):
    count = 0
    dy.renew_cg()
    current_losses = []
    loss_list = []
    for word, context in examples:
        loss = self.get_score(word, context)
        current_losses.append(loss)
        loss_list.append(loss.value())
        if len(current_losses) >= self.batch_size:
            mean_loss = dy.esum(current_losses) / float(len(current_losses))
            mean_loss.forward()
            mean_loss.backward()
            self.optimizer.update()
            current_losses = []
            dy.renew_cg()
        count += 1
        # print the running average loss roughly every 1M examples
        if count % 1000000 == 1000:
            print(count, np.mean(np.array(loss_list)))
            loss_list = []

    if current_losses:
        mean_loss = dy.esum(current_losses) / float(len(current_losses))
        mean_loss.forward()
        mean_loss.backward()
        self.optimizer.update()

def predict(board_ins, board_init):
    act = []
    count = 0
    previous = None
    first = True
    for sentence, env in zip(board_ins, board_init):
        if count % 5 != 0:
            new_sentence = pre_sentence + ' <end> ' + sentence
            new_env = str(execute(new_env, generate))
            if new_env == 'None':
                new_env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'
        else:
            dy.renew_cg()
            new_sentence = sentence
            new_env = env
        generate, previous = generator(encoder, decoder, params_encoder, params_decoder,
                                       new_sentence, new_env, first, previous)
        act.append(generate)
        pre_sentence = sentence
        count += 1
        while '<end>' in generate:
            generate.remove('<end>')

    env_list = []
    final_env_list = []
    for i, env in enumerate(board_init):
        if i % 5 == 0:
            new_env = env
        new_env = str(execute(new_env, act[i]))
        if new_env == 'None':
            new_env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'
        env_list.append(new_env)
        if i % 5 == 4:
            final_env_list.append(new_env)
    return env_list, final_env_list

def compute_batch_loss(encoder, decoder, batch_input_seqs, batch_output_seqs, y2int):
    # renew the computation graph per batch
    dn.renew_cg()

    batch_size = len(batch_input_seqs)

    # encode the batch with the bilstm encoder: each element represents one step in time
    # and is a matrix of 2*h x batch size.
    # for example, for a sentence length of 12, blstm_outputs will be: 12 x 2 x 100 x 16
    # note: begin_seq and end_seq symbols are also added here
    encoded_inputs, input_masks = encoder.encode_batch(batch_input_seqs)

    # concatenate the end-of-sequence symbol to each output sequence
    padded_batch_output_seqs = [seq + [common.END_SEQ] for seq in batch_output_seqs]

    # get output word ids for each step of the decoder
    output_word_ids, output_masks, output_tot = common.get_batch_word_ids(
        padded_batch_output_seqs, y2int)

    total_batch_loss = decoder.compute_decoder_batch_loss(
        encoded_inputs, input_masks, output_word_ids, output_masks, batch_size)

    return total_batch_loss

def get_output(self, sents):
    dy.renew_cg()
    tagged_sents = []
    for sent in sents:
        features, t_feats, _ = self.get_features_for_tagging(sent, False)
        cur_tag_seq, _ = self.crf_module.viterbi_decoding(features, t_feats)
        tagged_sents.append(cur_tag_seq)
    return tagged_sents

def predict(self, feature_vector, task_ids, train=False, soft_labels=False,
            temperature=None, dropout_rate=0.0, orthogonality_weight=0.0,
            domain_id=None):
    dynet.renew_cg()  # new graph

    feature_vector = feature_vector.toarray()
    feature_vector = np.squeeze(feature_vector, axis=0)

    # self.input = dynet.vecInput(self.vocab_size)
    # self.input.set(feature_vector)
    # TODO this takes too long; can we speed this up somehow?
    input = dynet.inputVector(feature_vector)
    for i in range(self.h_layers):
        if train:  # add some noise
            input = dynet.noise(input, self.noise_sigma)
            input = dynet.dropout(input, dropout_rate)
        input = self.layers[i](input)

    outputs = []
    for task_id in task_ids:
        output = self.output_layers_dict[task_id](input, soft_labels=soft_labels,
                                                  temperature=temperature)
        outputs.append(output)

    constraint, adv_loss = 0, 0
    if orthogonality_weight != 0:
        # put the orthogonality constraint either directly on the
        # output layer or on the hidden layer if it's an MLP
        F0_layer = self.output_layers_dict["F0"]
        F1_layer = self.output_layers_dict["F1"]
        F0_param = F0_layer.W_mlp if self.add_hidden else F0_layer.W
        F1_param = F1_layer.W_mlp if self.add_hidden else F1_layer.W
        F0_W = dynet.parameter(F0_param)
        F1_W = dynet.parameter(F1_param)
        # calculate the matrix product of the two task matrices
        matrix_product = dynet.transpose(F0_W) * F1_W
        # take the squared Frobenius norm by squaring
        # every element and then summing them
        squared_frobenius_norm = dynet.sum_elems(dynet.square(matrix_product))
        constraint += squared_frobenius_norm
        # print('Constraint with first matrix:', squared_frobenius_norm.value())

    if domain_id is not None:
        # flip the gradient when back-propagating through here
        adv_input = dynet.flip_gradient(input)  # last state
        adv_output = self.adv_layer(adv_input)
        adv_loss = self.pick_neg_log(adv_output, domain_id)
        # print('Adversarial loss:', avg_adv_loss.value())

    return outputs, constraint, adv_loss

def generate(self, num, limit=40, beam=3):
    dy.renew_cg()
    generated = []
    W = dy.parameter(self.W)
    b = dy.parameter(self.b)

    for wordi in range(num):
        # Initialize the LSTM state with the EOW token.
        start_state = self.lstm.initial_state()
        start_state = start_state.add_input(self.lookup[self.c2i[EOW]])
        best_states = [('', start_state, 0)]
        final_hypotheses = []

        # Perform beam search.
        while len(final_hypotheses) < beam and len(best_states) > 0:
            new_states = []
            for hyp, s, p in best_states:
                # Cut off when we exceed the character limit.
                if len(hyp) >= limit:
                    final_hypotheses.append((hyp, p))
                    continue

                # Get the prediction from the current LSTM state.
                unnormalized = dy.affine_transform([b, W, s.output()])
                softmax = dy.softmax(unnormalized).npvalue()

                # Sample beam number of times.
                for beami in range(beam):
                    ci = sample_softmax(softmax)
                    c = self.i2c[ci]
                    next_p = softmax[ci]
                    logp = p - np.log(next_p)
                    if c == EOW:
                        # Add a final hypothesis if we reach the end of the word.
                        final_hypotheses.append((hyp, logp))
                    else:
                        # Otherwise add to the states to expand at the next time step.
                        new_states.append((hyp + c, s.add_input(self.lookup[ci]), logp))

            # Sort and prune the states to within the beam.
            new_states.sort(key=lambda t: t[-1])
            best_states = new_states[:beam]

        final_hypotheses.sort(key=lambda t: t[-1])
        generated.append(final_hypotheses[0][0])

    return generated

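# `sample_softmax` is called by generate() above but is not defined in this snippet.
# A minimal sketch of such a helper, assuming it draws a single index from a
# probability vector (renormalized to guard against floating-point drift), might be:
import numpy as np

def sample_softmax(probs):
    """Sample one index from a (softmax) probability vector."""
    probs = np.asarray(probs, dtype=np.float64).flatten()
    probs /= probs.sum()
    return int(np.random.choice(len(probs), p=probs))
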
def evaluate_network(self, t):
    dy.renew_cg()
    W = self.params["W"][0]
    b = self.params["b"][0]
    x = dy.vecInput(2)
    x.set(t)
    if self.with_bias[0]:
        output = derlu(W * (x + b))
    else:
        output = derlu(W * x)
    self.last_output = output
    return output

def do_cpu():
    import _dynet as C
    C.init()
    cm = C.Model()
    cpW = cm.add_parameters((1000, 1000))
    s = time.time()
    C.renew_cg()
    W = C.parameter(cpW)
    W = W * W * W * W * W * W * W
    z = C.squared_distance(W, W)
    z.value()
    z.backward()
    print("CPU time:", time.time() - s)

def construct_vector(self, sentence_as_char_codes):
    params = self.params
    dy.renew_cg()
    builder = params["builders"][0]
    E = params["E"]
    sentence_as_wembs = []
    for word in sentence_as_char_codes:
        char_lstm = builder.initial_state()
        cembs = [E[char] for char in word]
        # run the char-level lstm and take the last output as the word vector
        word_vec = char_lstm.transduce(cembs)[-1]
        sentence_as_wembs.append(word_vec)
    return sentence_as_wembs

def train(epoch, trees, policy, trainer, best, best_idx, wrong_total_l):
    # metric_total indices: hyper edge, fragment, hyper-prec, hyper-recall, root
    metric_total = [0] * 6
    micro_total = [0.] * 3
    wrong_at_total = [0.] * 10
    np.random.shuffle(trees)
    loss = 0
    policy.set_dropout(args.path_dropout_rate)

    for i_episode in tqdm(range(len(trees))):
        T = trees[i_episode]
        entropy_l = []
        dy.renew_cg()
        policy.re_init()
        for _ in range(args.n_rollout):
            # prob_l = []
            policy.init_history()
            policy.rewards.append([])
            policy.saved_actions.append([])
            while len(T.V) > 0:
                pair, pr, entropy = select_action(T, policy, choose_max=False, mode='train')
                if pair is None:
                    break
                entropy_l.append(entropy)
                # prob_l.append(pr)
                T.update(pair)
                if args.reward_form != 'last' or len(T.V) == 0:
                    reward = T.eval(reward_type=args.reward, reward_form=args.reward_form)
                else:
                    reward = 0
                policy.rewards[-1].append(reward)
            metric_total, micro_total, wrong_at_total, wrong_total = T.evaluate(
                metric_total, micro_total, wrong_at_total, reward_type='print')
            wrong_total_l.append(wrong_total)
            # scores_save.append(T.evaluate(reward_type=REWARD, return_all=True))
            # prob_save.append(prob_l)
            T.re_init()
        loss += finish_episode(policy, trainer, entropy_l)

    for m_idx in range(5):
        metric_total[m_idx] = round(metric_total[m_idx] / len(trees) / args.n_rollout, 3)
    metric_total[0] = T.f1_calc(metric_total[3], metric_total[4])
    for w_idx in range(len(wrong_at_total)):
        wrong_at_total[w_idx] = round(wrong_at_total[w_idx] / len(trees) / args.n_rollout, 3)
    metric_total[5] /= args.n_rollout

    best, best_idx = update_best(metric_total, best, best_idx, epoch)
    if epoch % 1 == 0:
        print '[train]epoch {}:{} {} {} {}'.format(epoch, metric_total, micro_total,
                                                   get_micro_f1(micro_total),
                                                   wrong_at_total),
        print 'total_loss', loss, 'best', best, best_idx
    return best, best_idx

def evaluate_network_from_sentence(self, sentence):
    dy.renew_cg()
    E = self.p3a.params["E"]
    E_pre = self.params["E_pre"]
    E_suf = self.params["E_suf"]
    input_vectors = []
    pre_suf_pairs = self.encoder.encode_sentence_prefix_suffix(sentence)
    sentence_codes = self.encoder.encode_sentence_words(sentence)
    for i in range(len(sentence)):
        vec = E[sentence_codes[i][0]]
        pre_code, suf_code = pre_suf_pairs[i]
        if pre_code >= 0:
            vec += E_pre[pre_code]
        if suf_code >= 0:
            vec += E_suf[suf_code]
        input_vectors.append(vec)
    return self.p3a.common.evaluate_network_from_embs(input_vectors, False)

def do_gpu():
    import _dynet as G
    import sys
    sys.argv.append('--dynet-devices')
    sys.argv.append('GPU:0')
    G.init()
    gm = G.Model()
    gpW = gm.add_parameters((1000, 1000))
    s = time.time()
    G.renew_cg()
    W = G.parameter(gpW)
    W = W * W * W * W * W * W * W
    z = G.squared_distance(W, W)
    z.value()
    z.backward()
    print("GPU time:", time.time() - s)

def predict(self, seq, train=False, output_confidences=False, unk_tag=None,
            update_embeds=True):
    """
    predict tags for a sentence represented as char+word embeddings
    and compute losses for this instance
    """
    if not train:
        dynet.renew_cg()
    features = self.get_features(seq.words, train=train, update=update_embeds)

    output_expected_at_layer = self.predictors["task_expected_at"][seq.task_id]
    output_expected_at_layer -= 1

    # go through layers; the input is now a combination of word + char embeddings
    prev = features
    prev_rev = features
    num_layers = self.h_layers

    for i in range(0, num_layers):
        predictor = self.predictors["inner"][i]
        forward_sequence, backward_sequence = predictor.predict_sequence(prev, prev_rev)
        if i > 0 and self.activation:
            # activation between LSTM layers
            forward_sequence = [self.activation(s) for s in forward_sequence]
            backward_sequence = [self.activation(s) for s in backward_sequence]

        if i == output_expected_at_layer:
            output_predictor = self.predictors["output_layers_dict"][seq.task_id]
            concat_layer = [dynet.concatenate([f, b]) for f, b in
                            zip(forward_sequence, reversed(backward_sequence))]
            if train and self.noise_sigma > 0.0:
                concat_layer = [dynet.noise(fe, self.noise_sigma) for fe in concat_layer]

            # fill in predictions and get the loss per tag
            losses = output_predictor.predict_sequence(
                seq, concat_layer, train=train, output_confidences=output_confidences,
                unk_tag=unk_tag, dictionary=self.dictionary,
                type_constraint=self.type_constraint)

        prev = forward_sequence
        prev_rev = backward_sequence

    if train:
        # return losses
        return losses
    else:
        return seq.pred_tags, seq.tag_confidences

def __init__(self, word_size, context_fre, context_size, vocab, window=2,
             subsample_n=2000, mode='bow', embed_size=200, batch_size=128,
             num_sampled=5, epoch=6):
    self.embed_size = embed_size
    self.mode = mode
    self.window = window
    self.vocab = vocab
    self.word_size = word_size
    self.subsample_n = subsample_n
    self.context_size = context_size
    self.num_sampled = num_sampled
    self.epoch = epoch
    self.context_fre = context_fre
    self.batch_size = batch_size

    self.pc = dy.ParameterCollection()
    self.optimizer = dy.AdamTrainer(self.pc)
    self.word_embeddings = self.pc.add_lookup_parameters(
        (self.word_size, self.embed_size), name="word-embeddings")
    self.context_embeddings = self.pc.add_lookup_parameters(
        (self.context_size, self.embed_size), name="context-embeddings")
    dy.renew_cg()
    print([(param.name(), param.shape())
           for param in self.pc.lookup_parameters_list() + self.pc.parameters_list()])

def create_network_params(nwords, ntags, external_E=None):
    # create a parameter collection and add the parameters
    print("adding parameters")
    m = dy.ParameterCollection()
    print("nwords: {}".format(nwords))
    E = m.add_lookup_parameters((nwords, EMB), name='E')
    if external_E is not None and sum(external_E.shape) > 0:
        assert external_E.shape[1] == EMB
        external_rows = external_E.shape[0]
        for r in range(external_rows):
            E.init_row(r, external_E[r, :])
    b = m.add_parameters(HIDDEN, name='b')
    U = m.add_parameters((ntags, HIDDEN), name='U')
    W = m.add_parameters((HIDDEN, INPUT), name='W')
    bp = m.add_parameters(ntags, name='bp')
    dy.renew_cg()
    return m, E, b, U, W, bp

def get_top_k_paths(self, all_paths, relation_index, threshold):
    """
    Get the top k scoring paths
    """
    builder = self.builder
    model = self.model
    model_parameters = self.model_parameters
    lemma_lookup = model_parameters['lemma_lookup']
    pos_lookup = model_parameters['pos_lookup']
    dep_lookup = model_parameters['dep_lookup']
    dir_lookup = model_parameters['dir_lookup']

    path_scores = []

    for i, path in enumerate(all_paths):
        if i % 1000 == 0:
            cg = dy.renew_cg()
            W1 = dy.parameter(model_parameters['W1'])
            b1 = dy.parameter(model_parameters['b1'])
            W2 = None
            b2 = None
            if self.num_hidden_layers == 1:
                W2 = dy.parameter(model_parameters['W2'])
                b2 = dy.parameter(model_parameters['b2'])

        path_embedding = get_path_embedding(builder, lemma_lookup, pos_lookup,
                                            dep_lookup, dir_lookup, path)

        if self.use_xy_embeddings:
            zero_word = dy.inputVector([0.0] * self.lemma_embeddings_dim)
            path_embedding = dy.concatenate([zero_word, path_embedding, zero_word])

        h = W1 * path_embedding + b1
        if self.num_hidden_layers == 1:
            h = W2 * dy.tanh(h) + b2

        path_score = dy.softmax(h).npvalue().T
        path_scores.append(path_score)

    path_scores = np.vstack(path_scores)

    top_paths = []
    for i in range(len(relation_index)):
        indices = np.argsort(-path_scores[:, i])
        top_paths.append([(all_paths[index], path_scores[index, i])
                          for index in indices
                          if threshold is None or path_scores[index, i] >= threshold])
    return top_paths

def evaluate_adversary(self, dataset):
    loss = 0
    acc = 0
    tot = len(dataset)
    predictions = []
    for i, ex in enumerate(dataset):
        dy.renew_cg()
        vec, labels = ex
        vec = dy.inputVector(vec)
        l, p = self.adversary_classifier.get_loss_and_prediction(vec, labels)
        predictions.append(p)
        if p == labels:
            acc += 1
        loss += l.value()
    return loss / tot, acc / tot * 100, predictions

def build_representations_mono(self, sentence, training, prefix=[], do_not_renew=False):
    if not do_not_renew:
        dy.renew_cg(immediate_compute=True, check_validity=True)
    coded_sentence = self.vocabulary.code_sentence_cw(sentence, training)
    coded_prefix = self.vocabulary.code_sentence_cw(prefix, training)
    # print(prefix)
    # print(coded_prefix)
    w_init_f = self.wrnn[F].initial_state()
    f_lstm_input = self.get_static_representations(coded_prefix + coded_sentence)
    contextual_embeddings = w_init_f.transduce(f_lstm_input)
    return (contextual_embeddings[-1], contextual_embeddings)

def train(network, trainer, words, epochs, batch_size=100, max_batch_num=5,
          callback=lambda *args: None):
    last_loss = None
    batch_num = min(len(words) // batch_size + 1, max_batch_num)
    for enum in range(epochs):
        eloss = 0
        bnum = 0
        for bi in range(batch_num):
            bwords = np.random.choice(words, size=batch_size, replace=True)
            if len(bwords) < 1:
                continue
            dy.renew_cg()
            loss = network.train_batch(bwords)
            eloss += loss.value()
            loss.backward()
            trainer.update()
            bnum += 1
        eloss = eloss / bnum
        if last_loss:
            last_loss = 0.95 * last_loss + 0.05 * eloss
        else:
            last_loss = eloss
        # print('Epoch {} loss: {:.6f} Running avg.: {:.6f}'.format(
        #     enum + 1, eloss, last_loss))
        callback(enum, eloss, last_loss)
    return last_loss

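# A hypothetical callback for train() above; it matches the
# (epoch_index, epoch_loss, running_loss) arguments the function passes to
# `callback`, and mirrors the commented-out print inside the training loop.
def log_progress(enum, eloss, last_loss):
    print('Epoch {} loss: {:.6f} Running avg.: {:.6f}'.format(enum + 1, eloss, last_loss))

# Example usage (assuming `network`, `trainer`, and `words` are already built):
# train(network, trainer, words, epochs=10, callback=log_progress)
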
def calculate_loss(self, sents):
    dy.renew_cg()
    losses = []
    for sent in sents:
        features, t_features, feat_reconstruct = self.get_features_for_tagging(sent, True)
        gold_tags = [tag for chars, word, feats, tag in sent]
        cur_loss = self.crf_module.negative_log_loss(features, t_features, gold_tags)
        if self.autoencoder:
            autoencoder_loss = [
                dy.binary_log_loss(reconstruct, dy.inputTensor(feats))
                for reconstruct, (chars, word, feats, tag) in zip(feat_reconstruct, sent)
            ]
        else:
            # no autoencoder loss
            autoencoder_loss = [dy.scalarInput(0)]
        losses.append(cur_loss + (dy.esum(autoencoder_loss) / self.featsize))
    return dy.esum(losses)

def test(parser, testing_data, evalb_dir, unsupervised=False, test_bert_embeddings=None):
    test_predicted = []
    for idx, data in enumerate(testing_data):
        dy.renew_cg()
        if test_bert_embeddings is None:
            predicted = parser.parse(data, False)
        else:
            predicted = parser.parse(data, False, test_bert_embeddings[idx])
        test_predicted.append(predicted)

    if unsupervised:
        test_predicted_errs = [x[0] for x in test_predicted]
        test_predicted_span_sets = [x[1] for x in test_predicted]
        test_fscore = evaluate.evalb_US(testing_data, test_predicted_span_sets)
        test_ppl = evaluate.evalb_ppl(testing_data, test_predicted_errs)
        return test_fscore, test_ppl
    else:
        test_fscore = evaluate.evalb(testing_data, test_predicted)
        # test_fscore = evaluate.evalb_tag(testing_data, test_predicted)
        return test_fscore

def predict(self, feature_vector, train=False, soft_labels=False, temperature=None,
            dropout_rate=None):
    dynet.renew_cg()  # new graph

    feature_vector = feature_vector.toarray()
    feature_vector = np.squeeze(feature_vector, axis=0)

    # self.input = dynet.vecInput(self.vocab_size)
    # self.input.set(feature_vector)
    # TODO this takes too long; can we speed this up somehow?
    input = dynet.inputVector(feature_vector)
    for i in range(self.h_layers - 1):
        if train:  # add some noise
            input = dynet.noise(input, self.noise_sigma)
            input = dynet.dropout(input, dropout_rate)
        input = self.layers[i](input)
    output = self.layers[-1](input, soft_labels=soft_labels, temperature=temperature)
    return output

def main():
    parser = argparse.ArgumentParser(
        description='Convolutional Neural Networks for Sentence Classification in DyNet')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--train_x_path', type=str, default='./data/train_x.txt',
                        help='File path of train x data [default: `./data/train_x.txt`]')
    parser.add_argument('--train_y_path', type=str, default='./data/train_y.txt',
                        help='File path of train y data [default: `./data/train_y.txt`]')
    parser.add_argument('--valid_x_path', type=str, default='./data/valid_x.txt',
                        help='File path of valid x data [default: `./data/valid_x.txt`]')
    parser.add_argument('--valid_y_path', type=str, default='./data/valid_y.txt',
                        help='File path of valid y data [default: `./data/valid_y.txt`]')
    parser.add_argument('--n_epochs', type=int, default=10,
                        help='Number of epochs [default: 10]')
    parser.add_argument('--batch_size', type=int, default=64,
                        help='Mini batch size [default: 64]')
    parser.add_argument('--win_sizes', type=int, nargs='*', default=[3, 4, 5],
                        help='Window sizes of filters [default: [3, 4, 5]]')
    parser.add_argument('--num_fil', type=int, default=100,
                        help='Number of filters in each window size [default: 100]')
    parser.add_argument('--s', type=float, default=3.0,
                        help='L2 norm constraint on w [default: 3.0]')
    parser.add_argument('--dropout_prob', type=float, default=0.5,
                        help='Dropout probability [default: 0.5]')
    parser.add_argument('--v_strategy', type=str, default='static',
                        help='Embedding strategy. '
                             'rand: Random initialization. '
                             'static: Load pretrained embeddings and do not update during the training. '
                             'non-static: Load pretrained embeddings and update during the training. '
                             '[default: static]')
    parser.add_argument('--alloc_mem', type=int, default=4096,
                        help='Amount of memory to allocate [mb] [default: 4096]')
    args = parser.parse_args()
    print(args)

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    N_EPOCHS = args.n_epochs
    WIN_SIZES = args.win_sizes
    BATCH_SIZE = args.batch_size
    EMB_DIM = 300
    OUT_DIM = 1
    L2_NORM_LIM = args.s
    NUM_FIL = args.num_fil
    DROPOUT_PROB = args.dropout_prob
    V_STRATEGY = args.v_strategy
    ALLOC_MEM = args.alloc_mem

    if V_STRATEGY in ['rand', 'static', 'non-static']:
        NUM_CHA = 1
    else:
        NUM_CHA = 2

    # FILE paths
    W2V_PATH = './GoogleNews-vectors-negative300.bin'
    TRAIN_X_PATH = args.train_x_path
    TRAIN_Y_PATH = args.train_y_path
    VALID_X_PATH = args.valid_x_path
    VALID_Y_PATH = args.valid_y_path

    # DyNet setting
    dyparams = dy.DynetParams()
    dyparams.set_random_seed(RANDOM_SEED)
    dyparams.set_mem(ALLOC_MEM)
    dyparams.init()

    # Load pretrained embeddings
    pretrained_model = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)
    vocab = pretrained_model.wv.vocab.keys()
    w2v = pretrained_model.wv

    # Build dataset ====================================================================
    w2c = build_w2c(TRAIN_X_PATH, vocab=vocab)
    w2i, i2w = build_w2i(TRAIN_X_PATH, w2c, unk='unk')
    train_x, train_y = build_dataset(TRAIN_X_PATH, TRAIN_Y_PATH, w2i, unk='unk')
    valid_x, valid_y = build_dataset(VALID_X_PATH, VALID_Y_PATH, w2i, unk='unk')

    train_x, train_y = sort_data_by_length(train_x, train_y)
    valid_x, valid_y = sort_data_by_length(valid_x, valid_y)

    VOCAB_SIZE = len(w2i)
    print('VOCAB_SIZE:', VOCAB_SIZE)

    V_init = init_V(w2v, w2i)

    with open(os.path.join(RESULTS_DIR, './w2i.dump'), 'wb') as f_w2i, \
            open(os.path.join(RESULTS_DIR, './i2w.dump'), 'wb') as f_i2w:
        pickle.dump(w2i, f_w2i)
        pickle.dump(i2w, f_i2w)

    # Build model ======================================================================
    model = dy.Model()
    trainer = dy.AdamTrainer(model)

    # V1
    V1 = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
    if V_STRATEGY in ['static', 'non-static', 'multichannel']:
        V1.init_from_array(V_init)
    if V_STRATEGY in ['static', 'multichannel']:
        V1_UPDATE = False
    else:  # 'rand', 'non-static'
        V1_UPDATE = True
    make_emb_zero(V1, [w2i['<s>'], w2i['</s>']], EMB_DIM)

    # V2
    if V_STRATEGY == 'multichannel':
        V2 = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))
        V2.init_from_array(V_init)
        V2_UPDATE = True
        make_emb_zero(V2, [w2i['<s>'], w2i['</s>']], EMB_DIM)

    layers = [
        CNNText(model, EMB_DIM, WIN_SIZES, NUM_CHA, NUM_FIL, dy.tanh, DROPOUT_PROB),
        Dense(model, 3 * NUM_FIL, OUT_DIM, dy.logistic)
    ]

    # Train model ======================================================================
    n_batches_train = math.ceil(len(train_x) / BATCH_SIZE)
    n_batches_valid = math.ceil(len(valid_x) / BATCH_SIZE)
    start_time = time.time()

    for epoch in range(N_EPOCHS):
        # Train
        loss_all_train = []
        pred_all_train = []
        for i in tqdm(range(n_batches_train)):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters(layers)

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            x = build_batch(train_x[start:end], w2i, max(WIN_SIZES)).T
            t = np.array(train_y[start:end])
            sen_len = x.shape[0]

            if V_STRATEGY in ['rand', 'static', 'non-static']:
                x_embs = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs = dy.transpose(x_embs)
                x_embs = dy.reshape(x_embs, (sen_len, EMB_DIM, 1))
            else:  # multichannel
                x_embs1 = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs2 = dy.concatenate_cols(
                    [dy.lookup_batch(V2, x_t, update=V2_UPDATE) for x_t in x])
                x_embs1 = dy.transpose(x_embs1)
                x_embs2 = dy.transpose(x_embs2)
                x_embs = dy.concatenate([x_embs1, x_embs2], d=2)

            t = dy.inputTensor(t, batched=True)
            y = forwards(layers, x_embs, test=False)

            # Forward prop
            mb_loss = dy.mean_batches(dy.binary_log_loss(y, t))
            loss_all_train.append(mb_loss.value())
            pred_all_train.extend(list(binary_pred(y.npvalue().flatten())))

            # Backward prop
            mb_loss.backward()
            trainer.update()

            # L2 norm constraint
            layers[1].scale_W(L2_NORM_LIM)

            # Make padding embs zero
            if V_STRATEGY in ['rand', 'non-static']:
                make_emb_zero(V1, [w2i['<s>'], w2i['</s>']], EMB_DIM)
            elif V_STRATEGY in ['multichannel']:
                make_emb_zero(V2, [w2i['<s>'], w2i['</s>']], EMB_DIM)

        # Valid
        loss_all_valid = []
        pred_all_valid = []
        for i in range(n_batches_valid):
            # Create a new computation graph
            dy.renew_cg()
            associate_parameters(layers)

            # Create a mini batch
            start = i * BATCH_SIZE
            end = start + BATCH_SIZE
            x = build_batch(valid_x[start:end], w2i, max(WIN_SIZES)).T
            t = np.array(valid_y[start:end])
            sen_len = x.shape[0]

            if V_STRATEGY in ['rand', 'static', 'non-static']:
                x_embs = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs = dy.transpose(x_embs)
                x_embs = dy.reshape(x_embs, (sen_len, EMB_DIM, 1))
            else:  # multichannel
                x_embs1 = dy.concatenate_cols(
                    [dy.lookup_batch(V1, x_t, update=V1_UPDATE) for x_t in x])
                x_embs2 = dy.concatenate_cols(
                    [dy.lookup_batch(V2, x_t, update=V2_UPDATE) for x_t in x])
                x_embs1 = dy.transpose(x_embs1)
                x_embs2 = dy.transpose(x_embs2)
                x_embs = dy.concatenate([x_embs1, x_embs2], d=2)

            t = dy.inputTensor(t, batched=True)
            y = forwards(layers, x_embs, test=True)

            # Forward prop
            mb_loss = dy.mean_batches(dy.binary_log_loss(y, t))
            loss_all_valid.append(mb_loss.value())
            pred_all_valid.extend(list(binary_pred(y.npvalue().flatten())))

        print('EPOCH: %d, Train Loss:: %.3f (F1:: %.3f, Acc:: %.3f), '
              'Valid Loss:: %.3f (F1:: %.3f, Acc:: %.3f), Time:: %.3f[s]' % (
                  epoch + 1,
                  np.mean(loss_all_train),
                  f1_score(train_y, pred_all_train),
                  accuracy_score(train_y, pred_all_train),
                  np.mean(loss_all_valid),
                  f1_score(valid_y, pred_all_valid),
                  accuracy_score(valid_y, pred_all_valid),
                  time.time() - start_time,
              ))

        # Save model ===================================================================
        if V_STRATEGY in ['rand', 'static', 'non-static']:
            dy.save(os.path.join(RESULTS_DIR, './model_e' + str(epoch + 1)), [V1] + layers)
        else:
            dy.save(os.path.join(RESULTS_DIR, './model_e' + str(epoch + 1)), [V1, V2] + layers)

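# `make_emb_zero` is used in main() above but is not defined in this snippet.
# A minimal sketch, assuming it simply overwrites the embedding rows of the given
# word ids (here the <s> and </s> padding tokens) with zeros, could be:
import numpy as np

def make_emb_zero(V, word_ids, emb_dim):
    """Zero out the lookup-parameter rows for the given word ids."""
    for word_id in word_ids:
        V.init_row(word_id, np.zeros(emb_dim))
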
def fit(self, train, num_iterations, dev=None, model_path=None, patience=0,
        minibatch_size=0, log_losses=False):
    """
    train the tagger
    """
    losses_log = {}  # log losses

    print("init parameters")
    self.init_parameters(train)

    # init lookup parameters and define graph
    print("build graph")
    self.build_computation_graph(len(self.w2i), len(self.c2i))

    update_embeds = True
    if self.backprob_embeds == False:  # disable backprop into embeddings
        print(">>> disable wembeds update <<<")
        update_embeds = False

    best_val_acc, epochs_no_improvement = 0.0, 0

    if dev and model_path is not None and patience > 0:
        print('Using early stopping with patience of {}...'.format(patience))

    batch = []
    print("train..")
    for iteration in range(num_iterations):
        total_loss = 0.0
        total_tagged = 0.0

        indices = [i for i in range(len(train.seqs))]
        random.shuffle(indices)

        loss_accum_loss = defaultdict(float)
        loss_accum_tagged = defaultdict(float)

        for idx in indices:
            seq = train.seqs[idx]
            if seq.task_id not in losses_log:
                losses_log[seq.task_id] = []  # initialize

            if minibatch_size > 1:
                # accumulate instances for a minibatch update
                loss1 = self.predict(seq, train=True, update_embeds=update_embeds)
                total_tagged += len(seq.words)
                batch.append(loss1)
                if len(batch) == minibatch_size:
                    loss = dynet.esum(batch)
                    total_loss += loss.value()

                    # logging
                    loss_accum_tagged[seq.task_id] += len(seq.words)
                    loss_accum_loss[seq.task_id] += loss.value()

                    loss.backward()
                    self.trainer.update()
                    dynet.renew_cg()  # use a new computation graph per BATCH when batching is active
                    batch = []
            else:
                dynet.renew_cg()  # new graph per item
                loss1 = self.predict(seq, train=True, update_embeds=update_embeds)
                total_tagged += len(seq.words)
                lv = loss1.value()
                total_loss += lv

                # logging
                loss_accum_tagged[seq.task_id] += len(seq.words)
                loss_accum_loss[seq.task_id] += loss1.value()

                loss1.backward()
                self.trainer.update()

        print("iter {2} {0:>12}: {1:.2f}".format("total loss", total_loss / total_tagged, iteration))

        # log losses
        for task_id in sorted(losses_log):
            losses_log[task_id].append(loss_accum_loss[task_id] / loss_accum_tagged[task_id])

        if log_losses:
            dill.dump(losses_log, open(model_path + ".model" + ".losses.pickle", "wb"))

        if dev:
            # evaluate after every epoch
            correct, total = self.evaluate(dev, "task0")
            val_accuracy = correct / total
            print("dev accuracy: {0:.4f}".format(val_accuracy))

            if val_accuracy > best_val_acc:
                print('Accuracy {0:.4f} is better than best val accuracy '
                      '{1:.4f}.'.format(val_accuracy, best_val_acc))
                best_val_acc = val_accuracy
                epochs_no_improvement = 0
                save(self, model_path)
            else:
                print('Accuracy {0:.4f} is worse than best val accuracy {1:.4f}.'.format(
                    val_accuracy, best_val_acc))
                epochs_no_improvement += 1
            if patience > 0:
                if epochs_no_improvement == patience:
                    print('No improvement for {} epochs. Early stopping...'.format(
                        epochs_no_improvement))
                    break

def create_computation_graph(num_lemmas, num_pos, num_dep, num_directions, num_relations,
                             wv=None, use_xy_embeddings=False, num_hidden_layers=0,
                             lemma_dimension=50):
    """
    Initialize the model
    :param num_lemmas: number of distinct lemmas
    :param num_pos: number of distinct part-of-speech tags
    :param num_dep: number of distinct dependency labels
    :param num_directions: number of distinct path directions (e.g. >, <)
    :param num_relations: number of classes (e.g. binary = 2)
    :param wv: pre-trained word embeddings file
    :param use_xy_embeddings: whether to concatenate the x and y word embeddings to the network input
    :param num_hidden_layers: the number of hidden layers for the term-pair classification network
    :param lemma_dimension: the dimension of the lemma embeddings
    :return:
    """
    # model = Model() -- gives error? tried to fix by looking at dynet tutorial examples -- GB
    dy.renew_cg()
    model = dy.ParameterCollection()

    network_input = LSTM_HIDDEN_DIM
    builder = dy.LSTMBuilder(NUM_LAYERS, lemma_dimension + POS_DIM + DEP_DIM + DIR_DIM,
                             network_input, model)

    # Concatenate x and y
    if use_xy_embeddings:
        network_input += 2 * lemma_dimension

    # 'the optimal size of the hidden layer is usually between the size of the input
    # and the size of the output layers'
    hidden_dim = int((network_input + num_relations) / 2)

    model_parameters = {}
    if num_hidden_layers == 0:
        model_parameters['W1'] = model.add_parameters((num_relations, network_input))
        model_parameters['b1'] = model.add_parameters((num_relations, 1))
    elif num_hidden_layers == 1:
        model_parameters['W1'] = model.add_parameters((hidden_dim, network_input))
        model_parameters['b1'] = model.add_parameters((hidden_dim, 1))
        model_parameters['W2'] = model.add_parameters((num_relations, hidden_dim))
        model_parameters['b2'] = model.add_parameters((num_relations, 1))
    else:
        raise ValueError('Only 0 or 1 hidden layers are supported')

    model_parameters['lemma_lookup'] = model.add_lookup_parameters(
        (num_lemmas, lemma_dimension))

    # Pre-trained word embeddings
    if wv is not None:
        model_parameters['lemma_lookup'].init_from_array(wv)

    model_parameters['pos_lookup'] = model.add_lookup_parameters((num_pos, POS_DIM))
    model_parameters['dep_lookup'] = model.add_lookup_parameters((num_dep, DEP_DIM))
    model_parameters['dir_lookup'] = model.add_lookup_parameters((num_directions, DIR_DIM))

    return builder, model, model_parameters