def read_vocab(vocab_file, vocab_limit):
    """Load a word->index vocabulary from a JSON dict or a one-token-per-line
    text file, keeping only the vocab_limit lowest (most frequent) indices."""
    if vocab_file.endswith(".json"):
        vocab = load_json(vocab_file)
    else:
        # Text format: one token per line; the line number becomes the index.
        vocab = {l.strip(): c for c, l in enumerate(line_reader(vocab_file))}
    assert vocab["<s>"] == 0  # the sentence-boundary token must map to index 0
    return {w: i for w, i in vocab.items() if i < vocab_limit}
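# A minimal usage sketch (the file name and contents below are hypothetical;
# assumes line_reader/load_json from this repo's generic utilities). The text
# format is frequency-ordered with "<s>" on the first line:
#
#   with open("toy_vocab.txt", "w") as f:
#       f.write("<s>\nthe\nof\nand\n")
#   vocab = read_vocab("toy_vocab.txt", vocab_limit=3)
#   print(vocab)  # {'<s>': 0, 'the': 1, 'of': 2}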
def __init__(self, vocab_file, tag_vocab_file, vocab_limit):
    self.vocab = read_vocab(vocab_file, vocab_limit)
    self.tag_vocab = load_json(tag_vocab_file)
    return e / len(frequent)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-input_dir", help="Directory containing model and vocabulary files.", required=True)
    parser.add_argument("-f", default="W_w", help="Model file (optional), meant for models per epoch.")
    parser.add_argument("-data_path", help="Filepath containing the SCWS dataset.", default="data/SCWS/ratings.txt")
    parser.add_argument("-win_size", default=3, type=int, help="Context window size (n words to the left and n to the right).")
    parser.add_argument("-n_most_freq", type=int, help="Only consider n most freq. words from vocabulary.")
    args = parser.parse_args()

    w_index_path = "{}/w_index.json".format(args.input_dir)
    # model_path = "{}/sg.pickle".format(args.input_dir)
    log.info("Loading model.")
    w_index = load_json(w_index_path)
    if args.n_most_freq:
        w_index = {w: i for w, i in w_index.items() if i < args.n_most_freq + 1}
    print(len(w_index))
    embs = load_npy("{}/{}.npy".format(args.input_dir, args.f))
    c_embs = load_npy("{}/W_c.npy".format(args.input_dir))
    try:
        if args.f == "W_w":
            n = ""  # default model: bias file is Wb.npy
        else:
            # Per-epoch model, e.g. "W_w3" -> Wb3.npy; int() replaces the unsafe eval().
            n = int(args.f[-1])
            assert 0 <= n < 9
        bias = load_npy("{}/Wb{}.npy".format(args.input_dir, n))
    except FileNotFoundError:
        bias = None  # no bias file saved for this model
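# Example invocation (hypothetical script name and directory; the flags are
# the ones defined above). A trailing digit in -f selects the per-epoch
# weight matrix and its matching bias file (e.g. W_w2.npy and Wb2.npy):
#
#   python scws_eval.py -input_dir output/run1 -f W_w2 -win_size 3 -n_most_freq 50000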
import argparse

from bimu.utils.conll_utils import Conll07Reader
from bimu.utils.generic_utils import save_json, load_json


class TagVocab(dict):
    """Mapping from coarse POS tag to integer index, built from CoNLL-07 files."""

    def update_tags(self, input_file):
        reader = Conll07Reader(input_file)
        for sent in reader:
            for tag in sent.cpos:
                if tag not in self:
                    self[tag] = len(self)

    def write(self, output_file):
        save_json(self, output_file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-input_file", required=True)
    parser.add_argument("-append", action="store_true")
    parser.add_argument("-output_file", required=True)
    args = parser.parse_args()

    tag_vocab = TagVocab()
    if args.append:
        # Start from the existing vocabulary and extend it with new tags.
        tag_vocab.update(load_json(args.output_file))
    tag_vocab.update_tags(args.input_file)
    tag_vocab.write(args.output_file)
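# Example usage (hypothetical script and file names; flags as defined above):
#
#   python build_tag_vocab.py -input_file train.conll -output_file tags.json
#   python build_tag_vocab.py -input_file dev.conll -output_file tags.json -append
#
# With -append, tags from dev.conll are added to the vocabulary already stored
# in tags.json rather than starting from scratch.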
def Sequence_Level(self, train_file, test_file, num_label, epochs, tag_vocab_file):
    log.debug("Declaring theano vars.")
    random.seed(5)
    # Three-layer ReLU MLP over concatenated context-window embeddings.
    W1 = theano.shared(0.2 * random.random([self.win * self.dimension, self.hidden1]) - 0.1)
    W2 = theano.shared(0.2 * random.random([self.hidden1, self.hidden2]) - 0.1)
    W3 = theano.shared(0.2 * random.random([self.hidden2, self.hidden3]) - 0.1)
    U = theano.shared(0.2 * random.random([self.hidden3, num_label]) - 0.1)

    x = T.dmatrix("x")  # len(l) by win*dimension
    y = T.lvector("y")
    learn_rate = T.scalar("learn_rate")

    A1 = T.dot(x, W1)
    B1 = A1 * (A1 > 0)  # ReLU
    A2 = T.dot(B1, W2)
    B2 = A2 * (A2 > 0)
    A3 = T.dot(B2, W3)
    B3 = A3 * (A3 > 0)
    G = T.dot(B3, U)
    L1 = T.nnet.softmax(G)  # len(l) by num_label
    # Tanh variant kept for reference:
    # L1 = T.nnet.softmax(T.dot(T.tanh(T.dot(T.tanh(T.dot(T.tanh(T.dot(x, W1)), W2)), W3)), U))
    cost = T.nnet.categorical_crossentropy(L1, y).mean()
    gw1, gw2, gw3, gu = T.grad(cost, [W1, W2, W3, U])
    # gw_x = T.grad(cost, [x])

    log.info("Compiling theano model.")
    # f1: one SGD step; f3: predictions plus per-token discrepancy with gold.
    f1 = theano.function(
        inputs=[x, y, learn_rate],
        outputs=[cost],
        updates=((W1, W1 - learn_rate * gw1),
                 (W2, W2 - learn_rate * gw2),
                 (W3, W3 - learn_rate * gw3),
                 (U, U - learn_rate * gu)))
    # f2 = theano.function(inputs=[x, y], outputs=cost)
    prediction = T.argmax(L1, axis=1)
    discrepancy = prediction - y  # zero where the prediction matches gold
    f3 = theano.function(inputs=[x, y], outputs=[discrepancy, prediction])
    # f4 = theano.function(inputs=[x, y], outputs=gw_x)

    alpha = self.alpha
    log.info("Reading the training and test data.")
    with open(train_file) as f_in:
        train_lines = f_in.readlines()
    with open(test_file) as f_in:
        test_lines = f_in.readlines()

    log.info("Start training.")
    counter = 0
    start = time.time()
    iter_ = epochs
    for j in range(iter_):
        log.info("Epoch: {}...".format(j + 1))
        for i in range(len(train_lines)):
            if i % 1000 == 0:
                log.debug(i)
            counter += 1
            # Linearly decaying learning rate with a floor of 0.01.
            current_alpha = alpha * (iter_ * len(train_lines) - counter) / (iter_ * len(train_lines))
            if current_alpha < 0.01:
                current_alpha = 0.01
            # Each line: space-separated token ids, "|", space-separated label ids.
            parts = train_lines[i].split("|")
            token_list = list(fromstring(parts[0], dtype=int, sep=" "))
            x_ = self.contextwin(token_list)  # len(l) by win*dimension
            y_ = fromstring(parts[1], dtype=int, sep=" ")
            f1(x_, y_, current_alpha)

    # Evaluate on the test set.
    total_num = 0
    total_value = 0
    goldlabels = []
    predictions = []
    goldlabels2 = []   # per-sentence gold labels, for conlleval output
    predictions2 = []  # per-sentence predictions
    for line_ in test_lines:
        parts = line_.split("|")
        token_line = parts[0].strip()
        label_line = parts[1].strip()
        y = fromstring(label_line, dtype=int, sep=" ")
        x = self.contextwin(list(fromstring(token_line, dtype=int, sep=" ")))
        total_num += x.shape[0]
        discrep, preds = f3(x, y)
        goldlabels.extend(list(y))
        goldlabels2.append(list(y))
        predictions.extend(list(preds))
        predictions2.append(list(preds))
        total_value += x.shape[0] - count_nonzero(discrep)
    assert len(goldlabels) == len(predictions)

    # Write out for evaluation with conlleval.
    t_idx = load_json(tag_vocab_file)
    inv_t_idx = {i: t for t, i in t_idx.items()}
    with open("out", "w") as out:
        for gs, ps in zip(goldlabels2, predictions2):
            for g, p in zip(gs, ps):
                out.write("_ _ {} {}\n".format(inv_t_idx[g], inv_t_idx[p]))
            out.write("\n")

    log.info("f1 {}".format(f1_score(goldlabels, predictions, average="weighted")))
    acc = 1.0 * total_value / total_num
    log.info("acc " + str(acc))
    log.info("Training completed: {}s/epoch".format((time.time() - start) / iter_))
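# The compiled Theano graph above amounts to the following NumPy forward pass
# (a reference sketch only, not part of the model code; the softmax is written
# in its numerically stabilized form):

import numpy as np

def mlp_forward(x, W1, W2, W3, U):
    """x: (n_tokens, win*dimension) -> softmax probabilities (n_tokens, num_label)."""
    h1 = np.maximum(x @ W1, 0)   # ReLU, same as A1 * (A1 > 0)
    h2 = np.maximum(h1 @ W2, 0)
    h3 = np.maximum(h2 @ W3, 0)
    scores = h3 @ U
    e = np.exp(scores - scores.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)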
def load(self, load_dir):
    self.w_index = load_json("{}/w_index.json".format(load_dir))
    self.inv_w_index = {i: w for w, i in self.w_index.items()}
    # Word counts are optional; load them only if the file exists.
    if os.path.isfile("{}/w_cn.json".format(load_dir)):
        self.w_cn = load_json("{}/w_cn.json".format(load_dir))
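# Usage sketch (hypothetical directory name): after a training run has saved
# w_index.json (and optionally w_cn.json) into a directory, load() restores
# the word->index mapping, its inverse, and the word counts if present:
#
#   model.load("output/run1")
#   word = model.inv_w_index[0]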