def run_senseinfer(mbatch_size):
    """
    Obtain sense predictions for every word in the text.
    """
    start = time.time()
    log.info("Starting inference.")
    pivots = []
    contexts = []
    labels = []
    max_y_is_1s = []
    cur_mbatch_size = 0
    all_ss = []
    for i, (p, cs, ls, max_y_is_1, n_line) in enumerate(
            mbatch_skipgrams_inference(line_reader(args.corpus_path), v,
                                       max_window_size=args.max_window_size), 1):
        pivots.append(p)
        contexts.append(cs)
        labels.append(ls)
        max_y_is_1s.append(max_y_is_1)
        cur_mbatch_size += 1
        if cur_mbatch_size != mbatch_size:
            continue
        if pivots:
            x_p = np.array(pivots, dtype="int32")  # n_batches
            X_c = np.array(contexts, dtype="int32")  # n_batches*n_contexts
            L = np.array(labels, dtype="int32")  # n_batches*n_contexts
            max_y = np.array(max_y_is_1s, dtype="int32")  # n_batches
            #M_pad = X_c > 0  # mask for padded
            assert X_c.shape[1] == L.shape[1]
            assert X_c.shape[0] == L.shape[0] == x_p.shape[0] == len(max_y)  #== M_pad.shape[0]
            p, ss = modl.train(x_p, X_c, L, max_y)  # n_batches
            all_ss.extend(ss)
        cur_mbatch_size = 0
        pivots = []
        contexts = []
        labels = []
        max_y_is_1s = []
        if i % 10000 == 0:
            rep = "Number of instances: {0}; sequences: {1}".format(i, n_line)
            dyn_print(rep)
    # deal with out-of-mbatch
    if pivots:
        x_p = np.array(pivots, dtype="int32")  # n_batches
        X_c = np.array(contexts, dtype="int32")  # n_batches*n_contexts
        L = np.array(labels, dtype="int32")  # n_batches*n_contexts
        max_y = np.array(max_y_is_1s, dtype="int32")  # n_batches
        #M_pad = X_c > 0  # mask for padded
        assert X_c.shape[1] == L.shape[1]
        assert X_c.shape[0] == L.shape[0] == x_p.shape[0] == len(max_y)  #== M_pad.shape[0]
        p, ss = modl.train(x_p, X_c, L, max_y)  # n_batches
        all_ss.extend(ss)
        rep = "Number of instances: {0}; sequences: {1}".format(i, n_line)
        dyn_print(rep)
    print("\n")
    log.info("Inference completed: {}s".format(time.time() - start))
    return all_ss
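# Illustrative sketch (not from the original repo): the minibatch tensors handed to
# modl.train() above are plain int32 numpy arrays -- a 1-D pivot vector plus 2-D
# context/label matrices with one row per pivot. The toy values below (mbatch_size=2,
# 4 context slots per pivot, 0 as padding id) are assumptions for demonstration only.
import numpy as np

x_p = np.array([5, 7], dtype="int32")                          # (mbatch_size,)
X_c = np.array([[1, 2, 3, 0], [4, 5, 0, 0]], dtype="int32")    # (mbatch_size, n_contexts)
L = np.array([[1, 1, 0, 0], [1, 0, 0, 0]], dtype="int32")      # positive/negative labels
max_y = np.array([1, 1], dtype="int32")                        # (mbatch_size,)
assert X_c.shape == L.shape
assert X_c.shape[0] == L.shape[0] == x_p.shape[0] == len(max_y)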
def save(self, save_dir):
    log.info("Saving vocabulary.")
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_json(self.w_index, "{}/w_index.json".format(save_dir))
    save_json(self.w_cn, "{}/w_cn.json".format(save_dir))
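# Illustrative sketch (an assumption, not confirmed by the repo): w_index presumably maps
# words to integer ids and w_cn maps words to corpus counts, so the two JSON files written
# above would be plain string-keyed dictionaries. Toy content only.
import json

toy_w_index = {"the": 1, "bank": 2}   # word -> vocabulary index
toy_w_cn = {"the": 120, "bank": 7}    # word -> corpus frequency
print(json.dumps(toy_w_index), json.dumps(toy_w_cn))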
def run(mbatch_size, S_inferred):
    def update():
        """
        Update the counts of senses in S_inferred in place.
        """
        # pivots can repeat, intermediate updates need to be cached
        if modl_type == SensesInference:
            np.add.at(S_inferred, (p, ss), 1)
        elif modl_type == SensesExpectationInference:
            np.add.at(S_inferred, p, ss)
        else:
            sys.exit("Wrong inference type.")

    start = time.time()
    log.info("Starting inference.")
    pivots = []
    contexts = []
    labels = []
    max_y_is_1s = []
    cur_mbatch_size = 0
    for i, (p, cs, ls, max_y_is_1, n_line) in enumerate(
            mbatch_skipgrams_inference(line_reader(args.corpus_path), v,
                                       max_window_size=args.max_window_size), 1):
        pivots.append(p)
        contexts.append(cs)
        labels.append(ls)
        max_y_is_1s.append(max_y_is_1)
        cur_mbatch_size += 1
        if cur_mbatch_size != mbatch_size:
            continue
        if pivots:
            x_p = np.array(pivots, dtype="int32")  # n_batches
            X_c = np.array(contexts, dtype="int32")  # n_batches*n_contexts
            L = np.array(labels, dtype="int32")  # n_batches*n_contexts
            max_y = np.array(max_y_is_1s, dtype="int32")  # n_batches
            #M_pad = X_c > 0  # mask for padded
            assert X_c.shape[1] == L.shape[1]
            assert X_c.shape[0] == L.shape[0] == x_p.shape[0] == len(max_y)  #== M_pad.shape[0]
            #if args.model == "senses" or args.model == "senses_expect":
            # update() reads p and ss, so the sense-inference step must run here:
            p, ss = modl.train(x_p, X_c, L, max_y)  # n_batches
            update()
        cur_mbatch_size = 0
        pivots = []
        contexts = []
        labels = []
        max_y_is_1s = []
        if i % 10000 == 0:
            rep = "Number of instances: {0}; sequences: {1}".format(i, n_line)
            dyn_print(rep)
    print("\n")
    log.info("Inference completed: {}s".format(time.time() - start))
    return S_inferred
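# Illustrative sketch (not from the original repo) of the two accumulation modes used in
# update() above. np.add.at performs unbuffered in-place addition, so repeated pivot ids
# are counted correctly. Vocabulary size 4 and 3 senses are assumed toy values.
import numpy as np

S = np.zeros((4, 3))                     # sense counts: n_words x n_senses
p = np.array([2, 2, 1])                  # pivot word ids (note the repeat)
ss_argmax = np.array([0, 0, 2])          # one winning sense per pivot ("argmax" mode)
np.add.at(S, (p, ss_argmax), 1)          # S[2, 0] ends up as 2, not 1

S_exp = np.zeros((4, 3))
ss_expect = np.array([[0.7, 0.2, 0.1],   # a full sense distribution per pivot
                      [0.6, 0.3, 0.1],   # ("expect" mode)
                      [0.1, 0.1, 0.8]])
np.add.at(S_exp, p, ss_expect)           # rows for repeated pivots are summed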
def run_fastskipgram(nepochs, mbatch_size):
    start = time.time()
    total_eff_len = 0
    for e in range(nepochs):
        log.info("Epoch {}.".format(e))
        losses = []
        n_instances = 0
        cur_mbatch_size = 0
        pairs = []
        labels = []
        for i, line in enumerate(line_reader(args.corpus_path), 1):
            ps, ls, eff_len = skipgrams(v.line_to_seq(line),
                                        max_window_size=args.max_window_size,
                                        n_neg=args.n_neg,
                                        sampling_tab=s_tab,
                                        subsampling_tab=ss_tab)
            pairs.extend(ps)
            labels.extend(ls)
            cur_mbatch_size += 1
            total_eff_len += eff_len
            if cur_mbatch_size != mbatch_size:
                continue
            else:
                cur_mbatch_size = 0
            #if args.optimizer == "SGDDynamic":
            #    lrate = max(args.min_lr, args.lr * (1 - total_eff_len / total_len))
            #else:
            lrate = args.lr
            if pairs:
                X = np.array(pairs, dtype="int32")
                labs = np.array(labels).reshape(-1, 1)
                #loss = sg.train(X, labs, lrate)
                loss = modl.train(X, labs)
                losses.append(loss)
                n_instances += len(labels)
            pairs = []
            labels = []
            if i % 10000 == 0:
                if not w1 or not w2:
                    sc = "-"
                else:
                    sc = sim(modl.params[0].get_value()[w1], modl.params[0].get_value()[w2])
                rep = "Number of instances: {0}; sequences: {1}; Fr/Ge pivot cos:{2}".format(n_instances, i, sc)
                if len(losses) > 0:
                    rep = "{} {}".format(rep, "Mean loss over last {} seqs: {}.".format(len(losses) * mbatch_size,
                                                                                        np.mean(losses)))
                    losses = []
                dyn_print(rep)
        print("\n")
    log.info("Training completed: {}s/epoch".format((time.time() - start) / nepochs))
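# Illustrative sketch (not the repo's skipgrams() implementation): a minimal generator of
# (pivot, context) index pairs with 1/0 labels for negative sampling, which is the format
# the loop above feeds to modl.train(X, labs). The window size, uniform negative sampler,
# and toy vocabulary size are simplifying assumptions.
import random
import numpy as np

def toy_skipgram_pairs(seq, window=2, n_neg=1, vocab_size=100, seed=0):
    rng = random.Random(seed)
    pairs, labels = [], []
    for i, pivot in enumerate(seq):
        for j in range(max(0, i - window), min(len(seq), i + window + 1)):
            if j == i:
                continue
            pairs.append((pivot, seq[j]))            # observed pair -> positive
            labels.append(1)
            for _ in range(n_neg):
                pairs.append((pivot, rng.randrange(1, vocab_size)))  # sampled negative
                labels.append(0)
    return pairs, labels

pairs, labels = toy_skipgram_pairs([4, 8, 15, 16])
X = np.array(pairs, dtype="int32")        # shape: (n_pairs, 2)
labs = np.array(labels).reshape(-1, 1)    # shape: (n_pairs, 1), as in the loop above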
def write_senses(sense_preds):
    """
    Write sense predictions by appending to words, like in 'the/1'
    (here, second sense is assigned, counting from 0).
    """
    corpus_path_senses = "{}.senses".format(args.corpus_path)
    log.info("Writing to {}.".format(corpus_path_senses))
    p_count = 0
    with open(corpus_path_senses, "w") as outfile:
        for l in line_reader(args.corpus_path):
            new_l = []
            for w in l.strip().split(" "):
                new_l.append("{0}/{1}".format(w, sense_preds[p_count]))
                p_count += 1
            outfile.write(" ".join(new_l) + "\n")
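# Illustrative sketch (not from the original repo) of the output format produced by
# write_senses(): each token is suffixed with its predicted sense id, counting from 0.
# The sentence and predictions below are made up.
line = "the bank of the river"
sense_preds = [0, 1, 0, 0, 2]
annotated = " ".join("{0}/{1}".format(w, s) for w, s in zip(line.split(" "), sense_preds))
# annotated == "the/0 bank/1 of/0 the/0 river/2"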
        e += entropy(v)
    return e / len(frequent)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-input_dir", help="Directory containing model and vocabulary files.", required=True)
    parser.add_argument("-f", default="W_w", help="Model file (optional), meant for models per epoch.")
    parser.add_argument("-data_path", help="Filepath containing the SCWS dataset.", default="data/SCWS/ratings.txt")
    parser.add_argument("-win_size", default=3, type=int,
                        help="Context window size (n words to the left and n to the right).")
    parser.add_argument("-n_most_freq", type=int, help="Only consider n most freq. words from vocabulary.")
    args = parser.parse_args()

    w_index_path = "{}/w_index.json".format(args.input_dir)
    # model_path = "{}/sg.pickle".format(args.input_dir)
    log.info("Loading model.")
    w_index = load_json(w_index_path)
    if args.n_most_freq:
        w_index = {w: i for w, i in w_index.items() if i < args.n_most_freq + 1}
        print(len(w_index))
    embs = load_npy("{}/{}.npy".format(args.input_dir, args.f))
    c_embs = load_npy("{}/W_c.npy".format(args.input_dir))
    try:
        if args.f == "W_w":
            n = ""
        else:
            n = eval(args.f[-1])
            assert 0 <= n < 9
        bias = load_npy("{}/Wb{}.npy".format(args.input_dir, n))
    except FileNotFoundError:
def Sequence_Level(self, train_file, test_file, num_label, epochs):
    log.debug("Declaring theano vars.")
    random.seed(5)
    W1 = theano.shared(0.2 * random.random([self.win * self.dimension, self.hidden1]) - 0.1)
    W2 = theano.shared(0.2 * random.random([self.hidden1, self.hidden2]) - 0.1)
    W3 = theano.shared(0.2 * random.random([self.hidden2, self.hidden3]) - 0.1)
    U = theano.shared(0.2 * random.random([self.hidden3, num_label]) - 0.1)

    x = T.dmatrix("x")  # len(l) by win*dimension
    y = T.lvector("y")
    learn_rate = T.scalar("learn_rate")

    A1 = T.dot(x, W1)
    B1 = A1 * (A1 > 0)
    A2 = T.dot(B1, W2)
    B2 = A2 * (A2 > 0)
    A3 = T.dot(B2, W3)
    B3 = A3 * (A3 > 0)
    G = T.dot(B3, U)
    L1 = T.nnet.softmax(G)  # len(l) by num_label
    #L1 = T.nnet.softmax(T.dot(T.tanh(T.dot(T.tanh(T.dot(T.tanh(T.dot(x, W1)), W2)), W3)), U))
    cost = T.nnet.categorical_crossentropy(L1, y).mean()
    gw1, gw2, gw3, gu = T.grad(cost, [W1, W2, W3, U])
    #gw_x = T.grad(cost, [x])

    log.info("Compiling theano model.")
    f1 = theano.function(inputs=[x, y, learn_rate],
                         outputs=[cost],
                         updates=((W1, W1 - learn_rate * gw1),
                                  (W2, W2 - learn_rate * gw2),
                                  (W3, W3 - learn_rate * gw3),
                                  (U, U - learn_rate * gu)))
    #f2 = theano.function(inputs=[x, y], outputs=cost)
    prediction = T.argmax(L1, axis=1)
    discrepancy = prediction - y
    f3 = theano.function(inputs=[x, y], outputs=[discrepancy, prediction])
    #f4 = theano.function(inputs=[x, y], outputs=gw_x)

    alpha = self.alpha
    log.info("Read-in the training and test data.")
    open_train = open(train_file, "r")
    train_lines = open_train.readlines()
    open_test = open(test_file, "r")
    test_lines = open_test.readlines()

    log.info("Start training.")
    counter = 0
    start = time.time()
    iter_ = epochs
    for j in range(0, iter_):
        log.info("Epoch: {}...".format(j + 1))
        x_ = []
        y_ = []
        for i in range(len(train_lines)):
            if i % 1000 == 0:
                log.debug(i)
            counter = counter + 1
            current_alpha = alpha * (iter_ * len(train_lines) - counter) / (iter_ * len(train_lines))
            if current_alpha < 0.01:
                current_alpha = 0.01
            line_ = train_lines[i]
            G = line_.split("|")
            token_line = G[0]
            label_line = G[1]
            token_list = list(fromstring(token_line, dtype=int, sep=' '))
            x_ = self.contextwin(token_list)  # len(l) by win*dimension
            y_ = fromstring(label_line, dtype=int, sep=' ')
            f1(x_, y_, current_alpha)

        total_num = 0
        total_value = 0
        goldlabels = []
        predictions = []
        for i in range(len(test_lines)):
            line_ = test_lines[i]
            G = line_.split("|")
            token_line = G[0].strip()
            label_line = G[1].strip()
            y = fromstring(label_line, dtype=int, sep=' ')
            x = self.contextwin(list(fromstring(token_line, dtype=int, sep=' ')))
            total_num = total_num + x.shape[0]
            discrep, preds = f3(x, y)
            goldlabels.extend(list(y))
            predictions.extend(list(preds))
            total_value = total_value + x.shape[0] - count_nonzero(discrep)
        assert len(goldlabels) == len(predictions)
        log.info("f1 {}".format(f1_score(goldlabels, predictions, average="weighted")))
        acc = 1.00 * total_value / total_num
        log.info("acc " + str(acc))
    log.info("Training completed: {}s/epoch".format((time.time() - start) / iter_))
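# Illustrative sketch (not from the original repo): the A * (A > 0) pattern above is an
# elementwise ReLU, and the network is a plain 3-hidden-layer MLP with a softmax output.
# The numpy forward pass below mirrors that computation on made-up shapes (4 tokens,
# win*dimension = 21, three hidden layers of 50 units, 10 assumed tag labels).
import numpy as np

rng = np.random.RandomState(5)

def relu(a):
    return a * (a > 0)

def softmax(z):
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

x = rng.rand(4, 21)
W1, W2, W3 = rng.rand(21, 50), rng.rand(50, 50), rng.rand(50, 50)
U = rng.rand(50, 10)
probs = softmax(relu(relu(relu(x @ W1) @ W2) @ W3) @ U)   # len(l) by num_label
preds = probs.argmax(axis=1)                              # same as T.argmax(L1, axis=1)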
parser.add_argument("-hidden3", type=int, default=50) parser.add_argument("-oov", type=int, default=0) parser.add_argument("-tag_vocab_file", default="../../../Datasets/wsj/wsjtrain.tagvocab.json") parser.add_argument("-test_file", default="../../../Datasets/wsj/wsjtest") parser.add_argument("-train_file", default="../../../Datasets/wsj/wsjtrain") parser.add_argument("-vocab_file", required=True, help="Either in indexed json format or one word/line format") parser.add_argument("-vocab_limit", type=int, default=1000000) parser.add_argument("-vocab_limit_noexp", action="store_true", help="Will use simple avg instead of weighted avg. for all words that exceed the args.vocab_limit (ie outside of vocab_limit-most frequent words.)") parser.add_argument("-win", type=int, default=7) parser.add_argument("-infer_side_win", type=int, default=3, help="Window size to each side during sense inference step") args = parser.parse_args() log.info("Settings: {}".format(args)) log.info("Loading embeddings.") # context-sensitive inference for multisense embeddings if args.cembedding_file is not None: A = ScoreExpEmbedding(args.embedding_file, args.vocab_file, oov=args.oov, hidden1=args.hidden1, hidden2=args.hidden2, hidden3=args.hidden3, win=args.win, alpha=args.alpha, cembedding_file=args.cembedding_file, infer_side_win=args.infer_side_win) else: A = ScoreEmbedding(args.embedding_file, args.vocab_file, oov=args.oov, hidden1=args.hidden1, hidden2=args.hidden2, hidden3=args.hidden3, win=args.win, alpha=args.alpha) # use 0th, <s> symbol as oov log.info("Preparing training and test data.") train_file, len_tag_vocab = process(args.train_file, args.train_file+".bar", args.vocab_file, args.tag_vocab_file, args.vocab_limit) test_file, _ = process(args.test_file, args.test_file+".bar", args.vocab_file, args.tag_vocab_file, args.vocab_limit)
def run(nepochs, mbatch_size):
    start = time.time()
    for e in range(nepochs):
        log.info("Epoch {}.".format(e))
        losses = []
        cur_mbatch_size = 0
        print_size = 0
        pivots = []
        contexts = []
        contexts_a = []  # affiliated contexts
        labels = []
        max_y_is_1s = []
        for i, (p, cs, cs_a, ls, max_y_is_1, n_line) in enumerate(
                mbatch_skipgrams_affil(line_reader(args.corpus_path_e),
                                       line_reader(args.corpus_path_f),
                                       line_reader(args.corpus_path_a),
                                       v_e, v_f,
                                       max_window_size=args.max_window_size,
                                       max_window_size_f=args.max_window_size_f,
                                       n_neg=args.n_neg,
                                       sampling_tab=s_tab,
                                       subsampling_tab=ss_tab,
                                       leaveout_m0=args.leaveout_m0), 1):
            if np.all(np.array(ls) == 0):
                continue
            pivots.append(p)
            contexts.append(cs)
            contexts_a.append(cs_a)
            labels.append(ls)
            max_y_is_1s.append(max_y_is_1)
            cur_mbatch_size += 1
            if cur_mbatch_size != mbatch_size:
                continue
            if pivots:
                x_p = np.array(pivots, dtype="int32")  # n_batches
                X_c = np.array(contexts, dtype="int32")  # n_batches*n_contexts
                # l' contexts:
                X_c_f = np.array(contexts_a, dtype="int32")  # n_batches*n_contexts_f
                X_c_f_mask = X_c_f > 0
                L = np.array(labels, dtype="int32")  # n_batches*n_contexts
                max_y = np.array(max_y_is_1s, dtype="int32")  # n_batches
                M_pad = X_c > 0  # mask for padded
                assert X_c.shape[1] == L.shape[1]
                assert X_c.shape[0] == L.shape[0] == x_p.shape[0] == len(max_y) == M_pad.shape[0]
                loss = modl.train(x_p, X_c, X_c_f, X_c_f_mask, L, max_y, M_pad)
                losses.append(loss)
            pivots = []
            contexts = []
            contexts_a = []
            labels = []
            max_y_is_1s = []
            print_size += cur_mbatch_size
            if print_size % 10000 == 0:
                if not w1 or not w2:
                    sc = "-"
                else:
                    sc = sim(modl.params[0].get_value()[w1], modl.params[0].get_value()[w2])
                rep = "Number of instances: {0}; sequences: {1}; Fr/Ge pivot cos:{2}".format(print_size, n_line, sc)
                if len(losses) > 0:
                    rep = "{} {}".format(rep, "Mean loss {}.".format(np.mean(losses)))
                    losses = []
                dyn_print(rep)
            cur_mbatch_size = 0
        print("\n")
        if args.save_model_per_ep and not e == nepochs - 1:  # save of last epoch done below
            save_weights(modl, str_app=str(e))
    log.info("Training completed: {}s/epoch".format((time.time() - start) / nepochs))
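# Illustrative sketch (not from the original repo): context matrices are padded with index 0,
# and the boolean masks M_pad / X_c_f_mask computed above simply flag the non-padding positions
# so padded slots can be excluded downstream. Toy values only.
import numpy as np

X_c = np.array([[3, 9, 0, 0],
                [7, 2, 5, 0]], dtype="int32")
M_pad = X_c > 0               # [[True, True, False, False], [True, True, True, False]]
n_real = M_pad.sum(axis=1)    # number of real contexts per pivot: [2, 3]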
rep = "{} {}".format( rep, "Mean loss {}.".format(np.mean(losses))) losses = [] dyn_print(rep) cur_mbatch_size = 0 print("\n") if args.save_model_per_ep and not e == nepochs - 1: # save of last epoch done below save_weights(modl, str_app=str(e)) log.info("Training completed: {}s/epoch".format( (time.time() - start) / nepochs)) if not os.path.exists(save_dir): os.makedirs(save_dir) log.info("Saving run details to {}.".format(save_dir)) save_run_details() log.info("Building vocabulary.") v_e = VocabBuild(min_freq=args.min_freq, max_n_words=args.max_n_words, discard_n_top_freq=args.discard_n_top_freq, downcase=args.downcase) r = line_reader(args.corpus_path_e) v_e.create(r) log.info("Loading l' vocabulary.") sett_f = load_json("{}/{}".format(args.model_f_dir, args.settings_save_fname)) v_f = VocabBuild(sett_f["min_freq"], sett_f["max_n_words"], sett_f["discard_n_top_freq"], sett_f["downcase"]) v_f.load(args.model_f_dir)
print("Loaded model.") #elif args.embs_format == "mssg": # Neelakantan et al.'s multisense embs # w_index, embs = load_mssg("{}/vectors-MSSGKMeans.gz".format(args.input_dir)) # if args.n_most_freq: # w_index = {w: i for w, i in w_index.items() if i < args.n_most_freq + 1} # print(len(w_index)) # print("Loaded model.") elif args.embs_format == "bimu": try: w_index_path = "{}/w_index.json".format(args.input_dir) w_index = load_json(w_index_path) # model_path = "{}/sg.pickle".format(args.input_dir) except FileNotFoundError: w_index_path = "{}/W_v.txt".format(args.input_dir) w_index = {l.strip(): c for c, l in enumerate(line_reader(w_index_path))} log.info("Loading model.") if args.n_most_freq: w_index = {w: i for w, i in w_index.items() if i < args.n_most_freq + 1} print(len(w_index)) if args.model == "senses": embs = load_npy("{}/{}.npy".format(args.input_dir, args.f)) if args.sim == "local" or args.sim == "avg_exp": if args.f == "W_w": n = "" else: n = eval(args.f[-1]) assert 0 <= n < 9 c_embs = load_npy("{}/W_c{}.npy".format(args.input_dir, n)) try: if args.f == "W_w":
log.info("Saving run details to {}.".format("{}/{}".format(args.model_load_dir, args.settings_save_fname)))
save_run_details()

log.info("Building vocabulary.")
v = VocabBuild(min_freq=args.min_freq,
               max_n_words=args.max_n_words,
               discard_n_top_freq=args.discard_n_top_freq,
               downcase=args.downcase)
v.load(args.model_load_dir)
#v.save(save_dir)

W_w = load_npy("{}/{}.npy".format(args.model_load_dir, "W_w"))
W_c = load_npy("{}/{}.npy".format(args.model_load_dir, "W_c"))

if args.infer_type == "argmax":
    modl_type = SensesInference
elif args.infer_type == "expect":
    modl_type = SensesExpectationInference