import cPickle
import os
import time
from collections import Counter

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# project-local modules used throughout this section
import embeddings
import emb_utils
import features
import helpers
import nlse
import vectorizer


def hypertune(train, dev, emb_path, obj, hyperparams, res_path=None):
    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary, _ = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)
    best_hp = None
    best_score = 0
    for hp in hyperparams:
        # initialize a model with this hyperparameter configuration
        nn = nlse.NLSE(E, **hp)
        nn.fit(X_train, Y_train, X_dev, Y_dev, silent=False)
        Y_hat = nn.predict(X_dev)
        score = obj(Y_dev, Y_hat)
        print "[score: {} | hyperparameters: {}]".format(score, repr(hp))
        if score > best_score:
            # report (and optionally persist) each new best configuration
            best_score = score
            best_hp = hp
            results = {"score": round(score, 3), "hyper": repr(hp)}
            if res_path is not None:
                helpers.save_results(results, res_path, sep="\t")
            helpers.print_results(results)
    print ""
    print "[best conf: {} | score: {}]".format(repr(best_hp), best_score)
    return best_hp, best_score
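# Example (hypothetical paths and values): build a small grid of
# configurations with itertools.product and select the best one by
# accuracy on the dev set.
def _example_hypertune():
    from itertools import product
    grid = [{"sub_size": s, "lrate": l}
            for s, l in product([5, 10, 20], [0.01, 0.05])]
    return hypertune("DATA/train.pkl", "DATA/dev.pkl",
                     "DATA/embeddings.txt", obj=accuracy_score,
                     hyperparams=grid, res_path="DATA/results.txt")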
def main(train, dev, test, emb_path, hyperparams, run_id=None, res_path=None):
    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary, _ = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _, _ = cPickle.load(fid)
    with open(test, 'rb') as fid:
        X_test, Y_test, _, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)
    nn = nlse.NLSE(E, **hyperparams)
    nn.fit(X_train, Y_train, X_dev, Y_dev)
    Y_hat = nn.predict(X_test)
    avgF1 = f1_score(Y_test, Y_hat, average="macro")
    acc = accuracy_score(Y_test, Y_hat)
    dataset = os.path.basename(test)
    if run_id is None:
        run_id = "NLSE"
    results = {"acc": round(acc, 3),
               "avgF1": round(avgF1, 3),
               "model": "NLSE",
               "dataset": dataset,
               "run_id": run_id,
               "sub_size": hyperparams["sub_size"],
               "lrate": hyperparams["lrate"]}
    helpers.print_results(results,
                          columns=["dataset", "run_id", "lrate", "sub_size",
                                   "acc", "avgF1"])
    if res_path is not None:
        cols = ["dataset", "model", "run_id", "acc", "avgF1", "sub_size"]
        helpers.save_results(results, res_path, sep="\t", columns=cols)
    return results
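# Example (hypothetical paths): train and evaluate a single configuration;
# "sub_size" and "lrate" must be present in the dict since they are copied
# into the results row.
def _example_single_run():
    hp = {"sub_size": 10, "lrate": 0.01}
    return main("DATA/train.pkl", "DATA/dev.pkl", "DATA/test.pkl",
                "DATA/embeddings.txt", hp, run_id="nlse_baseline",
                res_path="DATA/results.txt")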
def hypertuner(train, dev, test, emb_path, obj, hyperparams, run_id,
               res_path=None, model_path=None):
    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _ = cPickle.load(fid)
    with open(test, 'rb') as fid:
        X_test, Y_test, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)
    # map string labels to indices, built over all splits so that every
    # label is covered
    Ys = Y_train + Y_dev + Y_test
    label_map = vectorizer.build_vocabulary(Ys)
    Y_train = [label_map[y] for y in Y_train]
    Y_dev = [label_map[y] for y in Y_dev]
    Y_test = [label_map[y] for y in Y_test]
    dataset = os.path.basename(test)
    best_hp = None
    best_score = 0
    best_results = None
    for hp in hyperparams:
        # initialize a model with this hyperparameter configuration
        nn = nlse.NLSE(E, label_map=label_map, vocab=vocabulary, **hp)
        nn.fit(X_train, Y_train, X_dev, Y_dev, silent=False)
        Y_hat = nn.predict(X_test)
        score = obj(Y_test, Y_hat)
        print "[score: {} | hyperparameters: {}]".format(score, repr(hp))
        if score > best_score:
            if model_path is not None:
                nn.save(model_path)
            best_score = score
            best_hp = hp
            acc = accuracy_score(Y_test, Y_hat)
            avgF1 = f1_score(Y_test, Y_hat, average="macro")
            rep_hp = {p: hp[p] for p in ["sub_size", "lrate"]}
            best_results = {"acc": round(acc, 3),
                            "avgF1": round(avgF1, 3),
                            "model": "NLSE",
                            "dataset": dataset,
                            "run_id": run_id,
                            "hyper": repr(rep_hp)}
        res = {"score": round(score, 3), "hyper": repr(hp)}
        helpers.print_results(res)
    if res_path is not None:
        cols = ["dataset", "model", "run_id", "acc", "avgF1"]
        helpers.save_results(best_results, res_path, sep="\t", columns=cols)
    print ""
    print "[best conf: {} | score: {}]".format(repr(best_hp), best_score)
    return best_hp, best_score
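# Example (hypothetical paths): `obj` can be any callable taking
# (y_true, y_pred), so a keyword argument such as macro averaging can be
# bound in advance with functools.partial.
def _example_hypertuner():
    from functools import partial
    macro_f1 = partial(f1_score, average="macro")
    grid = [{"sub_size": s, "lrate": 0.01} for s in [5, 10, 20]]
    return hypertuner("DATA/train.pkl", "DATA/dev.pkl", "DATA/test.pkl",
                      "DATA/embeddings.txt", obj=macro_f1, hyperparams=grid,
                      run_id="nlse_tune", model_path="DATA/best_nlse.pkl")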
def run(inputs, opts):
    for dataset in inputs:
        print "[extracting features @ {}]".format(repr(dataset))
        with open(dataset, "rb") as fid:
            X, Y, vocabulary = cPickle.load(fid)
        basename = os.path.splitext(os.path.basename(dataset))[0]
        if opts.bow is not None:
            for agg in opts.bow:
                fname = basename + "-bow-" + agg.lower()
                print "\t > BOW ({})".format(fname)
                if agg == "bin":
                    bow = features.BOW(X, vocabulary, opts.sparse_bow)
                elif agg == "freq":
                    bow = features.BOW_freq(X, vocabulary, opts.sparse_bow)
                else:
                    raise ValueError("unknown BOW aggregation: " + agg)
                np.save(opts.out_folder + fname, bow)
        if opts.boe is not None:
            for agg in opts.boe:
                fname = basename + "-boe-" + agg.lower()
                print "\t > BOE ({})".format(fname)
                E, _ = embeddings.read_embeddings(opts.embeddings,
                                                  wrd2idx=vocabulary)
                boe = features.BOE(X, E, agg=agg)
                np.save(opts.out_folder + fname, boe)
        if opts.w2v:
            fname = basename + "-w2v"
            print "\t > W2V ({})".format(fname)
            E, _ = embeddings.read_embeddings(opts.embeddings,
                                              wrd2idx=vocabulary)
            emb = features.BOE(X, E, agg="bin")
            np.save(opts.out_folder + fname, emb)
        if opts.u2v:
            fname = basename + "-u2v"
            print "\t > u2v ({})".format(fname)
            E, _ = embeddings.read_embeddings(opts.embeddings,
                                              wrd2idx=vocabulary)
            emb = features.BOE(X, E, agg="bin")
            np.save(opts.out_folder + fname, emb)
        if opts.nlse:
            # store the embedding sub-matrix for this dataset's vocabulary
            fname = basename + "_NLSE.pkl"
            E, _ = embeddings.read_embeddings(opts.embeddings,
                                              wrd2idx=vocabulary)
            np.save(fname, E)
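# Example (hypothetical values): `opts` only needs the attributes read in
# run(), so a plain argparse.Namespace works for programmatic calls; "bin"
# is the only BOE aggregation shown above that is known to be supported.
def _example_run():
    import argparse
    opts = argparse.Namespace(bow=["bin", "freq"], boe=["bin"],
                              w2v=False, u2v=False, nlse=True,
                              sparse_bow=True,
                              embeddings="DATA/embeddings.txt",
                              out_folder="DATA/features/")
    run(["DATA/dataset.pkl"], opts)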
def main(train, dev, test, emb_path, hyperparams, run_id=None, res_path=None,
         no_hidden=False):
    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _ = cPickle.load(fid)
    with open(test, 'rb') as fid:
        X_test, Y_test, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)
    # map string labels to indices, built over all splits
    Ys = Y_train + Y_dev + Y_test
    label_map = vectorizer.build_vocabulary(Ys)
    Y_train = [label_map[y] for y in Y_train]
    Y_dev = [label_map[y] for y in Y_dev]
    Y_test = [label_map[y] for y in Y_test]
    print "[no hidden: {}]".format(no_hidden)
    if no_hidden:
        # BOE_plus has no subspace layer, so sub_size does not apply
        del hyperparams["sub_size"]
        nn = nlse.BOE_plus(E, label_map=label_map, vocab=vocabulary,
                           **hyperparams)
    else:
        nn = nlse.NLSE(E, label_map=label_map, vocab=vocabulary,
                       **hyperparams)
    nn.fit(X_train, Y_train, X_dev, Y_dev)
    Y_hat = nn.predict(X_test)
    avgF1 = f1_score(Y_test, Y_hat, average="macro")
    acc = accuracy_score(Y_test, Y_hat)
    dataset = os.path.basename(test)
    if run_id is None:
        run_id = "NLSE"
    results = {"acc": round(acc, 3),
               "avgF1": round(avgF1, 3),
               "model": "NLSE",
               "dataset": dataset,
               "run_id": run_id}
    helpers.print_results(results,
                          columns=["dataset", "run_id", "acc", "avgF1"])
    if res_path is not None:
        cols = ["dataset", "model", "run_id", "acc", "avgF1"]
        helpers.save_results(results, res_path, sep="\t", columns=cols)
    return results, nn
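# Example (hypothetical paths): compare the full NLSE model against the
# BOE_plus ablation on the same split. Note that main() deletes "sub_size"
# from the dict in place when no_hidden=True, so each call gets its own
# copy of the hyperparameters.
def _example_ablation():
    hp = {"sub_size": 10, "lrate": 0.01}
    res_full, _ = main("DATA/train.pkl", "DATA/dev.pkl", "DATA/test.pkl",
                       "DATA/embeddings.txt", dict(hp), run_id="nlse")
    res_boe, _ = main("DATA/train.pkl", "DATA/dev.pkl", "DATA/test.pkl",
                      "DATA/embeddings.txt", dict(hp), run_id="boe_plus",
                      no_hidden=True)
    return res_full, res_boe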
t0 = time.time()
word_counter = Counter()
n_docs = 0
with open(args.input, "r") as fid:
    for line in fid:
        # skip the first token of each line
        message = line.decode("utf-8").split()[1:]
        word_counter.update(message)
        n_docs += 1
# keep only words that occur more than min_word_freq times
wc = {w: c for w, c in word_counter.items() if c > args.min_word_freq}
# keep only the args.vocab_size most frequent words
tw = sorted(wc.items(), key=lambda x: x[1], reverse=True)
top_words = {w[0]: i for i, w in enumerate(tw[:args.vocab_size])}
print "loading word embeddings..."
full_E, full_wrd2idx = emb_utils.read_embeddings(args.emb, top_words)
ooevs = emb_utils.get_OOEVs(full_E, full_wrd2idx)
# keep only words with pre-trained embeddings
old_len = len(top_words)
for w in ooevs:
    del top_words[w]
wrd2idx = {w: i for i, w in enumerate(top_words.keys())}
print "[vocabulary size: %d|%d]" % (len(wrd2idx), old_len)
# generate the embedding matrix (one embedding per column)
emb_size = full_E.shape[0]
E = np.zeros((emb_size, len(wrd2idx)))
for wrd, idx in wrd2idx.items():
    E[:, idx] = full_E[:, top_words[wrd]]
print "building training data..."
# negative sampler
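# A minimal sketch of a negative sampler, assuming the usual word2vec-style
# unigram distribution raised to the 0.75 power; the sampler actually used
# here may differ.
vocab_words = sorted(wrd2idx, key=wrd2idx.get)
unigram = np.array([word_counter[w] for w in vocab_words],
                   dtype="float64") ** 0.75
unigram /= unigram.sum()

def sample_negatives(n_samples):
    # draw word indices i (aligned with wrd2idx) with P(i) ~ count(i)^0.75
    return np.random.choice(len(unigram), size=n_samples, p=unigram)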