def main(train, dev, test, emb_path, hyperparams, run_id=None, res_path=None):
    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary, _ = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _, _ = cPickle.load(fid)
    with open(test, 'rb') as fid:
        test_x, test_y, _, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)
    nn = nlse.NLSE(E, **hyperparams)
    nn.fit(X_train, Y_train, X_dev, Y_dev)
    y_hat = nn.predict(test_x)
    avgF1 = f1_score(test_y, y_hat, average="macro")
    acc = accuracy_score(test_y, y_hat)
    dataset = os.path.basename(test)
    if run_id is None:
        run_id = "NLSE"
    results = {"acc": round(acc, 3),
               "avgF1": round(avgF1, 3),
               "model": "NLSE",
               "dataset": dataset,
               "run_id": run_id,
               "sub_size": hyperparams["sub_size"],
               "lrate": hyperparams["lrate"]}
    cols = ["dataset", "model", "run_id", "acc", "avgF1", "sub_size"]
    helpers.print_results(results, columns=["dataset", "run_id", "lrate",
                                            "sub_size", "acc", "avgF1"])
    if res_path is not None:
        helpers.save_results(results, res_path, sep="\t", columns=cols)
    return results
def hypertune(train, dev, emb_path, obj, hyperparams, res_path=None):
    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary, _ = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)
    best_hp = None
    best_score = 0
    for hp in hyperparams:
        # initialize model with the hyperparameters
        nn = nlse.NLSE(E, **hp)
        nn.fit(X_train, Y_train, X_dev, Y_dev, silent=False)
        Y_hat = nn.predict(X_dev)
        score = obj(Y_dev, Y_hat)
        print "[score: {} | hyperparameters: {}]".format(score, repr(hp))
        if score > best_score:
            best_score = score
            best_hp = hp
        results = {"score": round(score, 3), "hyper": repr(hp)}
        if res_path is not None:
            helpers.save_results(results, res_path, sep="\t")
        helpers.print_results(results)
    print ""
    print "[best conf: {} | score: {}]".format(repr(best_hp), best_score)
    return best_hp, best_score
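# Illustrative usage sketch (not part of the original script): tuning NLSE over
# a small grid of "sub_size"/"lrate" values. The paths and grid values below
# are hypothetical, and f1_score is assumed to be imported from sklearn.metrics.
#
#   grid = [{"sub_size": s, "lrate": l}
#           for s in [5, 10]
#           for l in [0.01, 0.05]]
#   macro_f1 = lambda y, y_hat: f1_score(y, y_hat, average="macro")
#   best_hp, best_score = hypertune("DATA/pkl/train.pkl", "DATA/pkl/dev.pkl",
#                                   "DATA/embeddings/emb.txt", macro_f1, grid)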
def main(lex_path, test, label_map, run_id, conf={}, dev=None, res_path=None):
    # read test data
    dt = read_dataset(test, labels=label_map.keys())
    X_test = [x[1] for x in dt]
    Y_test = [label_map[x[0]] for x in dt]
    # model
    model = LexiconSentiment(path=lex_path, **conf)
    # if dev data is passed, use it to fit the threshold
    if dev is not None:
        dt_dev = read_dataset(dev, labels=label_map.keys())
        X_dev = [x[1] for x in dt_dev]
        Y_dev = [label_map[x[0]] for x in dt_dev]
        print "[fitting]"
        model.fit(X_dev, Y_dev, samples=SAMPLEZ, silent=True)
        conf = model.get_params()
    # test model
    Y_hat = model.predict(X_test)
    avgF1 = f1_score(Y_test, Y_hat, average="macro")
    acc = accuracy_score(Y_test, Y_hat)
    results = {"acc": round(acc, 3),
               "avgF1": round(avgF1, 3),
               "model": run_id,
               "dataset": os.path.basename(test),
               "run_id": run_id}
    results.update(conf)
    cols = ["dataset", "model", "run_id", "acc", "avgF1"]
    helpers.print_results(results, columns=cols)
    if res_path is not None:
        helpers.save_results(results, res_path, sep="\t", columns=cols)
    return results
def hypertune(lex_path, dev, label_map, obj, hyperparams, res_path=None):
    dt = read_dataset(dev, labels=label_map.keys())
    X = [x[1] for x in dt]
    Y = [label_map[x[0]] for x in dt]
    best_hp = hyperparams[0]
    best_score = 0
    for hp in hyperparams:
        # initialize model with the hyperparameters
        model = LexiconSentiment(path=lex_path, **hp)
        model.fit(X, Y, samples=SAMPLEZ, silent=True)
        Y_hat = model.predict(X)
        score = obj(Y, Y_hat)
        if score > best_score:
            # replace the original configuration with the one returned by the
            # model, which also contains the fitted parameters
            hp = model.get_params()
            best_score = score
            best_hp = hp
        p_hp = {k: hp[k] for k in ["keep_scores_below", "keep_scores_above",
                                   "positive_threshold"]}
        print "[hyperparameters: {} | score: {} ]".format(repr(p_hp),
                                                          round(score, 3))
        results = {"score": round(score, 3), "hyper": repr(p_hp)}
        if res_path is not None:
            helpers.save_results(results, res_path, sep="\t")
    print ""
    print "[best conf: {} | score: {}]".format(repr(best_hp), best_score)
    return best_hp, best_score
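# Illustrative usage sketch (not part of the original script): tuning the
# lexicon model on a dev set. The lexicon path, label names, and threshold
# values are hypothetical; it is assumed that LexiconSentiment accepts these
# keys as constructor keyword arguments, as the loop above suggests.
#
#   label_map = {"negative": 0, "neutral": 1, "positive": 2}
#   grid = [{"keep_scores_below": lo, "keep_scores_above": hi,
#            "positive_threshold": t}
#           for lo, hi in [(-0.5, 0.5), (-0.25, 0.25)]
#           for t in [0.0, 0.1]]
#   macro_f1 = lambda y, y_hat: f1_score(y, y_hat, average="macro")
#   best_hp, _ = hypertune("lexicons/my_lexicon.txt", "DATA/txt/dev",
#                          label_map, macro_f1, grid)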
def hypertuner(train, dev, test, emb_path, obj, hyperparams, run_id,
               res_path=None, model_path=None):
    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _ = cPickle.load(fid)
    with open(test, 'rb') as fid:
        X_test, Y_test, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)
    Ys = Y_train + Y_dev + Y_test
    label_map = vectorizer.build_vocabulary(Ys)
    Y_train = [label_map[y] for y in Y_train]
    Y_test = [label_map[y] for y in Y_test]
    Y_dev = [label_map[y] for y in Y_dev]
    dataset = os.path.basename(test)
    best_hp = None
    best_score = 0
    best_results = None
    for hp in hyperparams:
        # initialize model with the hyperparameters
        nn = nlse.NLSE(E, label_map=label_map, vocab=vocabulary, **hp)
        nn.fit(X_train, Y_train, X_dev, Y_dev, silent=False)
        Y_hat = nn.predict(X_test)
        score = obj(Y_test, Y_hat)
        print "[score: {} | hyperparameters: {}]".format(score, repr(hp))
        if score > best_score:
            if model_path is not None:
                nn.save(model_path)
            best_score = score
            best_hp = hp
            acc = accuracy_score(Y_test, Y_hat)
            avgF1 = f1_score(Y_test, Y_hat, average="macro")
            rep_hp = {p: hp[p] for p in ["sub_size", "lrate"]}
            best_results = {"acc": round(acc, 3),
                            "avgF1": round(avgF1, 3),
                            "model": "NLSE",
                            "dataset": dataset,
                            "run_id": run_id,
                            "hyper": repr(rep_hp)}
        res = {"score": round(score, 3), "hyper": repr(hp)}
        helpers.print_results(res)
    if res_path is not None:
        cols = ["dataset", "model", "run_id", "acc", "avgF1"]
        helpers.save_results(best_results, res_path, columns=cols, sep="\t")
    print ""
    print "[best conf: {} | score: {}]".format(repr(best_hp), best_score)
    return best_hp, best_score
def main(train, dev, test, emb_path, hyperparams, run_id=None, res_path=None,
         no_hidden=False):
    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _ = cPickle.load(fid)
    with open(test, 'rb') as fid:
        X_test, Y_test, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)
    Ys = Y_train + Y_dev + Y_test
    label_map = vectorizer.build_vocabulary(Ys)
    Y_train = [label_map[y] for y in Y_train]
    Y_test = [label_map[y] for y in Y_test]
    Y_dev = [label_map[y] for y in Y_dev]
    print "[no hidden: {}]".format(no_hidden)
    if no_hidden:
        # BOE_plus has no subspace layer, so sub_size does not apply
        del hyperparams["sub_size"]
        nn = nlse.BOE_plus(E, label_map=label_map, vocab=vocabulary,
                           **hyperparams)
    else:
        nn = nlse.NLSE(E, label_map=label_map, vocab=vocabulary, **hyperparams)
    nn.fit(X_train, Y_train, X_dev, Y_dev)
    y_hat = nn.predict(X_test)
    avgF1 = f1_score(Y_test, y_hat, average="macro")
    acc = accuracy_score(Y_test, y_hat)
    dataset = os.path.basename(test)
    if run_id is None:
        run_id = "NLSE"
    results = {"acc": round(acc, 3),
               "avgF1": round(avgF1, 3),
               "model": "NLSE",
               "dataset": dataset,
               "run_id": run_id}
    helpers.print_results(results, columns=["dataset", "run_id", "acc", "avgF1"])
    if res_path is not None:
        cols = ["dataset", "model", "run_id", "acc", "avgF1"]
        helpers.save_results(results, res_path, sep="\t", columns=cols)
    return results, nn
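# Illustrative usage sketch (not part of the original script): running the
# NLSE model, or the BOE_plus variant when no_hidden=True. The paths and
# hyperparameter values are hypothetical.
#
#   hp = {"sub_size": 10, "lrate": 0.05}
#   res, model = main("DATA/pkl/train.pkl", "DATA/pkl/dev.pkl",
#                     "DATA/pkl/test.pkl", "DATA/embeddings/emb.txt", hp,
#                     run_id="nlse_run")
#   # with no_hidden=True, "sub_size" is removed and BOE_plus is trained instead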
def main(train, test, run_id, features, hyperparameters={}, res_path=None):
    # train and evaluate model
    if features[0].lower() == "naive_bayes":
        X_train, Y_train = get_features(train, ["bow-bin"])
        X_test, Y_test = get_features(test, ["bow-bin"])
        model = BernoulliNB()
        model_name = "NaiveBayes"
    elif features[0].lower() == "mlp":
        X_train, Y_train = get_features(train, ["bow-bin"])
        X_test, Y_test = get_features(test, ["bow-bin"])
        model = MLPClassifier(solver='lbfgs', activation="logistic",
                              hidden_layer_sizes=[400])
        model_name = "MLP"
    elif features[0].lower() == "mlp-2":
        X_train, Y_train = get_features(train, ["bow-bin"])
        X_test, Y_test = get_features(test, ["bow-bin"])
        model = MLPClassifier(solver='lbfgs', activation="logistic",
                              hidden_layer_sizes=[400, 100])
        model_name = "MLP-2"
    else:
        X_train, Y_train = get_features(train, features)
        X_test, Y_test = get_features(test, features)
        # initialize model with the hyperparameters
        model = SGDClassifier(random_state=1234, **hyperparameters)
        model_name = "+".join(features)
    model.fit(X_train, Y_train)
    Y_hat = model.predict(X_test)
    avgF1 = f1_score(Y_test, Y_hat, average="macro")
    acc = accuracy_score(Y_test, Y_hat)
    results = {"acc": round(acc, 3),
               "avgF1": round(avgF1, 3),
               "model": model_name,
               "dataset": os.path.basename(test),
               "run_id": run_id,
               "train_size": len(X_train),
               "test_size": len(X_test),
               "hyper": repr(hyperparameters)}
    cols = ["dataset", "run_id", "acc", "avgF1", "hyper"]
    helpers.print_results(results, columns=cols)
    if res_path is not None:
        cols = ["dataset", "model", "run_id", "acc", "avgF1"]
        helpers.save_results(results, res_path, sep="\t", columns=cols)
    return results
def hypertune(train, dev, features, obj, hyperparams, res_path=None):
    X_train, Y_train = get_features(train, features)
    X_dev, Y_dev = get_features(dev, features)
    best_hp = None
    best_score = 0
    for hp in hyperparams:
        # initialize model with the hyperparameters
        model = SGDClassifier(random_state=1234, **hp)
        model.fit(X_train, Y_train)
        Y_hat = model.predict(X_dev)
        score = obj(Y_dev, Y_hat)
        if score > best_score:
            best_score = score
            best_hp = hp
        results = {"score": round(score, 3), "hyper": repr(hp)}
        if res_path is not None:
            helpers.save_results(results, res_path)
        helpers.print_results(results)
    print ""
    print "[best conf: {} | score: {}]".format(repr(best_hp),
                                               round(best_score, 3))
    return best_hp, best_score
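# Illustrative usage sketch (not part of the original script): tuning the
# SGDClassifier over a small grid. The data paths and parameter values are
# hypothetical; "alpha" and "penalty" are standard sklearn SGDClassifier
# keyword arguments, and f1_score is assumed to come from sklearn.metrics.
#
#   grid = [{"alpha": a, "penalty": p}
#           for a in [1e-4, 1e-3]
#           for p in ["l2", "elasticnet"]]
#   macro_f1 = lambda y, y_hat: f1_score(y, y_hat, average="macro")
#   best_hp, _ = hypertune("DATA/txt/train", "DATA/txt/dev",
#                          ["bow-bin"], macro_f1, grid)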
def main(train, test, dev, embs_path, total_epochs=10, weights_file=None,
         results_path=None):
    print "[reading data]"
    train_data = data.read_dataset(train)
    train_docs = [x[1] for x in train_data]
    train_Y = [x[0] for x in train_data]
    test_data = data.read_dataset(test)
    test_docs = [x[1] for x in test_data]
    test_Y = [x[0] for x in test_data]
    dev_data = data.read_dataset(dev)
    dev_docs = [x[1] for x in dev_data]
    dev_Y = [x[0] for x in dev_data]
    # convert labels to one-hot
    label_map = vectorizer.build_vocabulary(test_Y + train_Y + dev_Y)
    train_Y = vectorizer.one_hot(label_map, train_Y)
    dev_Y = vectorizer.one_hot(label_map, dev_Y)
    test_Y = vectorizer.one_hot(label_map, test_Y)
    # convert test labels back to class indices for evaluation
    test_Y = np.argmax(test_Y, axis=1)
    n_labels = len(train_Y[0])
    print "[loading embeddings]"
    wvs = embeddings.embeddings_to_dict(embs_path)
    # preprocessor for texts
    print "[preprocessing...]"
    all_docs = train_docs + test_docs + dev_docs
    max_len = max([len(x.split()) for x in all_docs])
    print "[max len: {}]".format(max_len)
    p = CNN_text.Preprocessor(max_features=len(wvs), maxlen=max_len, wvs=wvs)
    p.preprocess(all_docs)
    train_X = p.build_sequences(train_docs)
    test_X = p.build_sequences(test_docs)
    dev_X = p.build_sequences(dev_docs)
    # then the CNN
    cnn = CNN_text.TextCNN(p, n_labels=n_labels, filters=[2, 3], n_filters=50,
                           dropout=0.0)
    if weights_file:
        cnn.model.load_weights(weights_file)
    epochs_per_iter = 1
    epochs_so_far = 0
    print "[training]"
    while epochs_so_far < total_epochs:
        cnn.train(train_X, train_Y, nb_epoch=epochs_per_iter,
                  X_val=dev_X, y_val=dev_Y)
        epochs_so_far += epochs_per_iter
        # report dev-set performance after each epoch
        Y_hat = cnn.predict(dev_X)
        acc = accuracy_score(np.argmax(dev_Y, axis=1), Y_hat)
        avgF1 = f1_score(np.argmax(dev_Y, axis=1), Y_hat, average="macro")
        res = {"acc": round(acc, 3), "avgF1": round(avgF1, 3)}
        helpers.print_results(res)
    Y_hat = cnn.predict(test_X)
    acc = accuracy_score(test_Y, Y_hat)
    avgF1 = f1_score(test_Y, Y_hat, average="macro")
    results = {"acc": round(acc, 3),
               "avgF1": round(avgF1, 3),
               "model": "CNN",
               "dataset": os.path.basename(test),
               "run_id": "NEURAL"}
    helpers.print_results(results)
    if results_path is not None:
        cols = ["dataset", "model", "run_id", "acc", "avgF1"]
        helpers.save_results(results, results_path, columns=cols, sep="\t")
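# Illustrative usage sketch (not part of the original script): training the
# CNN for a few epochs and writing test results to a file. All paths below
# are hypothetical.
#
#   main("DATA/txt/train", "DATA/txt/test", "DATA/txt/dev",
#        "DATA/embeddings/emb.txt", total_epochs=5,
#        results_path="results/cnn_results.txt")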
conf, _ = hypertune(tr_fname, ts_fname, args.emb_path, scorer,
                    hyperparams_grid, res_path=hyper_results_path)
# run the model with the best hyperparameters
res = main(tr_fname, dev_fname, ts_fname, args.emb_path, conf,
           run_id=args.run_id, res_path=cv_results_path)
results.append(res)
accs = [res["acc"] for res in results]
f1s = [res["avgF1"] for res in results]
cv_res = {"acc_mean": round(np.mean(accs), 3),
          "acc_std": round(np.std(accs), 3),
          "avgF1_mean": round(np.mean(f1s), 3),
          "avgF1_std": round(np.std(f1s), 3),
          "model": "NLSE",
          "dataset": os.path.basename(args.test),
          "run_id": args.run_id}
helpers.print_results(cv_res)
# save the results aggregated over all runs
if args.res_path is not None:
    cols = ["dataset", "run_id", "model", "acc_mean", "acc_std",
            "avgF1_mean", "avgF1_std"]
    helpers.save_results(cv_res, args.res_path, sep="\t", columns=cols)
    # evaluate
    avgF1 = f1_score(Y_test, Y_hat, average="macro")
    acc = accuracy_score(Y_test, Y_hat)
    fname = os.path.basename(args.test)
    run_id = args.run_id
    if run_id is None:
        run_id = "+".join(args.features)
    results = {"dataset": fname,
               "acc": round(acc, 3),
               "avgF1": round(avgF1, 3),
               "features": "+".join(args.features) + "@" + args.model,
               "run_id": run_id}
    cols = ["dataset", "run_id", "features", "acc", "avgF1"]
    # report
    helpers.print_results(results, columns=cols)
    if args.res_path is not None:
        helpers.save_results(results, args.res_path, columns=cols)
elif args.type == "continuous":
    # choose model
    if args.model == "linear":
        predictor = SVR(kernel='linear')
    elif args.model == "l1":
        predictor = LinearSVR(loss='epsilon_insensitive')
    elif args.model == "rbf":
        predictor = SVR(kernel='rbf')
    # train and predict
    predictor.fit(X_train, Y_train)
    Y_hat = predictor.predict(X_test)
    # evaluate with rank correlation
    pred_rank = sp.stats.stats.rankdata(Y_hat)
    true_rank = sp.stats.stats.rankdata(Y_test)
    kendal, _ = sp.stats.stats.kendalltau(pred_rank, true_rank)