Exemple #1
0
def hypertune(train, dev, emb_path, obj, hyperparams, res_path=None):
    """Evaluate every hyperparameter configuration on the dev split.

    For each configuration an ``nlse.NLSE`` model is trained on the training
    split and scored on the dev split with ``obj``; the configuration with
    the highest score wins.

    Args:
        train: path to a pickle holding a 4-tuple whose first three items
            are (X, Y, vocabulary).
        dev: path to a pickle holding a 4-tuple whose first two items are
            (X, Y).
        emb_path: path handed to ``embeddings.read_embeddings``.
        obj: callable invoked as ``obj(Y_dev, Y_hat)``; larger is better.
        hyperparams: iterable of dicts, each expanded as keyword arguments
            to ``nlse.NLSE``.
        res_path: if not None, every configuration's result is passed to
            ``helpers.save_results`` with this path.

    Returns:
        ``(best_hp, best_score)``; ``(None, 0)`` when ``hyperparams`` is
        empty.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary, _ = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)

    best_hp = None
    best_score = 0
    for hp in hyperparams:
        # initialize model with the hyperparameters
        nn = nlse.NLSE(E, **hp)
        nn.fit(X_train, Y_train, X_dev, Y_dev, silent=False)
        Y_hat = nn.predict(X_dev)
        score = obj(Y_dev, Y_hat)
        print("[score: {} | hyperparameters: {}]".format(score, repr(hp)))
        # Always accept the first evaluated configuration so that objectives
        # whose values are all <= 0 still yield a best configuration.
        if best_hp is None or score > best_score:
            best_score = score
            best_hp = hp
        results = {"score": round(score, 3), "hyper": repr(hp)}
        if res_path is not None:
            helpers.save_results(results, res_path, sep="\t")
        helpers.print_results(results)
    print("")
    print("[best conf: {} | score: {}]".format(repr(best_hp), best_score))
    return best_hp, best_score
Exemple #2
0
def main(train, dev, test, emb_path, hyperparams, run_id=None, res_path=None):
    """Train one NLSE model and report accuracy / macro-F1 on the test split.

    Args:
        train, dev, test: paths to pickles holding 4-tuples; the first two
            items are (X, Y) and, for ``train``, the third is the vocabulary.
        emb_path: path handed to ``embeddings.read_embeddings``.
        hyperparams: dict expanded as keyword arguments to ``nlse.NLSE``;
            must contain ``"sub_size"`` and ``"lrate"`` (both are reported).
        run_id: label stored in the results; defaults to ``"NLSE"``.
        res_path: if not None, results are passed to
            ``helpers.save_results`` with this path.

    Returns:
        The results dict (keys: acc, avgF1, model, dataset, run_id,
        sub_size, lrate).
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary, _ = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _, _ = cPickle.load(fid)
    with open(test, 'rb') as fid:
        test_x, test_y, _, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)
    nn = nlse.NLSE(E, **hyperparams)
    nn.fit(X_train, Y_train, X_dev, Y_dev)
    y_hat = nn.predict(test_x)
    avgF1 = f1_score(test_y, y_hat, average="macro")
    acc = accuracy_score(test_y, y_hat)
    dataset = os.path.basename(test)
    if run_id is None:
        run_id = "NLSE"
    results = {
        "acc": round(acc, 3),
        "avgF1": round(avgF1, 3),
        "model": "NLSE",
        "dataset": dataset,
        "run_id": run_id,
        "sub_size": hyperparams["sub_size"],
        "lrate": hyperparams["lrate"],
    }
    # The printed column must be "sub_size": that is the key actually present
    # in ``results`` (the previous "subsize" matched nothing).
    helpers.print_results(
        results,
        columns=["dataset", "run_id", "lrate", "sub_size", "acc", "avgF1"])
    if res_path is not None:
        # NOTE(review): "lrate" is not among the saved columns -- confirm
        # whether that is intentional before changing the file layout.
        cols = ["dataset", "model", "run_id", "acc", "avgF1", "sub_size"]
        helpers.save_results(results, res_path, sep="\t", columns=cols)
    return results
Exemple #3
0
def hypertuner(train,
               dev,
               test,
               emb_path,
               obj,
               hyperparams,
               run_id,
               res_path=None,
               model_path=None):
    """Search over NLSE hyperparameters and keep the best-scoring model.

    Each configuration is trained on the training split (with the dev split
    passed to ``fit``) and scored with ``obj`` on the *test* split.

    Args:
        train, dev, test: paths to pickles holding (X, Y, vocabulary)
            triples; only the training vocabulary is used.
        emb_path: path handed to ``embeddings.read_embeddings``.
        obj: callable invoked as ``obj(Y_test, Y_hat)``; larger is better.
        hyperparams: iterable of dicts expanded as keyword arguments to
            ``nlse.NLSE``; each must contain ``"sub_size"`` and ``"lrate"``.
        run_id: label stored in the saved results.
        res_path: if not None, the best configuration's results are passed
            to ``helpers.save_results`` with this path.
        model_path: if not None, each new best model is saved there via
            ``nn.save``.

    Returns:
        ``(best_hp, best_score)``; ``(None, 0)`` when ``hyperparams`` is
        empty.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _ = cPickle.load(fid)
    with open(test, 'rb') as fid:
        X_test, Y_test, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)
    # Build one label mapping over all splits so every label has an entry.
    Ys = Y_train + Y_dev + Y_test
    label_map = vectorizer.build_vocabulary(Ys)
    Y_train = [label_map[y] for y in Y_train]
    Y_test = [label_map[y] for y in Y_test]
    Y_dev = [label_map[y] for y in Y_dev]

    dataset = os.path.basename(test)
    best_hp = None
    best_score = 0
    best_results = None

    for hp in hyperparams:
        # initialize model with the hyperparameters
        nn = nlse.NLSE(E, label_map=label_map, vocab=vocabulary, **hp)
        nn.fit(X_train, Y_train, X_dev, Y_dev, silent=False)
        # NOTE(review): model selection uses the test split here; confirm
        # this is intended rather than selecting on dev.
        Y_hat = nn.predict(X_test)
        score = obj(Y_test, Y_hat)
        print("[score: {} | hyperparameters: {}]".format(score, repr(hp)))
        # Always accept the first evaluated configuration so that a run in
        # which every score is <= 0 still produces a best model and results.
        if best_hp is None or score > best_score:
            if model_path is not None:
                nn.save(model_path)
            best_score = score
            best_hp = hp
            acc = accuracy_score(Y_test, Y_hat)
            avgF1 = f1_score(Y_test, Y_hat, average="macro")
            rep_hp = {p: hp[p] for p in ["sub_size", "lrate"]}
            best_results = {
                "acc": round(acc, 3),
                "avgF1": round(avgF1, 3),
                "model": "NLSE",
                "dataset": dataset,
                "run_id": run_id,
                "hyper": repr(rep_hp),
            }
        res = {"score": round(score, 3), "hyper": repr(hp)}
        helpers.print_results(res)

    # Nothing to save when no configuration was evaluated.
    if res_path is not None and best_results is not None:
        cols = ["dataset", "model", "run_id", "acc", "avgF1"]
        helpers.save_results(best_results, res_path, cols, sep="\t")
    print("")
    print("[best conf: {} | score: {}]".format(repr(best_hp), best_score))
    return best_hp, best_score
Exemple #4
0
def _cached_embeddings(E, emb_path, vocabulary):
    """Return ``E`` if already loaded, else read the embeddings for ``vocabulary``."""
    if E is None:
        E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)
    return E


def run(inputs, opts):
    """Extract and save the feature sets selected in ``opts`` for each dataset.

    Args:
        inputs: iterable of paths to pickles holding (X, Y, vocabulary).
        opts: options object; reads ``bow``, ``boe``, ``w2v``, ``u2v``,
            ``nlse``, ``sparse_bow``, ``embeddings`` and ``out_folder``.

    Raises:
        ValueError: if ``opts.bow`` contains an aggregation other than
            ``"bin"`` or ``"freq"``.
    """
    for dataset in inputs:
        print("[extracting features @ {}]".format(repr(dataset)))
        # NOTE: unpickling can execute arbitrary code -- only load trusted
        # files.
        with open(dataset, "rb") as fid:
            X, Y, vocabulary = cPickle.load(fid)
        basename = os.path.splitext(os.path.basename(dataset))[0]
        # Embeddings depend only on this dataset's vocabulary, so they are
        # read at most once per dataset and shared by every feature type.
        E = None
        if opts.bow is not None:
            for agg in opts.bow:
                fname = basename + "-bow-" + agg.lower()
                print("\t > BOW ({})".format(fname))
                if agg == "bin":
                    bow = features.BOW(X, vocabulary, opts.sparse_bow)
                elif agg == "freq":
                    bow = features.BOW_freq(X, vocabulary, opts.sparse_bow)
                else:
                    # Previously this fell through and either crashed with a
                    # NameError or re-saved the previous iteration's matrix.
                    raise ValueError(
                        "unknown BOW aggregation: {}".format(repr(agg)))
                np.save(opts.out_folder + fname, bow)
        if opts.boe is not None:
            for agg in opts.boe:
                fname = basename + "-boe-" + agg.lower()
                print("\t > BOE ({})".format(fname))
                E = _cached_embeddings(E, opts.embeddings, vocabulary)
                boe = features.BOE(X, E, agg=agg)
                np.save(opts.out_folder + fname, boe)
        if opts.w2v:
            fname = basename + "-w2v"
            print("\t > W2V ({})".format(fname))
            E = _cached_embeddings(E, opts.embeddings, vocabulary)
            emb = features.BOE(X, E, agg="bin")
            np.save(opts.out_folder + fname, emb)
        if opts.u2v:
            fname = basename + "-u2v"
            print("\t > u2v ({})".format(fname))
            E = _cached_embeddings(E, opts.embeddings, vocabulary)
            emb = features.BOE(X, E, agg="bin")
            np.save(opts.out_folder + fname, emb)
        if opts.nlse:
            # NOTE(review): unlike the other outputs this is written relative
            # to the working directory, not opts.out_folder, and np.save
            # appends ".npy" to the ".pkl" name -- confirm before changing.
            fname = basename + "_NLSE.pkl"
            E = _cached_embeddings(E, opts.embeddings, vocabulary)
            np.save(fname, E)
Exemple #5
0
def main(train,
         dev,
         test,
         emb_path,
         hyperparams,
         run_id=None,
         res_path=None,
         no_hidden=False):
    """Train an NLSE (or BOE_plus) model and evaluate it on the test split.

    Args:
        train, dev, test: paths to pickles holding (X, Y, vocabulary)
            triples; only the training vocabulary is used.
        emb_path: path handed to ``embeddings.read_embeddings``.
        hyperparams: dict expanded as keyword arguments to the model
            constructor. It is not modified.
        run_id: label stored in the results; defaults to ``"NLSE"``.
        res_path: if not None, results are passed to
            ``helpers.save_results`` with this path.
        no_hidden: when true, build ``nlse.BOE_plus`` (without the
            ``"sub_size"`` hyperparameter) instead of ``nlse.NLSE``.

    Returns:
        ``(results, nn)``: the results dict and the trained model.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(train, 'rb') as fid:
        X_train, Y_train, vocabulary = cPickle.load(fid)
    with open(dev, 'rb') as fid:
        X_dev, Y_dev, _ = cPickle.load(fid)
    with open(test, 'rb') as fid:
        X_test, Y_test, _ = cPickle.load(fid)
    E, _ = embeddings.read_embeddings(emb_path, wrd2idx=vocabulary)
    # Build one label mapping over all splits so every label has an entry.
    Ys = Y_train + Y_dev + Y_test
    label_map = vectorizer.build_vocabulary(Ys)
    Y_train = [label_map[y] for y in Y_train]
    Y_test = [label_map[y] for y in Y_test]
    Y_dev = [label_map[y] for y in Y_dev]
    print("[no hidden: {}]".format(no_hidden))
    if no_hidden:
        # Work on a copy so the caller's dict is left untouched, and tolerate
        # a configuration that never had "sub_size".
        model_params = dict(hyperparams)
        model_params.pop("sub_size", None)
        nn = nlse.BOE_plus(E,
                           label_map=label_map,
                           vocab=vocabulary,
                           **model_params)
    else:
        nn = nlse.NLSE(E, label_map=label_map, vocab=vocabulary, **hyperparams)
    nn.fit(X_train, Y_train, X_dev, Y_dev)
    y_hat = nn.predict(X_test)
    avgF1 = f1_score(Y_test, y_hat, average="macro")
    acc = accuracy_score(Y_test, y_hat)
    dataset = os.path.basename(test)
    if run_id is None:
        run_id = "NLSE"
    results = {
        "acc": round(acc, 3),
        "avgF1": round(avgF1, 3),
        "model": "NLSE",
        "dataset": dataset,
        "run_id": run_id,
    }

    helpers.print_results(results,
                          columns=["dataset", "run_id", "acc", "avgF1"])
    if res_path is not None:
        cols = ["dataset", "model", "run_id", "acc", "avgF1"]
        helpers.save_results(results, res_path, sep="\t", columns=cols)
    return results, nn
Exemple #6
0
	t0 = time.time()
	word_counter = Counter()
	n_docs=0	
	with open(args.input,"r") as fid:	
		for line in fid:			
			message = line.decode("utf-8").split()[1:]
			word_counter.update(message)				
			n_docs+=1
	#keep only words that occur at least min_word_freq times
	wc = {w:c for w,c in word_counter.items() if c>args.min_word_freq} 
	#keep only the args.vocab_size most frequent words
	tw = sorted(wc.items(), key=lambda x:x[1],reverse=True)
	top_words = {w[0]:i for i,w in enumerate(tw[:args.vocab_size])}	
	print "loading word embeddings..."		
	full_E, full_wrd2idx = emb_utils.read_embeddings(args.emb,top_words)
	ooevs = emb_utils.get_OOEVs(full_E, full_wrd2idx)
	#keep only words with pre-trained embeddings
	old_len = len(top_words)
	for w in ooevs:
		del top_words[w]	
	wrd2idx = {w:i for i,w in enumerate(top_words.keys())}	
	print "[vocabulary size: %d|%d]" % (len(wrd2idx),old_len)
	#generate the embedding matrix
	emb_size = full_E.shape[0]
	E = np.zeros((int(emb_size), len(wrd2idx)))   
	for wrd,idx in wrd2idx.items(): 
		E[:, idx] = full_E[:,top_words[wrd]]

	print "building training data..."
	#negative sampler