Esempio n. 1
0
def main(text_paths, label_paths, wrd2idx, user2idx, opts):
    """Vectorize the labeled datasets and pickle the results.

    For every file in ``label_paths`` the user-level data is vectorized with
    ``user2idx`` and dumped as ``[X, Y, user2idx]`` to
    ``<opts.out_folder>/<basename>_users``. Unless ``opts.users_only`` is
    set, the texts found in ``text_paths`` are then attached to each labeled
    user, vectorized with ``wrd2idx`` and dumped as ``[X, Y, wrd2idx]`` to
    ``<opts.out_folder>/<basename>``.

    text_paths: paths to the text files; each entry read from them is used
        as ``(user, text)`` (``x[0]``, ``x[1]``)
    label_paths: paths to the labeled files; each entry read from them is
        unpacked as ``(label, user)``
    wrd2idx: word vocabulary handed to ``vectorize`` for the documents
    user2idx: user vocabulary handed to ``vectorize`` for the users
    opts: options object providing ``out_folder`` and ``users_only``

    Raises KeyError when a labeled user has no entry in the text files.
    """
    user_datasets = []
    for fname in label_paths:
        print("[reading user data @ {}]".format(repr(fname)))
        user_datasets.append(read_dataset(fname))

    print("[vectorizing users]")
    for name, ds in zip(label_paths, user_datasets):
        X, Y = vectorize(ds, user2idx)
        basename = os.path.splitext(os.path.basename(name))[0]
        # os.path.join instead of string concatenation: the output lands
        # inside out_folder whether or not it ends with a path separator
        path = os.path.join(opts.out_folder, basename + "_users")
        print("[saving data @ {}]".format(path))
        with open(path, "wb") as fid:
            cPickle.dump([X, Y, user2idx], fid, -1)

    if opts.users_only:
        return

    # read text data
    text_data = []
    for fname in text_paths:
        print("[reading text data @ {}]".format(repr(fname)))
        text_data += read_dataset(fname)
    # index the text per user (a later entry for the same user wins)
    text_by_user = {x[0]: x[1] for x in text_data}

    print("[vectorizing documents]")
    for name, ds in zip(label_paths, user_datasets):
        # swap each user id for that user's text, keeping the label
        ds_docs = [[y, text_by_user[u]] for y, u in ds]
        X, Y = vectorize(ds_docs, wrd2idx)
        basename = os.path.splitext(os.path.basename(name))[0]
        path = os.path.join(opts.out_folder, basename)
        print("[saving data @ {}]".format(path))
        with open(path, "wb") as fid:
            cPickle.dump([X, Y, wrd2idx], fid, -1)
Esempio n. 2
0
def main(lex_path, test, label_map, run_id, conf=None, dev=None, res_path=None):
    """Evaluate a lexicon-based sentiment model on a test set.

    lex_path: lexicon path handed to ``LexiconSentiment``
    test: path of the test dataset
    label_map: mapping from the labels found in the data to the values the
        model is scored against; its keys are also passed to
        ``read_dataset`` as ``labels``
    run_id: identifier stored under both "model" and "run_id" in the results
    conf: optional dict of keyword arguments for ``LexiconSentiment``
        (defaults to no extra arguments)
    dev: optional path of a dev dataset; when given the model is fitted on
        it and ``conf`` is replaced by ``model.get_params()``
    res_path: optional path the results are saved to (tab separated)

    Returns the results dict (accuracy, macro F1, identifiers, plus every
    entry of ``conf``).
    """
    # a None sentinel instead of a shared mutable ``{}`` default
    if conf is None:
        conf = {}
    # read test data
    dt = read_dataset(test, labels=label_map.keys())
    X_test = [x[1] for x in dt]
    Y_test = [label_map[x[0]] for x in dt]
    # model
    model = LexiconSentiment(path=lex_path, **conf)
    # if dev data is passed, use this data to fit the threshold
    if dev is not None:
        dt_dev = read_dataset(dev, labels=label_map.keys())
        X_dev = [x[1] for x in dt_dev]
        Y_dev = [label_map[x[0]] for x in dt_dev]
        print("[fitting]")
        model.fit(X_dev, Y_dev, samples=SAMPLEZ, silent=True)
        # report the fitted parameters rather than the initial configuration
        conf = model.get_params()
    # test model
    Y_hat = model.predict(X_test)
    avgF1 = f1_score(Y_test, Y_hat, average="macro")
    acc = accuracy_score(Y_test, Y_hat)

    results = {
        "acc": round(acc, 3),
        "avgF1": round(avgF1, 3),
        "model": run_id,
        "dataset": os.path.basename(test),
        "run_id": run_id,
    }
    # NOTE: entries of conf override result fields with the same key
    results.update(conf)
    cols = ["dataset", "model", "run_id", "acc", "avgF1"]
    helpers.print_results(results, columns=cols)
    if res_path is not None:
        helpers.save_results(results, res_path, sep="\t", columns=cols)
    return results
Esempio n. 3
0
def get_vocabularies(text_paths,
                     label_paths,
                     max_words=None,
                     users_only=False):
    """
    Build the word and user vocabularies, restricted to the users that
    appear in a set of labeled datasets.

    text_paths: paths to files with all the text from a user in the
        format: USER TAB TEXT
    label_paths: list of paths to files with the labels in the format:
        LABEL USER
    max_words: forwarded to build_vocabulary for the word vocabulary
    users_only: when True the text files are not read and the returned
        word vocabulary is None

    Returns the pair (words_vocab, users_vocab).
    """
    # users of every labeled set, in reading order
    labeled_sets = [read_dataset(path) for path in label_paths]
    users = [entry[1] for entry in flatten_list(labeled_sets)]

    words_vocab = None
    if not users_only:
        records = []
        for path in text_paths:
            records += read_dataset(path)
        # index the text per user
        user_to_text = {rec[0]: rec[1] for rec in records}
        user_texts = [user_to_text[user] for user in users]
        words_vocab = build_vocabulary(user_texts, max_words=max_words)

    users_vocab = build_vocabulary(list(users))
    return words_vocab, users_vocab
Esempio n. 4
0
def hypertune(lex_path, dev, label_map, obj, hyperparams, res_path=None):
    """Search a list of hyperparameter settings for the lexicon model.

    Each configuration is used to build a ``LexiconSentiment`` model that is
    fitted and scored on the dev data itself.

    lex_path: lexicon path handed to ``LexiconSentiment``
    dev: path of the dev dataset
    label_map: mapping from the labels found in the data to the values the
        model is scored against; its keys are also passed to
        ``read_dataset`` as ``labels``
    obj: scoring function called as ``obj(Y, Y_hat)``; higher is better
    hyperparams: non-empty sequence of keyword-argument dicts for
        ``LexiconSentiment``
    res_path: optional path; one result row per configuration is saved to it

    Returns ``(best_hp, best_score)``. ``best_hp`` is the parameter dict
    returned by the best model (``hyperparams[0]`` and a score of 0 when no
    configuration scores above 0).
    """
    dt = read_dataset(dev, labels=label_map.keys())
    X = [x[1] for x in dt]
    Y = [label_map[x[0]] for x in dt]
    best_hp = hyperparams[0]
    best_score = 0
    reported_keys = ["keep_scores_below", "keep_scores_above", "positive_threshold"]
    for hp in hyperparams:
        # initialize model with the hyperparameters
        model = LexiconSentiment(path=lex_path, **hp)
        model.fit(X, Y, samples=SAMPLEZ, silent=True)
        Y_hat = model.predict(X)
        score = obj(Y, Y_hat)
        # use the configuration returned by the model, which also contains
        # the fitted parameters. It is read on every iteration so the saved
        # row always describes the configuration that produced `score`
        # (it used to be unbound, or stale, when the score did not improve).
        # assumes get_params() has no side effects -- TODO confirm
        fitted_hp = model.get_params()
        p_hp = {k: fitted_hp[k] for k in reported_keys}
        if score > best_score:
            best_score = score
            best_hp = fitted_hp
            print("[hyperparameters: {} | score: {} ]".format(repr(p_hp), round(score, 3)))

        results = {"score": round(score, 3), "hyper": repr(p_hp)}
        if res_path is not None:
            helpers.save_results(results, res_path, sep="\t")
    print("")
    print("[best conf: {} | score: {}]".format(repr(best_hp), best_score))
    return best_hp, best_score
Esempio n. 5
0
def get_vocabulary(fnames, max_words=None):
    """Build a vocabulary from the documents of several dataset files.

    fnames: paths of the datasets to read
    max_words: forwarded to build_vocabulary

    Returns whatever build_vocabulary produces for the collected documents
    (the second field of every dataset entry).
    """
    corpora = [read_dataset(path) for path in fnames]
    documents = [entry[1] for entry in flatten_list(corpora)]
    return build_vocabulary(documents, max_words=max_words)
Esempio n. 6
0
def main(fnames, vocab, opts):
    """Vectorize each dataset with ``vocab`` and pickle the result.

    fnames: paths of the datasets to read
    vocab: vocabulary handed to ``vectorize``
    opts: options object providing ``out_folder``

    Every dataset is dumped as ``[X, Y, vocab]`` to
    ``<opts.out_folder>/<basename>``, where basename is the input file name
    without its extension.

    Returns ``vocab`` unchanged.
    """
    # read data
    datasets = []
    for fname in fnames:
        print("[reading data @ {}]".format(repr(fname)))
        datasets.append(read_dataset(fname))
    # vectorize
    print("[vectorizing documents]")
    for name, ds in zip(fnames, datasets):
        X, Y = vectorize(ds, vocab)
        basename = os.path.splitext(os.path.basename(name))[0]
        # os.path.join instead of string concatenation: the output lands
        # inside out_folder whether or not it ends with a path separator
        path = os.path.join(opts.out_folder, basename)
        print("[saving data @ {}]".format(path))
        with open(path, "wb") as fid:
            cPickle.dump([X, Y, vocab], fid, -1)
    return vocab
Esempio n. 7
0
def debug(lex_path, test, label_map, conf, report_path):
    """Write per-instance diagnostic reports for a lexicon sentiment model.

    lex_path: lexicon path handed to ``LexiconSentiment``
    test: path of the dataset to inspect
    label_map: mapping from the labels found in the data to the values used
        in the reports; its keys are also passed to ``read_dataset`` as
        ``labels``
    conf: dict of keyword arguments for ``LexiconSentiment``
    report_path: directory the reports are written into (created if missing)

    Four reports are produced through ``summary``: ``all.txt``,
    ``positives.txt`` (gold label 1), ``negatives.txt`` (gold label -1) and
    ``mistakes.txt`` (gold label differs from the prediction).
    """
    dt = read_dataset(test, labels=label_map.keys())
    X = [x[1] for x in dt]
    Y = [label_map[x[0]] for x in dt]
    model = LexiconSentiment(path=lex_path, **conf)
    z = model.debug(X)
    # prepend the gold label to each row returned by model.debug; the rows
    # are expected to be lists matching the remaining four columns below
    out = [[true_y] + y_hat for true_y, y_hat in zip(Y, z)]
    df = pd.DataFrame(out, columns=["y", "y_hat", "score", "std", "word_scores"])
    df.sort_values("score", inplace=True, ascending=False)
    # the reports are written *inside* report_path, so that is the directory
    # that has to exist (creating only its parent left it missing whenever
    # the path had no trailing separator)
    if not os.path.isdir(report_path):
        os.makedirs(report_path)
    # all instances
    summary(os.path.join(report_path, "all.txt"), df)
    # instances whose gold label is positive
    true_positives = df[df["y"] == 1]
    summary(os.path.join(report_path, "positives.txt"), true_positives)
    # instances whose gold label is negative
    true_negatives = df[df["y"] == -1]
    summary(os.path.join(report_path, "negatives.txt"), true_negatives)
    # mistakes: prediction differs from the gold label
    mistakes = df[df["y"] != df["y_hat"]]
    summary(os.path.join(report_path, "mistakes.txt"), mistakes)
Esempio n. 8
0
def main(train,
         test,
         dev,
         embs_path,
         total_epochs=10,
         weights_file=None,
         results_path=None):
    """Train a text CNN, report dev scores per epoch and evaluate on test.

    train, test, dev: paths of the three dataset splits; every entry read
        from them is used as ``(label, document)`` (``x[0]``, ``x[1]``)
    embs_path: path of the word embeddings loaded with
        ``embeddings.embeddings_to_dict``
    total_epochs: number of training epochs; dev accuracy and macro F1 are
        printed after each one
    weights_file: optional path of saved model weights to load before
        training
    results_path: optional path the final test results are saved to
    """
    print("[reading data]")
    train_data = data.read_dataset(train)
    train_docs = [x[1] for x in train_data]
    train_Y = [x[0] for x in train_data]

    test_data = data.read_dataset(test)
    test_docs = [x[1] for x in test_data]
    test_Y = [x[0] for x in test_data]

    dev_data = data.read_dataset(dev)
    dev_docs = [x[1] for x in dev_data]
    dev_Y = [x[0] for x in dev_data]

    # convert labels to one-hot
    label_map = vectorizer.build_vocabulary(test_Y + train_Y + dev_Y)
    train_Y = vectorizer.one_hot(label_map, train_Y)
    dev_Y = vectorizer.one_hot(label_map, dev_Y)
    test_Y = vectorizer.one_hot(label_map, test_Y)
    # the scorers compare class indices, so keep the gold labels as argmax
    test_Y = np.argmax(test_Y, axis=1)
    dev_gold = np.argmax(dev_Y, axis=1)
    n_labels = len(train_Y[0])
    print("[loading embeddings]")
    wvs = embeddings.embeddings_to_dict(embs_path)
    # preprocessor for texts
    print("[preprocessing...]")
    all_docs = train_docs + test_docs + dev_docs
    max_len = max(len(x.split()) for x in all_docs)
    print("[max len: {}]".format(max_len))
    p = CNN_text.Preprocessor(max_features=len(wvs), maxlen=max_len, wvs=wvs)
    p.preprocess(all_docs)
    train_X = p.build_sequences(train_docs)
    test_X = p.build_sequences(test_docs)
    dev_X = p.build_sequences(dev_docs)
    # then the CNN
    cnn = CNN_text.TextCNN(p,
                           n_labels=n_labels,
                           filters=[2, 3],
                           n_filters=50,
                           dropout=0.0)

    if weights_file:
        # load the file the caller asked for (a hard-coded 'weights.hdf5'
        # used to be loaded regardless of the argument)
        cnn.model.load_weights(weights_file)

    epochs_per_iter = 1
    epochs_so_far = 0
    print("training")
    while epochs_so_far < total_epochs:
        cnn.train(train_X,
                  train_Y,
                  nb_epoch=epochs_per_iter,
                  X_val=dev_X,
                  y_val=dev_Y)
        epochs_so_far += epochs_per_iter
        # dev performance after this epoch
        Y_hat = cnn.predict(dev_X)
        acc = accuracy_score(dev_gold, Y_hat)
        avgF1 = f1_score(dev_gold, Y_hat, average="macro")
        res = {"acc": round(acc, 3),
               "avgF1": round(avgF1, 3)}
        helpers.print_results(res)

    # final evaluation on the test split
    Y_hat = cnn.predict(test_X)
    acc = accuracy_score(test_Y, Y_hat)
    avgF1 = f1_score(test_Y, Y_hat, average="macro")

    results = {"acc": round(acc, 3),
               "avgF1": round(avgF1, 3),
               "model": "CNN",
               "dataset": os.path.basename(test),
               "run_id": "NEURAL"}
    helpers.print_results(results)
    if results_path is not None:
        cols = ["dataset", "model", "run_id", "acc", "avgF1"]
        helpers.save_results(results, results_path, cols, sep="\t")
Esempio n. 9
0
	par.add_argument('-dev', type=str, help='dev split path')
	# par.add_argument('-output', type=str, required=True, nargs=2, help='output files')
	par.add_argument('-split', type=float, default=0.8, help='data split')
	par.add_argument('-no_strat', action="store_true", help="do not stratified data for split")
	par.add_argument('-rand_seed', type=str, default="1234", help='randomization seed')
	par.add_argument('-cv', type=int, help="k-fold crossvalidation")
	return par

if __name__ == "__main__":
	# Script entry point: parse the command line, read and merge the input
	# datasets and, when -cv is given, prepare the cross-validation folds.
	# NOTE(review): this listing appears to end in the middle of the fold
	# loop -- the code that writes the splits to disk is not visible here.
	parser = get_parser()
	args = parser.parse_args()	
	# -rand_seed is parsed as a string: use it as-is when it is an integer
	# literal, otherwise turn the text into a seed with str2seed
	try:
		seed = int(args.rand_seed)
	except ValueError:		
		seed = str2seed(args.rand_seed)	
	# read every input file and merge them into a single list
	datasets = [data_reader.read_dataset(d)	for d in args.input]
	datasets = data_reader.flatten_list(datasets)	
	if args.cv is not None:
		print "[seed ({}) | input: {} | cv: {} | strat: {}]".format(seed, args.input, \
																	args.cv, \
																	not args.no_strat)
		# NOTE(review): neither the seed nor the stratification flag is passed
		# to crossfolds, although both are reported above -- confirm intended
		folds = data_reader.crossfolds(datasets, args.cv)
		# set_trace()
		for i, (train_split, test_split) in enumerate(folds):
			# output names get a 1-based fold suffix
			tr_fname = args.train+"_"+str(i+1)
			ts_fname = args.test+"_"+str(i+1)
			if args.dev is not None:
				dev_fname = args.dev+"_"+str(i+1)
				print "[saving: {} | {} | {} ]".format(tr_fname, ts_fname, dev_fname)	
				# carve the dev split out of this fold's training data
				train_split, dev_split = data_reader.shuffle_split(train_split, args.split, \
																random_seed=seed)	
Esempio n. 10
0
def get_argparser():
    """Parse the command line of the NLSE model runner.

    Options:
        -model_path  (required) path where the model is saved
        -data_path   (required) data file
        -res_path    (optional) results file

    Returns the parsed argparse namespace (not the parser itself).
    """
    cli = argparse.ArgumentParser(prog='NLSE model runner')
    # the two mandatory string options share everything but name and help
    mandatory = (
        ('-model_path', 'Path where model is saved'),
        ('-data_path', 'training data'),
    )
    for flag, description in mandatory:
        cli.add_argument(flag, type=str, required=True, help=description)
    cli.add_argument('-res_path', type=str, help='results file')
    return cli.parse_args(sys.argv[1:])


if __name__ == '__main__':
    # Script entry point: load a saved model, predict a label for every
    # document of the dataset and emit one "PREDICTION<TAB>GOLD" line each.
    # ARGUMENT HANDLING
    args = get_argparser()
    clf = nlse.load_model(args.model_path)
    dataset = data.read_dataset(args.data_path)
    docs = [x[1] for x in dataset]
    labels = [x[0] for x in dataset]
    X, _ = vectorizer.docs2idx(docs, clf.vocab)
    # map numeric labels to text
    inv_label_map = {ix: label for label, ix in clf.label_map.items()}
    y_hat = [inv_label_map[y] for y in clf.predict(X)]
    rows = ["{}\t{}\n".format(pred, gold) for gold, pred in zip(labels, y_hat)]
    if args.res_path is None:
        # -res_path is optional: fall back to stdout instead of failing on
        # open(None)
        sys.stdout.writelines(rows)
    else:
        with open(args.res_path, "w") as fod:
            fod.writelines(rows)