def main(text_paths, label_paths, wrd2idx, user2idx, opts): user_datasets = [] for fname in label_paths: print "[reading user data @ {}]".format(repr(fname)) ds_users = read_dataset(fname) user_datasets.append(ds_users) # document_datasets.append(ds_docs) #vectorize print "[vectorizing users]" for name, ds in zip(label_paths, user_datasets): X, Y = vectorize(ds, user2idx) basename = os.path.splitext(os.path.basename(name))[0] path = opts.out_folder + basename + "_users" print "[saving data @ {}]".format(path) with open(path, "wb") as fid: cPickle.dump([X, Y, user2idx], fid, -1) if not opts.users_only: #read text data text_data = [] for fname in text_paths: print "[reading text data @ {}]".format(repr(fname)) text_data += read_dataset(fname) #index tweets per user text_by_user = {x[0]: x[1] for x in text_data} print "[vectorizing documents]" for name, ds in zip(label_paths, user_datasets): ds_docs = [[y, text_by_user[u]] for y, u in ds] X, Y = vectorize(ds_docs, wrd2idx) basename = os.path.splitext(os.path.basename(name))[0] path = opts.out_folder + basename print "[saving data @ {}]".format(path) with open(path, "wb") as fid: cPickle.dump([X, Y, wrd2idx], fid, -1)
def main(lex_path, test, label_map, run_id, conf={}, dev=None, res_path=None): #read test data dt = read_dataset(test, labels=label_map.keys()) X_test = [x[1] for x in dt] Y_test = [label_map[x[0]] for x in dt] #model model = LexiconSentiment(path=lex_path,**conf) #if dev data is passed, use this data to fit the threshold if dev is not None: dt_dev = read_dataset(dev, labels=label_map.keys()) X_dev = [x[1] for x in dt_dev] Y_dev = [label_map[x[0]] for x in dt_dev] print "[fitting]" model.fit(X_dev,Y_dev,samples=SAMPLEZ,silent=True) conf = model.get_params() #test model Y_hat = model.predict(X_test) avgF1 = f1_score(Y_test, Y_hat,average="macro") acc = accuracy_score(Y_test, Y_hat) results = {"acc":round(acc,3), "avgF1":round(avgF1,3), "model":run_id, "dataset":os.path.basename(test), "run_id":run_id } results.update(conf) cols = ["dataset", "model", "run_id", "acc", "avgF1"] helpers.print_results(results, columns=cols) if res_path is not None: #cols+=["positive_threshold","keep_scores_below","keep_scores_above"] helpers.save_results(results, res_path, sep="\t", columns=cols) return results
def get_vocabularies(text_paths, label_paths, max_words=None, users_only=False):
    """Build word and user vocabularies restricted to labeled users.

    text_paths: files with one user per line in the format USER TAB TEXT
    label_paths: files with labels in the format LABEL USER
    max_words: optional cap on the word vocabulary size
    users_only: when True, skip the word vocabulary (returned as None)

    Returns (words_vocab, users_vocab).
    """
    # gather the users mentioned in the labeled datasets
    labeled_sets = [read_dataset(fname) for fname in label_paths]
    labeled_users = [entry[1] for entry in flatten_list(labeled_sets)]
    words_vocab = None
    if not users_only:
        # pull in all the raw text and index it per user
        text_data = []
        for fname in text_paths:
            text_data += read_dataset(fname)
        text_by_user = {x[0]: x[1] for x in text_data}
        # only the documents of labeled users contribute words
        docs = [text_by_user[u] for u in labeled_users]
        words_vocab = build_vocabulary(docs, max_words=max_words)
    users_vocab = build_vocabulary(list(labeled_users))
    return words_vocab, users_vocab
def hypertune(lex_path, dev, label_map, obj, hyperparams, res_path=None):
    """Grid-search the lexicon model over `hyperparams` on the dev set.

    lex_path: path to the sentiment lexicon
    dev: path to the dev dataset
    label_map: dict mapping textual labels to numeric classes
    obj: objective function obj(Y_true, Y_hat) -> score (higher is better)
    hyperparams: list of configuration dicts to try
    res_path: optional file where each configuration's score is appended

    Returns (best_hp, best_score).
    """
    dt = read_dataset(dev, labels=label_map.keys())
    X = [x[1] for x in dt]
    Y = [label_map[x[0]] for x in dt]
    # default to the first configuration so something is returned even if
    # no candidate beats a score of 0
    best_hp = hyperparams[0]
    best_score = 0
    for hp in hyperparams:
        #initialize model with the hyperparameters
        model = LexiconSentiment(path=lex_path,**hp)
        # the model is fit and evaluated on the same dev data
        model.fit(X,Y,samples=SAMPLEZ,silent=True)
        Y_hat = model.predict(X)
        score = obj(Y, Y_hat)
        if score > best_score:
            #replace the original configuration with the one returned by the model, which
            #contains also the fitted parameters
            hp = model.get_params()
            best_score = score
            best_hp = hp
        # log this iteration's (possibly fitted) configuration and score
        p_hp = {k:hp[k] for k in ["keep_scores_below","keep_scores_above","positive_threshold"]}
        print "[hyperparameters: {} | score: {} ]".format(repr(p_hp), round(score,3))
        # set_trace()
        results = {"score":round(score,3), "hyper":repr(p_hp)}
        if res_path is not None:
            helpers.save_results(results,res_path, sep="\t")
            # helpers.print_results(results)
    print ""
    print "[best conf: {} | score: {}]".format(repr(best_hp),best_score)
    return best_hp, best_score
def get_vocabulary(fnames, max_words=None):
    """Build a single vocabulary from the documents of every dataset in `fnames`.

    max_words: optional cap on the vocabulary size.
    """
    all_data = []
    for fname in fnames:
        all_data.append(read_dataset(fname))
    # second field of each example is the document text
    docs = [entry[1] for entry in flatten_list(all_data)]
    return build_vocabulary(docs, max_words=max_words)
def main(fnames, vocab, opts): #read data datasets = [] for fname in fnames: print "[reading data @ {}]".format(repr(fname)) ds = read_dataset(fname) datasets.append(ds) #vectorize print "[vectorizing documents]" for name, ds in zip(fnames, datasets): X, Y = vectorize(ds, vocab) basename = os.path.splitext(os.path.basename(name))[0] path = opts.out_folder + basename print "[saving data @ {}]".format(path) with open(path, "wb") as fid: cPickle.dump([X, Y, vocab], fid, -1) return vocab
def debug(lex_path, test, label_map, conf, report_path):
    """Run the lexicon model in debug mode on a test set and write
    per-instance score reports (all / positives / negatives / mistakes)
    under `report_path`."""
    dt = read_dataset(test, labels=label_map.keys())
    docs = [x[1] for x in dt]
    gold = [label_map[x[0]] for x in dt]
    model = LexiconSentiment(path=lex_path, **conf)
    traces = model.debug(docs)
    # one row per instance: gold label followed by the model's debug trace
    rows = [[true_y] + trace for true_y, trace in zip(gold, traces)]
    df = pd.DataFrame(rows, columns=["y", "y_hat", "score", "std", "word_scores"])
    df.sort_values("score", inplace=True, ascending=False)
    report_dir = os.path.dirname(report_path)
    if not os.path.exists(report_dir):
        os.makedirs(report_dir)
    # all instances
    summary(os.path.join(report_path, "all.txt"), df)
    # gold positives
    summary(os.path.join(report_path, "positives.txt"), df[df["y"] == 1])
    # gold negatives
    summary(os.path.join(report_path, "negatives.txt"), df[df["y"] == -1])
    # misclassified instances
    summary(os.path.join(report_path, "mistakes.txt"), df[df["y"] != df["y_hat"]])
def main(train, test, dev, embs_path, total_epochs=10, weights_file=None, results_path=None): print "[reading data]" train_data = data.read_dataset(train) train_docs = [x[1] for x in train_data] train_Y = [x[0] for x in train_data] test_data = data.read_dataset(test) test_docs = [x[1] for x in test_data] test_Y = [x[0] for x in test_data] dev_data = data.read_dataset(dev) dev_docs = [x[1] for x in dev_data] dev_Y = [x[0] for x in dev_data] #convert labels to one-hot label_map = vectorizer.build_vocabulary(test_Y + train_Y + dev_Y) train_Y = vectorizer.one_hot(label_map, train_Y) dev_Y = vectorizer.one_hot(label_map, dev_Y) test_Y = vectorizer.one_hot(label_map, test_Y) #convert to argmax test_Y = np.argmax(test_Y, axis=1) n_labels = len(train_Y[0]) print "[loading embeddings]" wvs = embeddings.embeddings_to_dict(embs_path) # preprocessor for texts print "[preprocessing...]" all_docs = train_docs + test_docs + dev_docs max_len = max([len(x.split()) for x in all_docs]) print "[max len: {}]".format(max_len) p = CNN_text.Preprocessor(max_features=len(wvs), maxlen=max_len, wvs=wvs) p.preprocess(all_docs) train_X = p.build_sequences(train_docs) test_X = p.build_sequences(test_docs) dev_X = p.build_sequences(dev_docs) # then the CNN cnn = CNN_text.TextCNN(p, n_labels=n_labels, filters=[2, 3], n_filters=50, dropout=0.0) if weights_file: cnn.model.load_weights('weights.hdf5') epochs_per_iter = 1 epochs_so_far = 0 print "training" while epochs_so_far < total_epochs: cnn.train(train_X, train_Y, nb_epoch=epochs_per_iter, X_val=dev_X, y_val=dev_Y) epochs_so_far += epochs_per_iter Y_hat = cnn.predict(dev_X) acc = accuracy_score(np.argmax(dev_Y, axis=1), Y_hat) avgF1 = f1_score(np.argmax(dev_Y, axis=1), Y_hat, average="macro") res={"acc":round(acc,3), \ "avgF1":round(avgF1,3)} helpers.print_results(res) #print("acc @ epoch %s: %s" % (epochs_so_far, acc)) Y_hat = cnn.predict(test_X) acc = accuracy_score(test_Y, Y_hat) avgF1 = f1_score(test_Y, Y_hat, average="macro") results = 
{"acc":round(acc,3), \ "avgF1":round(avgF1,3), \ "model":"CNN", \ "dataset":os.path.basename(test), \ "run_id":"NEURAL"} helpers.print_results(results) if results_path is not None: cols = ["dataset", "model", "run_id", "acc", "avgF1"] helpers.save_results(results, results_path, cols, sep="\t")
par.add_argument('-dev', type=str, help='dev split path') # par.add_argument('-output', type=str, required=True, nargs=2, help='output files') par.add_argument('-split', type=float, default=0.8, help='data split') par.add_argument('-no_strat', action="store_true", help="do not stratified data for split") par.add_argument('-rand_seed', type=str, default="1234", help='randomization seed') par.add_argument('-cv', type=int, help="k-fold crossvalidation") return par if __name__ == "__main__": parser = get_parser() args = parser.parse_args() try: seed = int(args.rand_seed) except ValueError: seed = str2seed(args.rand_seed) datasets = [data_reader.read_dataset(d) for d in args.input] datasets = data_reader.flatten_list(datasets) if args.cv is not None: print "[seed ({}) | input: {} | cv: {} | strat: {}]".format(seed, args.input, \ args.cv, \ not args.no_strat) folds = data_reader.crossfolds(datasets, args.cv) # set_trace() for i, (train_split, test_split) in enumerate(folds): tr_fname = args.train+"_"+str(i+1) ts_fname = args.test+"_"+str(i+1) if args.dev is not None: dev_fname = args.dev+"_"+str(i+1) print "[saving: {} | {} | {} ]".format(tr_fname, ts_fname, dev_fname) train_split, dev_split = data_reader.shuffle_split(train_split, args.split, \ random_seed=seed)
def get_argparser():
    """Parse the command line for the NLSE model runner.

    Note: despite the name, this returns the parsed argument namespace,
    not the parser itself.
    """
    parser = argparse.ArgumentParser(prog='NLSE model runner')
    parser.add_argument('-model_path', help='Path where model is saved',
                        type=str, required=True)
    parser.add_argument('-data_path', required=True, type=str,
                        help='training data')
    parser.add_argument('-res_path', type=str, help='results file')
    return parser.parse_args(sys.argv[1:])


if __name__ == '__main__':
    # ARGUMENT HANDLING
    args = get_argparser()
    # load the trained classifier and the dataset to label
    clf = nlse.load_model(args.model_path)
    dataset = data.read_dataset(args.data_path)
    docs = [example[1] for example in dataset]
    labels = [example[0] for example in dataset]
    X, _ = vectorizer.docs2idx(docs, clf.vocab)
    # map numeric predictions back to their textual labels
    inv_label_map = {ix: label for label, ix in clf.label_map.items()}
    y_hat = [inv_label_map[y] for y in clf.predict(X)]
    # one line per instance: PREDICTION TAB GOLD_LABEL
    with open(args.res_path, "w") as fod:
        for y, x in zip(labels, y_hat):
            fod.write("{}\t{}\n".format(x, y))