def main():
    """Entry point: parse CLI options, build the feature matrix, and dispatch
    the requested experiment."""
    argparser = argparse.ArgumentParser('Lexical Entailment Classifier')
    argparser.add_argument('--data', '-d', help='Input file')
    argparser.add_argument('--space', '-s', help='Distributional space')
    argparser.add_argument('--model', '-m', help='Model setup', choices=models.SETUPS.keys())
    argparser.add_argument('--experiment', '-e', default='standard',
                           choices=('standard', 'random', 'match_error', 'featext', 'strat', 'levy'))
    argparser.add_argument('--stratifier')
    argparser.add_argument('--output', '-o')
    args = argparser.parse_args()

    logger.debug('Lexent Arguments: ')
    logger.debug(args)

    # Shared setup, identical for every experiment: load + normalize the space.
    logger.debug("Loading space")
    space = load_numpy(args.space, insertblank=True).normalize()

    # Handle vocabulary issues
    logger.debug("Reading data")
    data = load_data(args.data, space)

    logger.debug(" Model: %s" % args.model)
    model, features, hyper = models.load_setup(args.model)
    logger.debug(" Features: %s" % features)
    X, y = models.generate_feature_matrix(data, space, features)

    # Dispatch on the requested experiment.
    # NOTE(review): the other declared choices ('random', 'match_error',
    # 'strat', 'levy') fall through with no action here -- confirm whether
    # they are handled elsewhere or simply unimplemented.
    if args.experiment == 'standard':
        standard_experiment(data, X, y, model, hyper, args)
    elif args.experiment == 'featext':
        feature_extraction(X, y, model, space, data)
def main():
    """Train a context-prediction model over a distributional space.

    Loads and standardizes the space, then streams batches from the corpus
    reader: every 1000 batches it logs validation/test/intrinsic scores to
    the console and a CSV file, and every 5000 batches it checkpoints the
    model weights.

    Fixes vs. original: removed a dead ``if True:`` guard and commented-out
    code; ``batch // 1000`` uses floor division explicitly (identical to the
    original under Python 2, and still an int under Python 3).
    """
    parser = argparse.ArgumentParser('description')
    parser.add_argument('--logfolder', '-l', help='Log folder.')
    parser.add_argument('--csvfolder', '-c', help='Output CSV folder for graphs.')
    parser.add_argument('--output', '-o', help='Folder for saving output models.')
    parser.add_argument('--model', '-m', help='Selects a particular model.')
    parser.add_argument('--maxbatches', '-B', default=0, type=int,
                        help='Maximum number of batches to process (in thousands).')
    parser.add_argument('--batchsize', '-b', type=int, default=BATCH_SIZE, help='Batch size')
    parser.add_argument('--dimensions', '-d', type=int, default=0,
                        help='Number of dimensions from the space to use. If 0 (default), use all.')
    parser.add_argument('--learningrate', '-r', type=float, default=LEARNING_RATE, help='Learning rate')
    args = parser.parse_args()

    logger.debug("Reading distributional space '%s'" % SPACE_FILENAME)
    space = load_numpy(SPACE_FILENAME, insertblank=True)
    if args.dimensions:
        # Optionally truncate the space to the first N dimensions.
        space.matrix = space.matrix[:, :args.dimensions]

    # Standardize each dimension using statistics over the real rows only:
    # row 0 is the inserted blank vector, so it is excluded from the stats
    # and zeroed afterwards.  The extra *10 on the std shrinks values further;
    # presumably to keep activations small -- TODO(review) confirm intent.
    m = space.matrix
    norm_mean = m[1:].mean(axis=0)
    norm_std = (m[1:].std(axis=0) * 10)
    m = (m - norm_mean) / norm_std
    m[0] = 0
    space.matrix = m
    logger.debug("Finished reading space")
    logger.debug("Space contains %d words with %d dimensions each." % space.matrix.shape)

    cbr = CorpusBatchReader(CORPUS_FOLDER, space, batch_size=args.batchsize)
    data_iterator = DataIterator(cbr, epochs=1, maxbatches=args.maxbatches * 1000)
    HIDDEN = space.matrix.shape[1]

    logger.debug("Compiling compute graph")
    R = data_iterator.test[0].shape[1]
    model = models.get_model(args.model, space, R, HIDDEN, args.learningrate)
    modelinfo = {
        'model': args.model,
        'learningrate': args.learningrate,
        'hidden': HIDDEN,
        'space': SPACE_FILENAME,
        'dimensions': space.matrix.shape[1],
    }
    filename = _generate_filename(modelinfo)
    csvlog = CSVLogger(os.path.join(args.csvfolder, filename + ".csv"))
    logger.debug("Compilation finished")
    if DEBUG:
        logger.debug("Theano compute graph:\n" +
                     debugprint(model._train.maker.fgraph.outputs[0], file='str'))

    logger.debug("Starting training")
    start_time = datetime.now()
    for X, Y in data_iterator:
        trainscore = model.train_on_batch(X, Y)
        if data_iterator.batch % 1000 == 0:
            # Periodic evaluation: val/test scores plus intrinsic rank.
            valscore = model.evaluate(*data_iterator.val, verbose=False)
            testscore = model.evaluate(*data_iterator.test, verbose=False)
            progress = data_iterator.progress()
            elapsed = (datetime.now() - start_time)
            rank = intrinsic_eval(model, space, data_iterator.test[0], data_iterator.test[1])
            eta = _compute_eta(start_time, progress)
            batchinfo = dict(
                epoch=data_iterator.epoch,
                kbatch=data_iterator.batch // 1000,  # explicit floor division
                trainscore=trainscore,
                valscore=valscore,
                testscore=testscore,
                intrinsic=rank,
                progress=100 * progress,
                elapsed=elapsed.total_seconds(),
                eta=eta,
            )
            info = _dictmerge(batchinfo, modelinfo)
            logger.debug("%(epoch)3d ep %(kbatch)8d Kba %(intrinsic)6.4f / %(valscore)8.5f / %(testscore)8.5f [%(progress)5.1f%% eta %(eta)s]" % info)
            # eta is console-only; keep it out of the CSV columns.
            del info['eta']
            csvlog.append(info)
        if data_iterator.batch % 5000 == 0:
            checkpoint_filename = os.path.join(args.output, "%s__batch%08d.hd5" % (filename, data_iterator.batch))
            logger.debug("Checkpointing model to %s" % checkpoint_filename)
            model.save_weights(checkpoint_filename, overwrite=True)
# NOTE(review): fragment -- the enclosing function header and the construction
# of `parser` are not visible in this chunk, and the `if ARGS.solo:` body
# appears truncated.  Code left byte-identical; comments only.
parser.add_argument('--output')
parser.add_argument('--detailed', action='store_true')
ARGS = parser.parse_args()
# Echo every parsed argument to stderr so the run configuration is recorded.
sys.stderr.write("Arguments:\n")
for k, v in ARGS.__dict__.iteritems():  # Python 2 dict iteration
    sys.stderr.write(" %s: %s\n" % (k, v))
sys.stderr.write("\n")
sys.stderr.write("Using data file: %s\n" % DATA_FILE)
sys.stderr.write("Using test file: %s\n" % TEST_FILE)
# Load the two distributional spaces; the second argument presumably toggles
# insertion of a blank vector -- TODO(review) confirm against utdeftvs.
sys.stderr.write("Loading spaces...\n")
bow_space = utdeftvs.load_numpy(WINDOW_SPACE)
dep_space = utdeftvs.load_numpy(DEP_SPACE, True)
sys.stderr.write("Reading data...\n")
data = pd.read_table(DATA_FILE)
test_data = pd.read_table(TEST_FILE)
# Parse the text/hypothesis columns of both train and test sets; column names
# suggest CoreNLP-style parses -- confirm against parse_sentences.
sys.stderr.write("Parsing...\n")
data['corenlp_left'] = parse_sentences(data['text'])
data['corenlp_right'] = parse_sentences(data['hypothesis'])
test_data['corenlp_left'] = parse_sentences(test_data['text'])
test_data['corenlp_right'] = parse_sentences(test_data['hypothesis'])
if ARGS.solo:
    # only look at items that need a single rule
    pairCounts = Counter(data['pairIndex'])
def main():
    """Run the lexical-entailment cross-validation experiment.

    Loads the space and the word-pair data, drops pairs with out-of-vocabulary
    words, trains/predicts across LHS-disjoint folds, writes per-pair
    predictions to a CSV, and logs F1 summaries.

    Fixes vs. original: the "OOV" log lines previously printed the
    *in-vocabulary* counts (the masks are True when the word IS in the
    lookup); ``dtype=np.float`` replaced with ``float`` (the alias was removed
    from numpy); the final ``logger.error`` concatenated a str with a
    DataFrame, which would raise TypeError exactly when the warning fired.
    """
    parser = argparse.ArgumentParser('Lexical Entailment Classifier')
    parser.add_argument('--data', '-d', help='Input file')
    parser.add_argument('--space', '-s', help='Distributional space')
    parser.add_argument('--seed', '-S', default=1, type=int, help='Random seed')
    parser.add_argument('--model', '-m', help='Model setup', choices=models.SETUPS.keys())
    parser.add_argument('--experiment', '-e', default='standard', choices=('standard', 'match_error'))
    parser.add_argument('--output', '-o', default='results')
    args = parser.parse_args()
    logger.debug('Lexent Arguments: ')
    logger.debug(args)

    rng = np.random.RandomState(args.seed)

    logger.debug("Loading space")
    space = load_numpy(args.space).normalize()

    # Handle vocabulary issues: data words are bare, space keys are
    # POS-tagged, so lowercase and append /NN before the lookup.
    logger.debug("Reading data")
    data = pd.read_table("data/%s/data.tsv" % args.data, header=None,
                         names=('word1', 'word2', 'entails'))
    data['word1'] = data['word1'].apply(lambda x: x.lower() + '/NN')
    data['word2'] = data['word2'].apply(lambda x: x.lower() + '/NN')
    # masks are True when the word IS in the vocabulary
    mask1 = data.word1.apply(lambda x: x in space.lookup)
    mask2 = data.word2.apply(lambda x: x in space.lookup)
    T = len(data)
    # BUGFIX: report the actual out-of-vocabulary counts (complement of the
    # masks); the original logged the in-vocab counts under the "OOV" label.
    oov1 = T - np.sum(mask1)
    oov2 = T - np.sum(mask2)
    logger.debug("")
    logger.debug("Total Data: %6d" % T)
    logger.debug(" LHS OOV: %6d ( %4.1f%% )" % (oov1, oov1 * 100. / T))
    logger.debug(" RHS OOV: %6d ( %4.1f%% )" % (oov2, oov2 * 100. / T))
    # Keep only pairs with both words in vocabulary.
    data = data[mask1 & mask2].reset_index(drop=True)
    F = len(data)
    logger.debug(" Final: %6d ( %4.1f%% )" % (F, F * 100. / T))
    logger.debug("")

    logger.debug("Generating %d folds..." % N_FOLDS)
    # need our folds for cross validation
    folds = fold.generate_folds_lhs(rng, data, n_folds=N_FOLDS)
    # dtype=float: np.float was a deprecated alias and has been removed
    train_sizes = np.array([len(f[0]) for f in folds], dtype=float)
    test_sizes = np.array([len(f[1]) for f in folds], dtype=float)
    logger.debug("Training sizes: %.1f" % np.mean(train_sizes))
    logger.debug(" Test sizes: %.1f" % np.mean(test_sizes))
    logger.debug(" Test-Tr ratio: %.1f%%" % np.mean(test_sizes*100./(train_sizes + test_sizes)))
    logger.debug(" Percent data: %.1f%%" % np.mean((train_sizes + test_sizes)*100./F))
    logger.debug("")

    logger.debug("Setting up the model:")
    model, features = models.load_setup(args.model)

    # dwp = data with predictions; fold == -1 marks "never predicted"
    dwp = data.copy()
    dwp['prediction'] = False
    dwp['fold'] = -1

    logger.debug("Generating features")
    X, y = models.generate_feature_matrix(data, space, features)

    # perform cross validation
    logger.debug("Performing experiment: %s" % args.experiment)
    scores = []
    for foldno, (train, test) in enumerate(folds):
        logger.debug(" ... fold %2d/%2d" % (foldno, N_FOLDS))
        train_X, train_y = X[train], y[train]
        test_X, test_y = X[test], y[test]
        model.fit(train_X, train_y)
        preds_y = model.predict(test_X)
        dwp.loc[test, 'prediction'] = preds_y
        dwp.loc[test, 'fold'] = foldno
        scores.append(metrics.f1_score(test_y, preds_y))

    logger.info("F1 across CV: %.3f" % np.mean(scores))
    logger.info(" std: %.3f" % np.std(scores))
    logger.info(" F1 pooled: %.3f" % metrics.f1_score(dwp['entails'], dwp['prediction']))

    dwp.to_csv("%s/exp:%s,data:%s,space:%s,model:%s,seed:%d.csv" % (
        args.output, args.experiment, args.data,
        os.path.basename(args.space), args.model, args.seed
    ), index=False)

    # Sanity check: every row should have been assigned to exactly one fold.
    # BUGFIX: format the DataFrame into the message instead of str+DataFrame
    # concatenation, which raised TypeError.
    if len(dwp[dwp['fold'] == -1]) != 0:
        logger.error("Some of the data wasn't predicted!\n%s" % dwp[dwp['fold'] == -1])
def main():
    """Score lexical-substitution candidates with a trained model or a baseline.

    Exactly one of --model/--baseline must be given.  Prints GAP and
    precision@k summaries; optionally writes per-item predictions (--save)
    and semeval-format output files (--semeval, requires --allvocab).
    """
    parser = argparse.ArgumentParser('Performs lexical substitution')
    parser.add_argument('--model', '-m')
    parser.add_argument('--data', '-d')
    parser.add_argument('--allvocab', action='store_true')
    parser.add_argument('--baseline', choices=('oren', 'random', 'ooc', 'oracle', 'orensm', 'baloren'))
    parser.add_argument('--save')
    parser.add_argument('--semeval')
    args = parser.parse_args()
    # Argument validation: model XOR baseline; semeval output needs allvocab.
    if (args.model and args.baseline) or (not args.model and not args.baseline):
        raise ValueError("Please supply exactly one of model or baseline.")
    if args.semeval and not args.allvocab:
        raise ValueError("Need to evaluate on allvocab to output semeval predictions.")
    if not args.data:
        raise ValueError("You must specify a data folder")

    # load the data
    semeval = LexsubData(args.data)
    # NOTE(review): hard-coded cluster path to the embedding space.
    space = utdeftvs.load_numpy("/work/01813/roller/maverick/nnexp/lexsub_embeddings.npz", True)
    #relations = read_relationships("/work/01813/roller/maverick/nnexp/relations.txt", 1000)
    #model = ctxpredict.models.get_model("2d", space, len(relations), space.matrix.shape[1])

    # load the space
    #space = utdeftvs.load_numpy("/scratch/cluster/roller/spaces/giga+bnc+uk+wiki2015/output/dependency.svd300.ppmi.250k.1m.npz", True)
    #space = utdeftvs.load_numpy("/scratch/cluster/roller/spaces/giga+bnc+uk+wiki2015/dependency/output/dependency.w2v500.top250k.top1m.npz", True)
    #space = utdeftvs.load_numpy("/scratch/cluster/roller/spaces/levy/lexsub_embeddings.npz", True)

    # need to map our vocabulary to their indices
    ids, targets, candidates, scores = semeval.generate_matrices(space.lookup)
    depmat = dependencies_to_indices(semeval.tokens, semeval.parses, space.clookup, space)
    print "Done preprocessing"

    if args.allvocab:
        # Dense (items x vocab) gold-score matrix, scattered out of the
        # per-item candidate lists; candidate id 0 is padding.
        allvocab_scores = np.zeros((len(targets), len(space.vocab)))
        for i in xrange(len(targets)):
            for j in xrange(candidates.shape[1]):
                c = candidates[i,j]
                s = scores[i,j]
                if s > 0:
                    allvocab_scores[i,c] = s
    # Default prediction scores; overwritten by most branches below.
    # NOTE(review): original indentation is ambiguous in this chunk -- this
    # may have been nested under `if args.allvocab:`; confirm against history.
    allvocab_pred_scores = np.zeros(len(space.vocab))

    if args.baseline:
        print "Computing baseline %s" % args.baseline
        if args.baseline == 'oren':
            pred_scores = compute_oren(space, targets, depmat, candidates)
            if args.allvocab:
                allvocab_pred_scores = compute_oren_allvocab(space, targets, depmat)
        elif args.baseline == 'baloren':
            pred_scores = compute_oren(space, targets, depmat, candidates, balanced=True)
            if args.allvocab:
                allvocab_pred_scores = compute_oren_allvocab(space, targets, depmat, balanced=True)
        elif args.baseline == 'orensm':
            pred_scores = compute_softmax_oren(space, targets, depmat, candidates)
            if args.allvocab:
                allvocab_pred_scores = compute_softmax_oren_allvocab(space, targets, depmat)
        elif args.baseline == 'ooc':
            pred_scores = compute_ooc(space, targets, candidates)
            if args.allvocab:
                allvocab_pred_scores = compute_ooc_allvocab(space, targets)
        elif args.baseline == 'random':
            pred_scores = compute_random(candidates)
        elif args.baseline == 'oracle':
            pred_scores = compute_oracle(candidates, scores, space)
        else:
            pred_scores = np.zeros(candidates.shape)
        modelname = "baseline"
        modelinfo = args.baseline
    elif args.model:
        # Restore the trained model: architecture from JSON, weights from the
        # lexicographically-last checkpoint file in the model folder.
        model = my_model_from_json(args.model + "/model.json")
        filename = sorted(os.listdir(args.model))[-1]
        my_load_weights(model, "%s/%s" % (args.model, filename))
        pred_scores = compute_mymodel(space, targets, model, depmat, candidates)
        pred_scores = fix_pred_scores(pred_scores, candidates)
        if args.allvocab:
            allvocab_pred_scores = compute_mymodel_allwords(space, targets, model, depmat)
        modelname = args.model
        modelinfo = filename
    else:
        raise ValueError("Not given model or baseline to compute...")

    # make sure we're not guessing the target, or the empty vector
    # NOTE(review): for the --model branch this is the second fix_pred_scores
    # call on the same matrix (harmless only if idempotent -- confirm).
    pred_scores = fix_pred_scores(pred_scores, candidates)
    if args.allvocab:
        allvocab_pred_scores = fix_allvocab_pred_scores(allvocab_pred_scores, targets)
        allvocab_pred_scores = fix_lemma_problem(allvocab_pred_scores, targets, space)

    # compute evaluations
    gaps = many_gaps(pred_scores, candidates, scores)
    prec1s = many_prec1(pred_scores, scores)
    prec3s = prec_at_k(pred_scores, scores, 3)
    # allvocab is slow; only compute that if we have to
    if args.allvocab:
        prec1s_av = many_prec1(allvocab_pred_scores, allvocab_scores)
        prec3s_av = prec_at_k(allvocab_pred_scores, allvocab_scores, 3)
    else:
        prec1s_av = np.zeros(len(targets))
        prec3s_av = np.zeros(len(targets))

    # One tab-separated summary line; nanmean ignores items with no gold data.
    print ("%s\t%s\t%s\tgap %.3f\tp@1 %.3f\tp@3 %.3f\tp@1av %.3f\tp@3av %.3f" %
           (args.data, modelname, modelinfo, nanmean(gaps), nanmean(prec1s),
            nanmean(prec3s), nanmean(prec1s_av), nanmean(prec3s_av)))

    if args.save:
        # Per-item TSV dump: gold scores and ranked predictions per sentence.
        with open(args.save, 'w') as f:
            f.write('\t'.join(['ident', 'target', 'sentence', 'gold', 'predicted', 'gap', 'p@1', 'p@3', 'p@1av', 'p@3av']))
            f.write('\n')
            for i in xrange(len(semeval.idents)):
                ident = semeval.idents[i]
                target = semeval.targets[i]
                parse = semeval.parses[i]
                scores_i = scores[i]
                pred_scores_i = pred_scores[i]
                candidates_i = candidates[i]
                gap = gaps[i]
                prec1 = prec1s[i]
                prec3 = prec3s[i]
                prec1av = prec1s_av[i]
                prec3av = prec3s_av[i]
                sentence = " ".join(t.word_normed for t in parse.tokens)
                # Candidate id 0 is the padding/empty slot; skip it.
                score_string = " ".join("%s:%3.1f" % (space.vocab[c], s) for c, s in zip(candidates_i, scores_i) if c != 0)
                pred_string = " ".join("%s:%f" % (space.vocab[c], p) for c, p in revsorted(zip(candidates_i, pred_scores_i)) if c != 0)
                outline = '\t'.join([str(ident), target, sentence, score_string, pred_string, str(gap), str(prec1), str(prec3), str(prec1av), str(prec3av)])
                # ASCII-fold the line before writing.
                outline = unidecode(outline)
                f.write(outline)
                f.write('\n')

    if args.semeval:
        # Semeval-format outputs: single best guess (.best) and 10-best (.boot).
        bestf = open(args.semeval + ".best", "w")
        bootf = open(args.semeval + ".boot", "w")
        bests = allvocab_pred_scores.argmax(axis=1)
        boots = np.argpartition(allvocab_pred_scores, -10, 1)[:,-10:]
        for i in xrange(len(semeval.idents)):
            ident = semeval.idents[i]
            ot = semeval.original_targets[i]
            bestf.write("%s %d :: %s\n" % (ot, ident, space.vocab[bests[i]]))
            bootf.write("%s %d ::: %s\n" % (ot, ident, ";".join(space.vocab[boots[i]])))
        bestf.close()
        bootf.close()
def save_chunk(outfolder, out_counter, targetids, rowids, colids, datavals):
    """Write one chunk of (target ids, sparse context counts) to an .npz file.

    The filename is zero-padded to six digits so lexicographic directory
    listings keep chunks in creation order.
    """
    targetoutputs = np.array(targetids, dtype=np.int32)
    contexts = scipy.sparse.csr_matrix((datavals, (rowids, colids)), dtype=np.int32)
    outputname = "%s/chunk_%06d.npz" % (outfolder, out_counter)
    np.savez_compressed(outputname, targets=targetoutputs, contexts=contexts)


def main():
    """Stream (target, relation-context) items into chunked sparse matrices.

    For each item, duplicate relations are collapsed (keeping the max context
    value), rows with fewer than --mindeps attachments are skipped, and the
    accumulated triplets are flushed to compressed .npz chunks.

    Fixes vs. original: chunk filenames were padded inconsistently
    (chunk_%04d in the loop vs chunk_%06d in the final flush and in the
    progress message) and the final flush omitted dtype=np.int32 on the
    targets array; both paths now go through save_chunk.
    """
    parser = argparse.ArgumentParser('description')
    parser.add_argument('--input', '-i', default='-', help='Input corpus')
    parser.add_argument('--output', '-o', help='Output numpy file')
    parser.add_argument('--relations', '-r', help='Relations file')
    parser.add_argument('--space', '-s', help='Space filename')
    parser.add_argument('--mindeps', '-m', type=int, default=1, help='Minimum number of attachments to store in matrix.')
    parser.add_argument('--maxrels', '-M', type=int, default=1000, help='Maximum number of relationships to model.')
    args = parser.parse_args()

    space = load_numpy(args.space, insertblank=True)
    rels = read_relationships(args.relations, args.maxrels)

    # COO-style accumulators for the current chunk.
    targetids = []
    rowids = []
    colids = []
    datavals = []
    num_rows = 0
    num_skipped = 0
    num_overlap = 0            # duplicate (rel, ctx) attachments collapsed away
    num_rows_with_overlap = 0
    out_counter = 0
    rowid = 0
    for targetid, relcontexts in pull_items(args.input, space, rels):
        # Collapse duplicate relations, keeping the strongest context value.
        relcontexts_d = {}
        for rel, ctx in relcontexts:
            relcontexts_d[rel] = max(relcontexts_d.get(rel, 0), ctx)
        if len(relcontexts_d) < args.mindeps:
            num_skipped += 1
            continue
        num_rows += 1
        overlap = len(relcontexts) - len(relcontexts_d)
        if overlap:
            num_overlap += overlap
            num_rows_with_overlap += 1
        for rel, ctx in relcontexts_d.items():  # items(): same iteration on Py2 and Py3
            rowids.append(rowid)
            colids.append(rel)
            datavals.append(ctx)
        targetids.append(targetid)
        rowid += 1
        # magic number means ~25MB output files, while being able
        # to be broken into nice 128 row chunks
        # This is important so that we can keep the memory usage down low later
        if rowid >= 2097152:
            print("\nSaving chunk %06d" % out_counter)
            save_chunk(args.output, out_counter, targetids, rowids, colids, datavals)
            rowid = 0
            targetids = []
            rowids = []
            colids = []
            datavals = []
            out_counter += 1
    if targetids:
        # Flush the final partial chunk through the same path as the loop.
        save_chunk(args.output, out_counter, targetids, rowids, colids, datavals)
    print("Number of accepted rows: %d" % num_rows)
    print(" Number of skipped rows: %d" % num_skipped)
    print(" Number of overlapping: %d" % num_overlap)
    print(" Number of rows w/ ovr: %d" % num_rows_with_overlap)