Example #1
0
def main():
    # Argument parsing
    parser = argparse.ArgumentParser(description='Lexical Entailment Classifier')
    parser.add_argument('--data', '-d', help='Input file')
    parser.add_argument('--space', '-s', help='Distributional space')
    parser.add_argument('--model', '-m', help='Model setup', choices=models.SETUPS.keys())
    parser.add_argument('--experiment', '-e', default='standard', choices=('standard', 'random', 'match_error', 'featext', 'strat', 'levy'))
    parser.add_argument('--stratifier')
    parser.add_argument('--output', '-o')
    args = parser.parse_args()

    logger.debug('Lexent Arguments: ')
    logger.debug(args)

    # Steps that are the same regardless of experiments
    logger.debug("Loading space")
    nn_space = load_numpy(args.space, insertblank=True)
    space = nn_space.normalize()

    # Handle vocabulary issues
    logger.debug("Reading data")
    data = load_data(args.data, space)

    logger.debug("         Model: %s" % args.model)
    model, features, hyper = models.load_setup(args.model)

    logger.debug("      Features: %s" % features)
    X, y = models.generate_feature_matrix(data, space, features)

    if args.experiment == 'standard':
        standard_experiment(data, X, y, model, hyper, args)
    elif args.experiment == 'featext':
        feature_extraction(X, y, model, space, data)
Example #2
0
def main():
    parser = argparse.ArgumentParser(description='description')
    parser.add_argument('--logfolder', '-l', help='Log folder.')
    parser.add_argument('--csvfolder', '-c', help='Output CSV folder for graphs.')
    parser.add_argument('--output', '-o', help='Folder for saving output models.')
    parser.add_argument('--model', '-m', help='Selects a particular model.')
    parser.add_argument('--maxbatches', '-B', default=0, type=int, help='Maximum number of batches to process (in thousands).')
    parser.add_argument('--batchsize', '-b', type=int, default=BATCH_SIZE, help='Batch size')
    parser.add_argument('--dimensions', '-d', type=int, default=0, help='Number of dimensions from the space to use. If 0 (default), use all.')
    parser.add_argument('--learningrate', '-r', type=float, default=LEARNING_RATE, help='Learning rate')
    args = parser.parse_args()

    logger.debug("Reading distributional space '%s'" % SPACE_FILENAME)
    space = load_numpy(SPACE_FILENAME, insertblank=True)
    if args.dimensions:
        space.matrix = space.matrix[:,:args.dimensions]
    # standardize the space: center each dimension and scale so the
    # per-dimension std is 0.1; row 0 (the inserted blank) stays all zeros
    m = space.matrix
    norm_mean = m[1:].mean(axis=0)
    norm_std = m[1:].std(axis=0) * 10
    m = (m - norm_mean) / norm_std
    m[0] = 0
    space.matrix = m
    #space = space.normalize()
    logger.debug("Finished reading space")
    logger.debug("Space contains %d words with %d dimensions each." % space.matrix.shape)

    cbr = CorpusBatchReader(CORPUS_FOLDER, space, batch_size=args.batchsize)
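    # --maxbatches is given in thousands (see its help string), so scale it for the iterator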
    data_iterator = DataIterator(cbr, epochs=1, maxbatches=args.maxbatches * 1000)

    HIDDEN = space.matrix.shape[1]

    logger.debug("Compiling compute graph")
    R = data_iterator.test[0].shape[1]
    model = models.get_model(args.model, space, R, HIDDEN, args.learningrate)

    modelinfo = {
        'model': args.model,
        'learningrate': args.learningrate,
        'hidden': HIDDEN,
        'space': SPACE_FILENAME,
        'dimensions': space.matrix.shape[1],
    }

    filename = _generate_filename(modelinfo)
    csvlog = CSVLogger(os.path.join(args.csvfolder, filename + ".csv"))

    logger.debug("Compilation finished")
    if DEBUG:
        logger.debug("Theano compute graph:\n" + debugprint(model._train.maker.fgraph.outputs[0], file='str'))

    logger.debug("Starting training")
    start_time = datetime.now()
    for X, Y in data_iterator:
        trainscore = model.train_on_batch(X, Y)

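        # every 1,000 batches: evaluate on the val and test splits, then log to console and CSV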
        if data_iterator.batch % 1000 == 0:
            valscore = model.evaluate(*data_iterator.val, verbose=False)
            testscore = model.evaluate(*data_iterator.test, verbose=False)
            progress = data_iterator.progress()
            elapsed = (datetime.now() - start_time)
            rank = intrinsic_eval(model, space, data_iterator.test[0], data_iterator.test[1])
            #rank = 0.0
            eta = _compute_eta(start_time, progress)
            batchinfo = dict(
                epoch=data_iterator.epoch,
                kbatch=data_iterator.batch/1000,
                trainscore=trainscore,
                valscore=valscore,
                testscore=testscore,
                intrinsic=rank,
                progress=100 * progress,
                elapsed=elapsed.total_seconds(),
                eta=eta
            )
            info = _dictmerge(batchinfo, modelinfo)
            logger.debug("%(epoch)3d ep %(kbatch)8d Kba %(intrinsic)6.4f / %(valscore)8.5f / %(testscore)8.5f [%(progress)5.1f%% eta %(eta)s]" % info)
            del info['eta']
            csvlog.append(info)

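        # every 5,000 batches: checkpoint the model weights to the output folder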
        if data_iterator.batch % 5000 == 0:
            checkpoint_filename = os.path.join(args.output, "%s__batch%08d.hd5" % (filename, data_iterator.batch))
            logger.debug("Checkpointing model to %s" % checkpoint_filename)
            model.save_weights(checkpoint_filename, overwrite=True)
Example #3
0
def main():
    # (The opening of this example was truncated. The two lines below are a
    # minimal reconstruction: ARGS.solo further down implies a --solo flag
    # was defined in the missing portion.)
    parser = argparse.ArgumentParser()
    parser.add_argument('--solo', action='store_true')
    parser.add_argument('--output')
    parser.add_argument('--detailed', action='store_true')

    ARGS = parser.parse_args()

    sys.stderr.write("Arguments:\n")
    for k, v in ARGS.__dict__.iteritems():
        sys.stderr.write("    %s: %s\n" % (k, v))
    sys.stderr.write("\n")

    sys.stderr.write("Using data file: %s\n" % DATA_FILE)
    sys.stderr.write("Using test file: %s\n" % TEST_FILE)

    sys.stderr.write("Loading spaces...\n")
    bow_space = utdeftvs.load_numpy(WINDOW_SPACE)
    dep_space = utdeftvs.load_numpy(DEP_SPACE, True)

    sys.stderr.write("Reading data...\n")
    data = pd.read_table(DATA_FILE)
    test_data = pd.read_table(TEST_FILE)

    sys.stderr.write("Parsing...\n")
    data['corenlp_left'] = parse_sentences(data['text'])
    data['corenlp_right'] = parse_sentences(data['hypothesis'])
    test_data['corenlp_left'] = parse_sentences(test_data['text'])
    test_data['corenlp_right'] = parse_sentences(test_data['hypothesis'])

    if ARGS.solo:
        # only look at items that need a single rule
        pairCounts = Counter(data['pairIndex'])
Example #4
0
def main():
    parser = argparse.ArgumentParser(description='Lexical Entailment Classifier')
    parser.add_argument('--data', '-d', help='Input file')
    parser.add_argument('--space', '-s', help='Distributional space')
    parser.add_argument('--seed', '-S', default=1, type=int, help='Random seed')
    parser.add_argument('--model', '-m', help='Model setup', choices=models.SETUPS.keys())
    parser.add_argument('--experiment', '-e', default='standard', choices=('standard', 'match_error'))
    parser.add_argument('--output', '-o', default='results')
    args = parser.parse_args()

    logger.debug('Lexent Arguments: ')
    logger.debug(args)

    rng = np.random.RandomState(args.seed)

    logger.debug("Loading space")
    space = load_numpy(args.space).normalize()

    # Handle vocabulary issues
    logger.debug("Reading data")
    data = pd.read_table("data/%s/data.tsv" % args.data, header=None, names=('word1', 'word2', 'entails'))
    data['word1'] = data['word1'].apply(lambda x: x.lower() + '/NN')
    data['word2'] = data['word2'].apply(lambda x: x.lower() + '/NN')

    mask1 = data.word1.apply(lambda x: x in space.lookup)
    mask2 = data.word2.apply(lambda x: x in space.lookup)


    T = len(data)
    M1T = np.sum(mask1)
    M2T = np.sum(mask2)
    logger.debug("")
    logger.debug("Total Data: %6d" % T)
    # mask1/mask2 flag in-vocabulary pairs, so the OOV counts are T - M1T and T - M2T
    logger.debug("   LHS OOV: %6d ( %4.1f%% )" % (T - M1T, (T - M1T)*100./T))
    logger.debug("   RHS OOV: %6d ( %4.1f%% )" % (T - M2T, (T - M2T)*100./T))
    data = data[mask1 & mask2].reset_index(drop=True)
    F = len(data)
    logger.debug("     Final: %6d ( %4.1f%% )" % (F, F*100./T))
    logger.debug("")

    logger.debug("Generating %d folds..." % N_FOLDS)

    # need our folds for cross validation
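    # generate_folds_lhs presumably splits by the left-hand word, so no LHS word
    # appears in both a training and a test fold (avoiding lexical memorization)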
    folds = fold.generate_folds_lhs(rng, data, n_folds=N_FOLDS)
    train_sizes = np.array([len(f[0]) for f in folds], dtype=np.float)
    test_sizes = np.array([len(f[1]) for f in folds], dtype=np.float)

    logger.debug("Training sizes: %.1f" % np.mean(train_sizes))
    logger.debug("    Test sizes: %.1f" % np.mean(test_sizes))
    logger.debug(" Test-Tr ratio: %.1f%%" % np.mean(test_sizes*100./(train_sizes + test_sizes)))
    logger.debug(" Percent data: %.1f%%" % np.mean((train_sizes + test_sizes)*100./F))
    logger.debug("")

    logger.debug("Setting up the model:")
    model, features = models.load_setup(args.model)


    # dwp = data with predictions
    dwp = data.copy()
    dwp['prediction'] = False
    dwp['fold'] = -1

    logger.debug("Generating features")
    X, y = models.generate_feature_matrix(data, space, features)

    # perform cross validation
    logger.debug("Performing experiment: %s" % args.experiment)
    scores = []
    for foldno, (train, test) in enumerate(folds):
        logger.debug("   ... fold %2d/%2d" % (foldno, N_FOLDS))
        # generate features
        train_X, train_y = X[train], y[train]
        test_X, test_y = X[test], y[test]

        model.fit(train_X, train_y)
        preds_y = model.predict(test_X)
        dwp.loc[test,'prediction'] = preds_y
        dwp.loc[test,'fold'] = foldno
        scores.append(metrics.f1_score(test_y, preds_y))

    logger.info("F1 across CV: %.3f" % np.mean(scores))
    logger.info("         std: %.3f" % np.std(scores))
    logger.info("   F1 pooled: %.3f" % metrics.f1_score(dwp['entails'], dwp['prediction']))


    dwp.to_csv("%s/exp:%s,data:%s,space:%s,model:%s,seed:%d.csv" % (
        args.output, args.experiment, args.data,
        os.path.basename(args.space),
        args.model, args.seed
        ), index=False)

    if len(dwp[dwp['fold'] == -1]) != 0:
        logger.error("Some of the data wasn't predicted!\n%s" %
                     dwp[dwp['fold'] == -1])
Example #5
0
def main():
    parser = argparse.ArgumentParser(description='Performs lexical substitution')
    parser.add_argument('--model', '-m')
    parser.add_argument('--data', '-d')
    parser.add_argument('--allvocab', action='store_true')
    parser.add_argument('--baseline', choices=('oren', 'random', 'ooc', 'oracle', 'orensm', 'baloren'))
    parser.add_argument('--save')
    parser.add_argument('--semeval')
    args = parser.parse_args()

    if (args.model and args.baseline) or (not args.model and not args.baseline):
        raise ValueError("Please supply exactly one of model or baseline.")
    if args.semeval and not args.allvocab:
        raise ValueError("Need to evaluate on allvocab to output semeval predictions.")

    if not args.data:
        raise ValueError("You must specify a data folder")

    # load the data
    semeval = LexsubData(args.data)
    space = utdeftvs.load_numpy("/work/01813/roller/maverick/nnexp/lexsub_embeddings.npz", True)
    #relations = read_relationships("/work/01813/roller/maverick/nnexp/relations.txt", 1000)
    #model = ctxpredict.models.get_model("2d", space, len(relations), space.matrix.shape[1])
    # alternative spaces, kept commented out:
    #space = utdeftvs.load_numpy("/scratch/cluster/roller/spaces/giga+bnc+uk+wiki2015/output/dependency.svd300.ppmi.250k.1m.npz", True)
    #space = utdeftvs.load_numpy("/scratch/cluster/roller/spaces/giga+bnc+uk+wiki2015/dependency/output/dependency.w2v500.top250k.top1m.npz", True)
    #space = utdeftvs.load_numpy("/scratch/cluster/roller/spaces/levy/lexsub_embeddings.npz", True)
    # need to map our vocabulary to their indices
    ids, targets, candidates, scores = semeval.generate_matrices(space.lookup)
    depmat = dependencies_to_indices(semeval.tokens, semeval.parses, space.clookup, space)
    print "Done preprocessing"


    if args.allvocab:
        allvocab_scores = np.zeros((len(targets), len(space.vocab)))
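        # scatter the (sparse) gold candidate scores into a dense item-by-vocabulary matrix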
        for i in xrange(len(targets)):
            for j in xrange(candidates.shape[1]):
                c = candidates[i,j]
                s = scores[i,j]
                if s > 0:
                    allvocab_scores[i,c] = s
    # the placeholder must be 2D: later code indexes it as (item, vocab)
    allvocab_pred_scores = np.zeros((len(targets), len(space.vocab)))
    if args.baseline:
        print "Computing baseline %s" % args.baseline
        if args.baseline == 'oren':
            pred_scores = compute_oren(space, targets, depmat, candidates)
            if args.allvocab:
                allvocab_pred_scores = compute_oren_allvocab(space, targets, depmat)
        elif args.baseline == 'baloren':
            pred_scores = compute_oren(space, targets, depmat, candidates, balanced=True)
            if args.allvocab:
                allvocab_pred_scores = compute_oren_allvocab(space, targets, depmat, balanced=True)
        elif args.baseline == 'orensm':
            pred_scores = compute_softmax_oren(space, targets, depmat, candidates)
            if args.allvocab:
                allvocab_pred_scores = compute_softmax_oren_allvocab(space, targets, depmat)
        elif args.baseline == 'ooc':
            pred_scores = compute_ooc(space, targets, candidates)
            if args.allvocab:
                allvocab_pred_scores = compute_ooc_allvocab(space, targets)
        elif args.baseline == 'random':
            pred_scores = compute_random(candidates)
        elif args.baseline == 'oracle':
            pred_scores = compute_oracle(candidates, scores, space)
        else:
            pred_scores = np.zeros(candidates.shape)
        modelname = "baseline"
        modelinfo = args.baseline
    elif args.model:
        model = my_model_from_json(args.model + "/model.json")
        filename = sorted(os.listdir(args.model))[-1]
        my_load_weights(model, "%s/%s" % (args.model, filename))

        pred_scores = compute_mymodel(space, targets, model, depmat, candidates)
        pred_scores = fix_pred_scores(pred_scores, candidates)
        if args.allvocab:
            allvocab_pred_scores = compute_mymodel_allwords(space, targets, model, depmat)
        modelname = args.model
        modelinfo = filename
    else:
        raise ValueError("Not given model or baseline to compute...")

    # make sure we're not guessing the target, or the empty vector
    pred_scores = fix_pred_scores(pred_scores, candidates)
    if args.allvocab:
        allvocab_pred_scores = fix_allvocab_pred_scores(allvocab_pred_scores, targets)
        allvocab_pred_scores = fix_lemma_problem(allvocab_pred_scores, targets, space)

    # compute evaluations
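    # gap is presumably the generalized average precision used in lexical
    # substitution; prec1/prec3 are precision at ranks 1 and 3 (p@1, p@3)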
    gaps = many_gaps(pred_scores, candidates, scores)
    prec1s = many_prec1(pred_scores, scores)
    prec3s = prec_at_k(pred_scores, scores, 3)
    # allvocab is slow; only compute that if we have to
    if args.allvocab:
        prec1s_av = many_prec1(allvocab_pred_scores, allvocab_scores)
        prec3s_av = prec_at_k(allvocab_pred_scores, allvocab_scores, 3)
    else:
        prec1s_av = np.zeros(len(targets))
        prec3s_av = np.zeros(len(targets))

    print ("%s\t%s\t%s\tgap %.3f\tp@1 %.3f\tp@3 %.3f\tp@1av %.3f\tp@3av %.3f" %
            (args.data, modelname, modelinfo,
             nanmean(gaps), nanmean(prec1s), nanmean(prec3s), nanmean(prec1s_av), nanmean(prec3s_av)))

    if args.save:
        with open(args.save, 'w') as f:
            f.write('\t'.join(['ident', 'target', 'sentence', 'gold', 'predicted', 'gap', 'p@1', 'p@3', 'p@1av', 'p@3av']))
            f.write('\n')
            for i in xrange(len(semeval.idents)):
                ident = semeval.idents[i]
                target = semeval.targets[i]
                parse = semeval.parses[i]
                scores_i = scores[i]
                pred_scores_i = pred_scores[i]
                candidates_i = candidates[i]
                gap = gaps[i]
                prec1 = prec1s[i]
                prec3 = prec3s[i]
                prec1av = prec1s_av[i]
                prec3av = prec3s_av[i]

                sentence = " ".join(t.word_normed for t in parse.tokens)

                score_string = " ".join("%s:%3.1f" % (space.vocab[c], s) for c, s in zip(candidates_i, scores_i) if c != 0)
                pred_string = " ".join("%s:%f" % (space.vocab[c], p) for c, p in revsorted(zip(candidates_i, pred_scores_i)) if c != 0)
                outline = '\t'.join([str(ident), target, sentence, score_string, pred_string, str(gap), str(prec1), str(prec3), str(prec1av), str(prec3av)])
                outline = unidecode(outline)
                f.write(outline)
                f.write('\n')
    if args.semeval:
        bestf = open(args.semeval + ".best", "w")
        bootf = open(args.semeval + ".boot", "w")
        bests = allvocab_pred_scores.argmax(axis=1)
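        # argpartition keeps the 10 highest-scoring substitutes per item (unordered)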
        boots = np.argpartition(allvocab_pred_scores, -10, 1)[:,-10:]
        for i in xrange(len(semeval.idents)):
            ident = semeval.idents[i]
            ot = semeval.original_targets[i]
            bestf.write("%s %d :: %s\n" % (ot, ident, space.vocab[bests[i]]))
            bootf.write("%s %d ::: %s\n" % (ot, ident, ";".join(space.vocab[boots[i]])))
        bestf.close()
        bootf.close()
Example #6
0
def main():
    parser = argparse.ArgumentParser(description='description')
    parser.add_argument('--input', '-i', default='-', help='Input corpus')
    parser.add_argument('--output', '-o', help='Output numpy file')
    parser.add_argument('--relations', '-r', help='Relations file')
    parser.add_argument('--space', '-s', help='Space filename')
    parser.add_argument('--mindeps', '-m', type=int, default=1,
                        help='Minimum number of attachments to store in matrix.')
    parser.add_argument('--maxrels', '-M', type=int, default=1000,
                        help='Maximum number of relationships to model.')
    args = parser.parse_args()

    space = load_numpy(args.space, insertblank=True)
    rels = read_relationships(args.relations, args.maxrels)

    targetids = []

    rowids = []
    colids = []
    datavals = []

    num_rows = 0
    num_skipped = 0
    num_overlap = 0
    num_rows_with_overlap = 0

    out_counter = 0

    rowid = 0
    for targetid, relcontexts in pull_items(args.input, space, rels):
        relcontexts_d = {}
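        # collapse duplicate relations, keeping the largest context value seen for each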
        for rel, ctx in relcontexts:
            relcontexts_d[rel] = max(relcontexts_d.get(rel, 0), ctx)

        if len(relcontexts_d) < args.mindeps:
            num_skipped += 1
            continue

        num_rows += 1
        overlap = len(relcontexts) - len(relcontexts_d)
        if overlap:
            num_overlap += overlap
            num_rows_with_overlap += 1

        for rel, ctx in relcontexts_d.iteritems():
            rowids.append(rowid)
            colids.append(rel)
            datavals.append(ctx)
        targetids.append(targetid)
        rowid += 1

        # magic number means ~25MB output files, while being able
        # to be broken into nice 128 row chunks
        # This is important so that we can keep the memory usage down low later
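        # (2097152 = 2**21 rows, i.e. exactly 16384 chunks of 128 rows)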
        if rowid >= 2097152:
            print "\nSaving chunk %06d" % out_counter
            targetoutputs = np.array(targetids, dtype=np.int32)
            output = scipy.sparse.csr_matrix((datavals, (rowids, colids)), dtype=np.int32)
            outputname = "%s/chunk_%04d.npz" % (args.output, out_counter)
            np.savez_compressed(outputname, targets=targetoutputs, contexts=output)
            del targetoutputs
            del output
            rowid = 0
            targetids = []
            rowids = []
            colids = []
            datavals = []
            out_counter += 1

    if targetids:
        targetoutputs = np.array(targetids, dtype=np.int32)
        output = scipy.sparse.csr_matrix((datavals, (rowids, colids)), dtype=np.int32)
        outputname = "%s/chunk_%06d.npz" % (args.output, out_counter)
        np.savez_compressed(outputname, targets=targetoutputs, contexts=output)

    print "Number of accepted rows:", num_rows
    print " Number of skipped rows:", num_skipped
    print "  Number of overlapping:", num_overlap
    print "  Number of rows w/ ovr:", num_rows_with_overlap