Example #1
File: evaluate.py  Project: moreymat/attelo
def _load_harness_multipack(hconf, test_data=False):
    """Load the multipack for our current configuration.

    Load the stripped features file if we don't actually need to
    use the features (this would only make sense on the cluster
    where evaluation is broken up into separate stages that we
    can fire on different nodes)

    Parameters
    ----------
    test_data : bool, defaults to False
        If True, load the test data rather than the training data.

    Returns
    -------
    mpack : Multipack
        Multipack loaded from the harness' configuration.
    """
    stripped_paths = hconf.mpack_paths(test_data, stripped=True)
    if (hconf.runcfg.stage in [ClusterStage.end, ClusterStage.start]
            and fp.exists(stripped_paths['features'])):
        paths = stripped_paths
    else:
        paths = hconf.mpack_paths(test_data, stripped=False)
    mpack = load_multipack(
        paths['edu_input'],
        paths['pairings'],
        paths['features'],
        paths['vocab'],
        corpus_path=paths.get('corpus', None),  # WIP
        verbose=True)
    return mpack
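For orientation, here is a hedged sketch of the dict this example expects back from hconf.mpack_paths. The keys come from the lookups above; the values are hypothetical placeholders following the '.relations.sparse' suffix convention seen later in this listing.

paths = {
    'edu_input': 'corpus.relations.sparse.edu_input',
    'pairings': 'corpus.relations.sparse.pairings',
    'features': 'corpus.relations.sparse',   # stripped variant when stripped=True
    'vocab': 'corpus.relations.sparse.vocab',
    'corpus': 'path/to/corpus',              # optional; flagged WIP above
}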
Example #2
File: evaluate.py  Project: kowey/attelo
def _load_harness_multipack(hconf, test_data=False):
    """
    Load the multipack for our current configuration.

    Load the stripped features file if we don't actually need to
    use the features (this would only make sense on the cluster
    where evaluation is broken up into separate stages that we
    can fire on different nodes)

    Parameters
    ----------
    test_data: bool

    Returns
    -------
    mpack: Multipack
    """
    stripped_paths = hconf.mpack_paths(test_data, stripped=True)
    if (hconf.runcfg.stage in [ClusterStage.end, ClusterStage.start] and
            fp.exists(stripped_paths[2])):
        paths = stripped_paths
    else:
        paths = hconf.mpack_paths(test_data, stripped=False)
    return load_multipack(paths[0],
                          paths[1],
                          paths[2],
                          paths[3],
                          verbose=True)
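Comparing this fork with Example #1 suggests the tuple indices line up with the named keys used there. A hedged rewrite of the final call, with the correspondence inferred from the two call sites rather than from mpack_paths itself:

    edu_input, pairings, features, vocab = paths  # indices 0-3, in this order
    return load_multipack(edu_input, pairings, features, vocab, verbose=True)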
Example #3
File: util.py  Project: moreymat/attelo
def load_args_multipack(args):
    '''
    Load multipack specified via command line arguments
    '''
    return load_multipack(args.edus,
                          args.pairings,
                          args.features,
                          args.vocab,
                          verbose=not args.quiet)
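Example #3 only reads four path attributes and a quiet flag off args, so a minimal command-line front end could look like the following sketch. The argument names mirror the attribute accesses above; the program wiring and the attelo.io module path are assumptions.

import argparse

from attelo.io import load_multipack  # assumed module path


def load_args_multipack(args):
    '''As in Example #3 above.'''
    return load_multipack(args.edus, args.pairings, args.features,
                          args.vocab, verbose=not args.quiet)


psr = argparse.ArgumentParser(description='load a multipack')
psr.add_argument('edus', help='EDU input file')
psr.add_argument('pairings', help='candidate EDU pairings')
psr.add_argument('features', help='sparse feature file')
psr.add_argument('vocab', help='feature vocabulary')
psr.add_argument('--quiet', action='store_true')
mpack = load_args_multipack(psr.parse_args())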
Example #4
File: model.py  Project: moreymat/irit-stac
def _do_corpus(hconf):
    "Run evaluation on a corpus"
    paths = hconf.mpack_paths(test_data=False)
    if not fp.exists(paths[0]):
        exit_ungathered()
    mpack = load_multipack(paths[0], paths[1], paths[2], paths[3], verbose=True)
    dconf = DataConfig(pack=mpack, folds=None)
    # (re)learn combined model (we shouldn't assume
    # it's in some scratch directory)
    for econf in hconf.evaluations:
        learn(hconf, econf, dconf, None)
    _mk_dialogue_act_model(hconf)
Example #5
def decode(lconf, evaluations):
    "Decode the input using all the model/learner combos we know"

    fpath = minicorpus_path(lconf) + '.relations.sparse'
    vocab_path = lconf.mpack_paths(test_data=False)[3]
    mpack = load_multipack(fpath + '.edu_input',
                           fpath + '.pairings',
                           fpath,
                           vocab_path)
    decoder_jobs = concat_i(_get_decoding_jobs(mpack, lconf, econf)
                            for econf in evaluations)
    Parallel(n_jobs=lconf.runcfg.n_jobs, verbose=True)(decoder_jobs)
    for econf in evaluations:
        output_path = attelo_result_path(lconf, econf)
        ath_parse.concatenate_outputs(mpack, output_path)
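A hedged expansion of the file names involved, assuming minicorpus_path(lconf) returns something like 'tmp/minicorpus' (a hypothetical value):

fpath = 'tmp/minicorpus' + '.relations.sparse'
edu_input_path = fpath + '.edu_input'   # tmp/minicorpus.relations.sparse.edu_input
pairings_path = fpath + '.pairings'     # tmp/minicorpus.relations.sparse.pairings
features_path = fpath                   # the sparse feature file itself
# the vocabulary file is looked up separately via lconf.mpack_paths(test_data=False)[3]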
Example #6
def _do_corpus(hconf):
    "Run evaluation on a corpus"
    paths = hconf.mpack_paths(test_data=False)
    if not fp.exists(paths['edu_input']):
        exit_ungathered()
    mpack = load_multipack(paths['edu_input'],
                           paths['pairings'],
                           paths['features'],
                           paths['vocab'],
                           verbose=True)
    dconf = DataConfig(pack=mpack, folds=None)
    # (re)learn combined model (we shouldn't assume
    # it's in some scratch directory)
    for econf in hconf.evaluations:
        learn(hconf, econf, dconf, None)
    _mk_dialogue_act_model(hconf)
Example #7
def decode(lconf, evaluations):
    """Decode the input using all the model/learner combos we know.

    Parameters
    ----------
    lconf : ?
        TODO

    evaluations : iterable of ?
        TODO
    """

    fpath = minicorpus_path(lconf) + '.relations.sparse'
    vocab_path = lconf.mpack_paths(test_data=False)['vocab']
    mpack = load_multipack(fpath + '.edu_input', fpath + '.pairings', fpath,
                           vocab_path)
    decoder_jobs = concat_i(
        _get_decoding_jobs(mpack, lconf, econf) for econf in evaluations)
    Parallel(n_jobs=lconf.runcfg.n_jobs, verbose=True)(decoder_jobs)
    for econf in evaluations:
        output_path = attelo_result_path(lconf, econf)
        ath_parse.concatenate_outputs(mpack, output_path)
Example #8
def _load_harness_multipack(hconf, test_data=False):
    """
    Load the multipack for our current configuration.

    Load the stripped features file if we don't actually need to
    use the features (this would only make sense on the cluster
    where evaluation is broken up into separate stages that we
    can fire on different nodes)

    Parameters
    ----------
    test_data: bool

    Returns
    -------
    mpack: Multipack
    """
    stripped_paths = hconf.mpack_paths(test_data, stripped=True)
    if (hconf.runcfg.stage in [ClusterStage.end, ClusterStage.start]
            and fp.exists(stripped_paths[2])):
        paths = stripped_paths
    else:
        paths = hconf.mpack_paths(test_data, stripped=False)
    return load_multipack(paths[0], paths[1], paths[2], paths[3], verbose=True)
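The forks in this listing disagree on what mpack_paths returns: a 4-tuple here and in Example #2, a dict keyed by name in Example #1. A small hypothetical shim that tolerates both shapes:

_KEY_TO_INDEX = {'edu_input': 0, 'pairings': 1, 'features': 2, 'vocab': 3}


def mpack_path(paths, key):
    """Look up a multipack path in either the 4-tuple or the dict form."""
    if isinstance(paths, dict):
        return paths[key]
    return paths[_KEY_TO_INDEX[key]]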
Example #9
# Imports reconstructed for this excerpt; the attelo module paths are
# assumed from the main codebase and may differ across forks.
import os
from os import path as fp

from sklearn.linear_model import LogisticRegression

from attelo.decoding.mst import (MstDecoder, MstRootStrategy)
from attelo.fold import (make_n_fold)
from attelo.io import (load_multipack)
from attelo.learning import (SklearnAttachClassifier, SklearnLabelClassifier)
from attelo.parser.full import (JointPipeline)
from attelo.score import (score_edges)
from attelo.table import (DataPack)
from attelo.util import (mk_rng, Team)

# pylint: disable=invalid-name

WORKING_DIR = 'doc/example-corpus'
PREFIX = fp.join(WORKING_DIR, 'tiny')
TMP_OUTPUT = '/tmp/mini-evaluate'
if not fp.exists(TMP_OUTPUT):
    os.makedirs(TMP_OUTPUT)

# load the data
mpack = load_multipack(PREFIX + '.edus',
                       PREFIX + '.pairings',
                       PREFIX + '.features.sparse',
                       PREFIX + '.features.sparse.vocab',
                       verbose=True)

# divide the dataset into folds
num_folds = min(10, len(mpack))
fold_dict = make_n_fold(mpack, num_folds, mk_rng())

# select a decoder and a learner team
decoder = MstDecoder(root_strategy=MstRootStrategy.fake_root)
learners = Team(attach=SklearnAttachClassifier(LogisticRegression()),
                label=SklearnLabelClassifier(LogisticRegression()))

# put them together as a parser
parser = JointPipeline(learner_attach=learners.attach,
                       learner_label=learners.label,
                       decoder=decoder)  # decoder defined above
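A hedged illustration of how the fold assignment above might be consumed, assuming make_n_fold returns a mapping from grouping key to fold number and that the multipack behaves like a dict of DataPacks (both suggested by len(mpack) here and mpack.values() elsewhere in this listing):

TEST_FOLD = 0
test_keys = [k for k, fold in fold_dict.items() if fold == TEST_FOLD]
train_keys = [k for k, fold in fold_dict.items() if fold != TEST_FOLD]
test_pack = {k: mpack[k] for k in test_keys}    # held-out fold
train_pack = {k: mpack[k] for k in train_keys}  # remaining folds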
Example #10
def main(args):
    """
    Load the tiny corpus, extract features and labels, and learn the presence of semantic relations

    Parameters
    ----------
    args : not used yet (but necessary)
    """

    # load the data into a multipack
    mpackTrain = load_multipack(TRAIN_PREFIX + '.relations.sparse.edu_input',
                                TRAIN_PREFIX + '.relations.sparse.pairings',
                                TRAIN_PREFIX + '.relations.sparse',
                                TRAIN_PREFIX + '.relations.sparse.vocab',
                                verbose=True)

    mpackTest = load_multipack(TEST_PREFIX + '.relations.sparse.edu_input',
                               TEST_PREFIX + '.relations.sparse.pairings',
                               TEST_PREFIX + '.relations.sparse',
                               TEST_PREFIX + '.relations.sparse.vocab',
                               verbose=True)

    modelRidge = linear_model.Ridge
    modelLogistic = linear_model.LogisticRegression
    modelLinear = linear_model.LinearRegression
    modelRandomForestRegressor = ensemble.RandomForestRegressor
    modelGradientBoost = ensemble.GradientBoostingClassifier
    modelGradientBoostRegressor = ensemble.GradientBoostingRegressor

    decoderMST = MstDecoder(MstRootStrategy.fake_root, use_prob=False)
    decoder = ILPDecoder()

    label = collectTargetLabels(mpackTest)

    LAM = np.logspace(np.log10(6e-2), np.log10(.5), num=5)

    for lam in LAM:
        print("------------------------------------------------------")
        print("Lambda =", lam)
        # create learner
        learner = Learner(decoder=decoderMST, decoder2=decoder, Lambda=lam)

        s, w, dualGap = learner.fit(mpackTrain)
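        # (Inferred from how the values are used below: s collects per-iteration
        # scores, dualGap the duality gap, and w is the learned weight vector
        # that gets saved with np.savetxt.)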

        dpack5 = list(mpackTrain.values())[5]  # indexable copy (dict views cannot be indexed in Python 3)
        arbre = predict(learner, dpack5, w)
        print()

        print(arbre.graph[0])

        dpack2 = learner.multiply(dpack5, attach=1 * (dpack5.target != 2))

        targetpack = learner.decoder_.decode(dpack2)

        target = targetpack.graph[0]
        print(target)
        print(arbre.target)

        plt.plot(s, label="score")
        plt.plot(dualGap, label="duality gap")
        plt.plot(np.asarray(s) - np.asarray(dualGap), label="difference")

        plt.title("minimization")
        plt.xlabel('k')
        plt.ylabel('score')
        plt.grid(True)
        plt.legend(loc=1, borderaxespad=0.)

        plt.savefig(FIGURES_FOLDER + 'test'+str(round(lam, 5))+'.pdf')
        plt.close()

        np.savetxt('w_'+str(round(lam, 5))+'.out', w, delimiter=',')

        # test = np.loadtxt('test.out', delimiter=',')

        print()
        res = learner.predict_score(mpackTest)
        pred = []
        for graph in res:
            pred.extend(graph.graph[0] != 2)

        vote = []
        vote.append(1 * pred)

        print("f1 score =", round(metrics.f1_score(label, 1 * pred), 2))

        P = metrics.precision_score(label, pred)
        R = metrics.recall_score(label, pred)

        print("Precision =", round(P, 2))
        print("Recall =", round(R, 2))
        print()
        message("IRIT-STAC", "Travail termine")
    exit(1)

    """
    learner = SklearnAttachClassifier(modelLogistic())
    decoder = MstDecoder(MstRootStrategy.fake_root)
    parser1 = AttachPipeline(learner=learner,
                             decoder=decoder)

    # train the parser
    train_dpacks = mpackTrain.values()
    train_targets = [x.target for x in train_dpacks]
    parser1.fit(train_dpacks, train_targets)

    # now run on a test pack
    dpack = []
    for i in range(len(mpackTest.values())):
        dpack.extend(parser1.transform(mpackTest.values()[i]))
    # print_results(dpack)"""

    """
    # Clean features with high correlation or low variance
    featsTrain_filtered_1, featsTest_filtered_1 = cleanFeatures(featsTrain, featsTest,
                                                                fileName=FIGURES_FOLDER + 'minFilter.pdf')
    featsTrain_filtered_2, featsTest_filtered_2 = cleanFeatures(featsTrain, featsTest, maxCorrelation=0.98,
                                                                minVariance=0.005,
                                                                fileName=FIGURES_FOLDER + 'maxFilter.pdf')
    featsTrain_filtered_3, featsTest_filtered_3 = cleanFeatures(featsTrain, featsTest, maxCorrelation=0.97,
                                                                fileName=FIGURES_FOLDER + 'filter.pdf')"""

    # getAUPR(mpackTrain, mpackTest, modelRidge, 1e3, 1e4, 50, fileName='noFilter_ridge_2_50pts.pdf', folder=FIGURES_FOLDER)
    # exit(1)
    # getAUPR(mpackTrain, mpackTest, modelLogistic, 1e-3, 1e-1, 50, fileName='noFilter_3_50pts.pdf', folder=FIGURES_FOLDER, param='C', proba=True)
    # exit(1)


    ########################################################################
    with Torpor("Learning and Scoring Linear Regression Ridge"):
        learner = Learner(learner=modelRidge, alpha=3500)
        learner.fit(mpackTrain)

        scores = learner.predict_score(mpackTest)

        label = collectTargetLabels(mpackTest)

        precision, recall, seuil = metrics.precision_recall_curve(label, scores)

        f1_score = [0]
        for s in seuil:
            f1 = metrics.f1_score(label, 1 * (scores > s))
            f1_score.append(f1)

        precision = [x for (y, x) in sorted(zip(recall, precision))]
        f1_score = [x for (y, x) in sorted(zip(recall, f1_score))]
        recall.sort()
        # plt.plot(recall, precision)
        area = metrics.auc(recall, precision)
        print("Area =", area)

        plt.plot(recall, precision, label="PR")
        plt.plot(recall, f1_score, label="F1 score")
        precision = np.asarray(precision)
        recall = np.asarray(recall)

        print("max f1_score =", max(f1_score))
        print("max precision =", precision[np.argwhere(f1_score == max(f1_score))])
        print("max recall =", recall[np.argwhere(f1_score == max(f1_score))])

        res = learner.predict_score(mpackTest, decoder=decoder)
        pred = []
        for graph in res:
            pred.extend(graph.graph[0] != 2)

        vote = []
        vote.append(1 * pred)

        plt.title("Linear Regression Ridge AUPR=0.41\n F1 score with decoder :" + str(
            round(metrics.f1_score(label, 1 * pred), 2)))
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.grid(True)

        P = metrics.precision_score(label, pred)
        R = metrics.recall_score(label, pred)

        R_not_decoded = min(recall, key=lambda x: abs(x - R))
        P_not_decoded = max(precision[np.argwhere(recall == R_not_decoded)])
        print("Precision =", P)
        print("Recall =", R)
        print("Precision before decoding =", P_not_decoded)

    plt.plot(R, P, 'ro')

    plt.annotate('with decoder', xy=(R + .005, P + .005), xytext=(.8, .7),
                 arrowprops=dict(facecolor='black', shrink=0.02))

    plt.legend(loc=1, borderaxespad=0.)

    plt.savefig(FIGURES_FOLDER + "Ridge.pdf")
    plt.close()

    ##########################################################################################
    with Torpor("Learning and Scoring Linear Regression"):
        learner._instantiateLearner(modelLinear)
        learner.fit(mpackTrain)

        scores = learner.predict_score(mpackTest)

        label = collectTargetLabels(mpackTest)

        precision, recall, seuil = metrics.precision_recall_curve(label, scores)

        f1_score = [0]
        for s in seuil:
            f1_score.extend([metrics.f1_score(label, 1 * (scores > s))])

        precision = [x for (y, x) in sorted(zip(recall, precision))]
        f1_score = [x for (y, x) in sorted(zip(recall, f1_score))]
        recall.sort()
        # plt.plot(recall, precision)
        area = metrics.auc(recall, precision)
        print("Area =", area)

        plt.plot(recall, precision, label="PR")
        plt.plot(recall, f1_score, label="F1 score")
        precision = np.asarray(precision)
        recall = np.asarray(recall)

        print("max f1_score =", max(f1_score))
        print("max precision =", precision[np.argwhere(f1_score == max(f1_score))])
        print("max recall =", recall[np.argwhere(f1_score == max(f1_score))])

        res = learner.predict_score(mpackTest, decoder=decoder)
        pred = []

        for graph in res:
            pred.extend(graph.graph[0] != 2)

        vote.append(1 * pred)

        plt.title(
            "Linear Regression AUPR=0.33\n F1 score with decoder :" + str(round(metrics.f1_score(label, 1 * pred), 2)))
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.grid(True)

    P = metrics.precision_score(label, pred)
    R = metrics.recall_score(label, pred)

    R_not_decoded = min(recall, key=lambda x: abs(x - R))
    P_not_decoded = max(precision[np.argwhere(recall == R_not_decoded)])
    print("Precision =", P)
    print("Recall =", R)
    print("Precision before decoding =", P_not_decoded)
    plt.plot(R, P, 'ro')

    plt.annotate('with decoder', xy=(R + .005, P + .005), xytext=(.8, .7),
                 arrowprops=dict(facecolor='black', shrink=0.02))

    plt.legend(loc=1, borderaxespad=0.)
    plt.savefig(FIGURES_FOLDER + "RegLinear.pdf")
    plt.close()

    #############################################################################################
    with Torpor("Learning and Scoring Logistic Regression"):
        learner = Learner(learner=modelLogistic)
        learner.fit(mpackTrain)

        scores = learner.predict_score(mpackTest, proba=True)

        label = collectTargetLabels(mpackTest)

        precision, recall, seuil = metrics.precision_recall_curve(label, scores)

        f1_score = [0]
        for s in seuil:
            f1_score.extend([metrics.f1_score(label, 1 * (scores > s))])

        precision = [x for (y, x) in sorted(zip(recall, precision))]
        f1_score = [x for (y, x) in sorted(zip(recall, f1_score))]
        recall.sort()
        # plt.plot(recall, precision)
        area = metrics.auc(recall, precision)
        print("Area =", area)

        plt.plot(recall, precision, label="PR")
        plt.plot(recall, f1_score, label="F1 score")
        precision = np.asarray(precision)
        recall = np.asarray(recall)

        print("max f1_score =", max(f1_score))
        print("max precision =", precision[np.argwhere(f1_score == max(f1_score))])
        print("max recall =", recall[np.argwhere(f1_score == max(f1_score))])

        res = learner.predict_score(mpackTest, decoder=decoderMST, proba=True)
        pred = []

        for graph in res:
            pred.extend(graph.graph[0] != 2)

        vote.append(1 * pred)

        plt.title(
            "Logistic Regression AUPR=0.33\n F1 score with decoder :" + str(
                round(metrics.f1_score(label, 1 * pred), 2)))
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.grid(True)

    P = metrics.precision_score(label, pred)
    R = metrics.recall_score(label, pred)

    R_not_decoded = min(recall, key=lambda x: abs(x - R))
    P_not_decoded = max(precision[np.argwhere(recall == R_not_decoded)])
    print("Precision =", P)
    print("Recall =", R)
    print("Precision before decoding =", P_not_decoded)
    plt.plot(R, P, 'ro')

    plt.annotate('with decoder', xy=(R + .005, P + .005), xytext=(.8, .7),
                 arrowprops=dict(facecolor='black', shrink=0.02))

    plt.legend(loc=1, borderaxespad=0.)
    plt.savefig(FIGURES_FOLDER + "RegLogistic.pdf")
    plt.close()

    #########################################################################################
    with Torpor("Learning and Scoring Gradient Boost Regression"):
        learner._instantiateLearner(modelGradientBoostRegressor)
        learner.fit(mpackTrain)

        scores = learner.predict_score(mpackTest, dense=True)

        precision, recall, seuil = metrics.precision_recall_curve(label, scores)

        f1_score = [0]
        for s in seuil:
            f1_score.extend([metrics.f1_score(label, 1 * (scores > s))])

        precision = [x for (y, x) in sorted(zip(recall, precision))]
        f1_score = [x for (y, x) in sorted(zip(recall, f1_score))]
        recall.sort()
        # plt.plot(recall, precision)
        area = metrics.auc(recall, precision)
        print("Area =", area)

        plt.plot(recall, precision, label="PR")
        plt.plot(recall, f1_score, label="F1 score")
        precision = np.asarray(precision)
        recall = np.asarray(recall)

        print("max f1_score =", max(f1_score))
        print("max precision =", precision[np.argwhere(f1_score == max(f1_score))])
        print("max recall =", recall[np.argwhere(f1_score == max(f1_score))])

        res = learner.predict_score(mpackTest, decoder=decoder, dense=True)
        pred = []

        for graph in res:
            pred.extend(graph.graph[0] != 2)

        vote.append(1 * pred)

        plt.title("Gradient Boost Regression AUPR=0.69\n F1 score with decoder :" + str(
            round(metrics.f1_score(label, 1 * pred), 2)))
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.grid(True)

    P = metrics.precision_score(label, pred)
    R = metrics.recall_score(label, pred)

    R_not_decoded = min(recall, key=lambda x: abs(x - R))
    P_not_decoded = max(precision[np.argwhere(recall == R_not_decoded)])
    print("Precision =", P)
    print("Recall =", R)
    print("Precision before decoding =", P_not_decoded)
    plt.plot(R, P, 'ro')

    plt.annotate('with decoder', xy=(R + .005, P + .005), xytext=(.8, .7),
                 arrowprops=dict(facecolor='black', shrink=0.02))

    plt.legend(loc=1, borderaxespad=0.)
    plt.savefig(FIGURES_FOLDER + "GradientBoost.pdf")
    plt.close()

    ####################################################################################################
    with Torpor("Learning and Scoring Random Forest Regression"):
        learner = Learner(learner=modelRandomForestRegressor, n_estimators=20)
        learner.fit(mpackTrain)

        scores = learner.predict_score(mpackTest, dense=True)

        label = collectTargetLabels(mpackTest)

        precision, recall, seuil = metrics.precision_recall_curve(label, scores)

        print("len(seuil) =", len(seuil))
        print("seuil =", seuil)

        f1_score = [0]
        for s in seuil:
            f1_score.append(metrics.f1_score(label, 1 * (scores > s)))

        precision = [x for (y, x) in sorted(zip(recall, precision))]
        f1_score = [x for (y, x) in sorted(zip(recall, f1_score))]
        recall.sort()
        # plt.plot(recall, precision)
        area = metrics.auc(recall, precision)
        print("Area =", area)

        plt.plot(recall, precision, label="PR")
        plt.plot(recall, f1_score, label="F1 score")
        precision = np.asarray(precision)
        recall = np.asarray(recall)

        print("max f1_score =", max(f1_score))
        print("max precision =", precision[np.argwhere(f1_score == max(f1_score))])
        print("max recall =", recall[np.argwhere(f1_score == max(f1_score))])

        res = learner.predict_score(mpackTest, decoder=decoderMST, dense=True)
        pred = []

        for graph in res:
            pred.extend(graph.graph[0] != 2)

        vote.append(1 * pred)

        plt.title("Random Forest Regression AUPR=0.72\n F1 score with decoder :" + str(
            round(metrics.f1_score(label, 1 * pred), 2)))
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.grid(True)

    P = metrics.precision_score(label, pred)
    R = metrics.recall_score(label, pred)

    R_not_decoded = min(recall, key=lambda x: abs(x - R))
    P_not_decoded = max(precision[np.argwhere(recall == R_not_decoded)])
    print("Precision =", P)
    print("Recall =", R)
    print("Precision before decoding =", P_not_decoded)
    plt.plot(R, P, 'ro')

    plt.annotate('with decoder', xy=(R + .005, P + .005), xytext=(.8, .7),
                 arrowprops=dict(facecolor='black', shrink=0.02))

    plt.legend(loc=1, borderaxespad=0.)
    plt.savefig(FIGURES_FOLDER + "RandomForest.pdf")
    plt.close()
    # print("res =", res)

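    # Threshold vote: predict a relation when at least 40% of the collected
    # model votes are positive (0.4 cutoff on the mean over `vote`).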
    pred = 1 * (np.mean(vote, axis=0) >= 0.4)

    print("Vote majorite\nf1_score =", round(metrics.f1_score(label, 1 * pred), 2))
    P = metrics.precision_score(label, pred)
    R = metrics.recall_score(label, pred)
    print("Precision =", P)
    print("Recall =", R)

    message("IRIT-STAC", "Travail termine")

    """