Example #1
def _train_and_decode(HOME):
    random.seed(4)  # fix the random seed
    vsm = EMBEDDINGS_LEVY_DEPS_300  # vector space model to use
    lexicon = LEXICON_FULL_BRACKETS_FIX  # lexicon to use (mind the all_unknown setting!)
    multiword_averaging = False  # treatment of multiword predicates, false - use head embedding, true - use avg
    all_unknown = False  # makes the lexicon treat all LU as unknown, corresponds to the no-lex setting
    conf = Config(SharingDNNClassifier, SentenceBowMapper, lexicon, vsm,
                  multiword_averaging, all_unknown, None, None, None)

    print("Starting resource manager")
    sources = ResourceManager(HOME)

    print("Running the experiments!")
    g_train = get_graphs(*sources.get_corpus(CORPUS_DAS_TRAIN))
    lexicon = Lexicon()
    # go to configuration, check which lexicon is needed, locate the lexicon in FS, load the lexicon
    lexicon.load_from_list(sources.get_lexicon(conf.get_lexicon()))

    # same for VSM
    vsm = VSM(sources.get_vsm(conf.get_vsm()))
    mapper = conf.get_feat_extractor()(vsm, lexicon)

    # prepare the data
    X_train, y_train, lemmapos_train, gid_train = mapper.get_matrix(g_train)

    # train the model
    clf = conf.get_clf()(lexicon, conf.get_all_unknown(),
                         conf.get_num_components(), conf.get_max_sampled(),
                         conf.get_num_epochs())
    clf.train(X_train, y_train, lemmapos_train)

    # prepare test data
    g_test = get_graphs(*sources.get_corpus(CORPUS_DAS_TEST))
    X_test, y_test, lemmapos_test, gid_test = mapper.get_matrix(g_test)

    # predict and compare
    with open(os.path.join(HOME, 'test.frames.predicted'), 'w') as output_stream:
        for x, y_true, lemmapos, gid, g in zip(X_test, y_test, lemmapos_test,
                                               gid_test, g_test):
            y_predicted = clf.predict(x, lemmapos)
            print(lexicon.get_frame(y_predicted), file=output_stream)
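A brief usage note: the function above is self-contained apart from module-level imports and a resource root. The sketch below shows a hypothetical driver; the HOME path and its layout (the corpora, lexicon, and embedding files that ResourceManager(HOME) resolves) are assumptions, as are the module-level os and random imports in the source file.

import os

if __name__ == "__main__":
    # Hypothetical resource root: ResourceManager(HOME) is expected to resolve
    # the corpora, lexicon file, and embeddings referenced in the example.
    HOME = os.path.expanduser("~/frameid-experiments")  # assumed layout
    _train_and_decode(HOME)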
Example #2
def _train_all(HOME):
    random.seed(4)  # fix the random seed

    vsms = [EMBEDDINGS_LEVY_DEPS_300]  # vector space model to use
    lexicons = [LEXICON_FULL_BRACKETS_FIX]  # lexicon to use (mind the all_unknown setting!)
    multiword_averaging = [False]  # treatment of multiword predicates, false - use head embedding, true - use avg
    all_unknown = [False, True]  # makes the lexicon treat all LU as unknown, corresponds to the no-lex setting

    # WSABIE params
    num_components = [1500]
    max_sampled = [10]  # maximum number of negative samples drawn during WARP ('warp' loss) fitting
    num_epochs = [500]

    configs = []
    for lexicon in lexicons:
        for all_unk in all_unknown:
            # DummyMapper doesn't do anything
            configs += [Config(DataMajorityBaseline, DummyMapper, lexicon, None, False, all_unk, None, None, None)]
            configs += [Config(LexiconMajorityBaseline, DummyMapper, lexicon, None, False, all_unk, None, None, None)]

    # Add configurations for NN classifiers
    for lexicon in lexicons:
        for vsm in vsms:
            for mwa in multiword_averaging:
                for all_unk in all_unknown:
                    configs += [Config(SharingDNNClassifier, SentenceBowMapper, lexicon, vsm, mwa, all_unk, None, None, None)]
                    configs += [Config(SharingDNNClassifier, DependentsBowMapper, lexicon, vsm, mwa, all_unk, None, None, None)]

    # Add configurations for WSABIE classifiers
    for lexicon in lexicons:
        for vsm in vsms:
            for mwa in multiword_averaging:
                for all_unk in all_unknown:
                    for num_comp in num_components:
                        for max_sampl in max_sampled:
                            for num_ep in num_epochs:
                                configs += [Config(WsabieClassifier, SentenceBowMapper, lexicon, vsm, mwa, all_unk, num_comp, max_sampl, num_ep)]
                                configs += [Config(WsabieClassifier, DependentsBowMapper, lexicon, vsm, mwa, all_unk, num_comp, max_sampl, num_ep)]

    print("Starting resource manager")
    sources = ResourceManager(HOME)

    print("Initializing reporters")
    reports = ReportManager(sources.out)

    print("Running the experiments!")
    runs = len(configs)*len(CORPORA_TRAIN)*len(CORPORA_TEST)
    print(len(configs), "configurations,", len(CORPORA_TRAIN) * len(CORPORA_TEST),
          "train-test pairs ->", runs, "runs")

    current_train = 0
    current_config = 0
    current_test = 0
    for corpus_train in CORPORA_TRAIN:
        current_train += 1
        current_config = 0

        g_train = get_graphs(*sources.get_corpus(corpus_train))
        reports.conll_reporter_train.report(g_train)

        for conf in configs:
            current_config += 1
            start_time = time.time()

            lexicon = Lexicon()
            # go to configuration, check which lexicon is needed, locate the lexicon in FS, load the lexicon
            lexicon.load_from_list(sources.get_lexicon(conf.get_lexicon()))
            reports.lexicon_reporter.report(lexicon)

            # same for VSM
            vsm = VSM(sources.get_vsm(conf.get_vsm()))
            mapper = conf.get_feat_extractor()(vsm, lexicon)

            # prepare the data
            X_train, y_train, lemmapos_train, gid_train = mapper.get_matrix(g_train)

            # train the model
            clf = conf.get_clf()(lexicon, conf.get_all_unknown(), conf.get_num_components(), conf.get_max_sampled(),
                                 conf.get_num_epochs())
            clf.train(X_train, y_train, lemmapos_train)

            current_test = 0
            for corpus_test in CORPORA_TEST:
                score = Score()  # storage for scores
                score_v = Score()  # storage for verb-only scores
                score_known = Score()  # storage for known lemma-only scores

                start_time = time.time()

                reports.set_config(conf, corpus_train, corpus_test)

                current_test += 1

                # prepare test data
                g_test = get_graphs(*sources.get_corpus(corpus_test))
                reports.conll_reporter_test.report(g_test)
                X_test, y_test, lemmapos_test, gid_test = mapper.get_matrix(g_test)

                # predict and compare
                for x, y_true, lemmapos, gid, g in zip(X_test, y_test, lemmapos_test, gid_test, g_test):
                    y_predicted = clf.predict(x, lemmapos)
                    correct = y_true == y_predicted

                    score.consume(correct, lexicon.is_ambiguous(lemmapos), lexicon.is_unknown(lemmapos), y_true)
                    if lemmapos.endswith(".v"):
                        score_v.consume(correct, lexicon.is_ambiguous(lemmapos), lexicon.is_unknown(lemmapos), y_true)
                    if not lexicon.is_unknown(lemmapos):
                        score_known.consume(correct, lexicon.is_ambiguous(lemmapos), lexicon.is_unknown(lemmapos), y_true)

                    reports.result_reporter.report(gid, g, lemmapos, y_predicted, y_true, lexicon)
                reports.summary_reporter.report(corpus_train, corpus_test, conf, score, time.time() - start_time)
                reports.summary_reporter_v.report(corpus_train, corpus_test, conf, score_v, time.time() - start_time)
                reports.summary_reporter_known.report(corpus_train, corpus_test, conf, score_known, time.time() - start_time)

                print ("============ STATUS: - train", current_train, "/", len(CORPORA_TRAIN),
                    "conf", current_config, "/", len(configs),
                    "test", current_test, "/", len(CORPORA_TEST))