Example 1
def sub_run(sub_train_path, sub_model_path, raw_test_path, sub_tag_path, test_corpus, sub_corpus):
    # Next, train the tagger on this training slice.
    stanford_tagger.train(sub_train_path, sub_model_path)

    # Now, run it.
    stanford_tagger.test(raw_test_path, sub_model_path, sub_tag_path)

    # Total number of tokens in the training slice; this becomes the x value of the curve.
    num_tokens = sum(len(sent) for sent in sub_corpus)

    # Load the result of the tagging...
    result_corpus = POSCorpus.read_slashtags(sub_tag_path)

    acc = pos_eval.poseval(result_corpus, test_corpus)
    return (num_tokens, acc)
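
Here, stanford_tagger, POSCorpus, and pos_eval are helpers from the surrounding project. The poseval call presumably returns a token-level tagging accuracy; a minimal sketch of that kind of comparison, assuming each corpus is a sequence of sentences made of (token, tag) pairs, might look like the following (token_accuracy is an illustrative name, not part of the project):

def token_accuracy(predicted_corpus, gold_corpus):
    # Compare predicted and gold tags position by position over aligned sentences.
    correct = 0
    total = 0
    for pred_sent, gold_sent in zip(predicted_corpus, gold_corpus):
        for (_, pred_tag), (_, gold_tag) in zip(pred_sent, gold_sent):
            total += 1
            if pred_tag == gold_tag:
                correct += 1
    return correct / total if total else 0.0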
Example 2
def full_run(c):
    # Number of sentences by which to grow the training set at each step.
    step_increment = 50

    curve_dir = os.path.abspath(writedir(c["curve_dir"]))
    train_path = c["train_path"]
    test_path = c["test_path"]

    train_corpus = POSCorpus.read_slashtags(train_path)
    test_corpus = POSCorpus.read_slashtags(test_path)

    # Let's go ahead and strip the tags from the test corpus.
    raw_test_name = "test_data.txt"
    raw_test_path = os.path.join(curve_dir, raw_test_name)

    test_corpus.write(raw_test_name, "raw", outdir=curve_dir)

    # Now, let's add step_increment sentences at a time until the whole training corpus is used.
    sent_limit = 0
    p = Pool(8)

    results = {}

    while sent_limit < len(train_corpus):

        # Adding step_increment to the limit before slicing means the final,
        # partially-filled batch of sentences is included as well.
        actual_limit = sent_limit + step_increment
        sub_corpus = POSCorpus(train_corpus[0:actual_limit])

        # Let's make the x values the number of tokens instead of sentences...

        # Let's create the necessary filenames.
        sub_train_path = os.path.join(curve_dir, "%d_train.txt" % actual_limit)
        sub_model_path = os.path.join(curve_dir, "%d_train.model" % actual_limit)
        sub_tag_path = os.path.join(curve_dir, "%d_tagged.txt" % actual_limit)

        # Get the number of tokens in the corpus for our x axis...
        num_tokens = sum(len(sent) for sent in sub_corpus)

        sub_corpus.write(os.path.basename(sub_train_path), "slashtags", outdir=curve_dir)

        # Dispatch this slice to a worker process; apply_async's callback runs in
        # the parent process, so updating the shared results dict there is safe.
        p.apply_async(
            sub_run,
            args=[sub_train_path, sub_model_path, raw_test_path, sub_tag_path, test_corpus, sub_corpus],
            callback=lambda x: results.update({x[0]: x[1]}),
        )

        # Now, increase the sentence limit
        sent_limit += step_increment

    p.close()
    p.join()

    # Also, define where we will store the curve points.
    curve_points = "curve_data.txt"

    # Write one "<num_tokens>,<accuracy>" pair per line, sorted by training size.
    with open(os.path.join(curve_dir, curve_points), "w") as curve_f:
        for size, acc in sorted(results.items()):
            curve_f.write("%d,%.2f\n" % (size, acc))
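
After full_run completes, curve_data.txt holds one "<num_tokens>,<accuracy>" pair per line. A minimal sketch for plotting the resulting learning curve, assuming matplotlib is available (plot_curve and the output file name are illustrative, not part of the original code):

import matplotlib.pyplot as plt

def plot_curve(curve_path):
    # Read the "<num_tokens>,<accuracy>" pairs written by full_run.
    sizes, accs = [], []
    with open(curve_path) as f:
        for line in f:
            size, acc = line.strip().split(",")
            sizes.append(int(size))
            accs.append(float(acc))

    # Plot accuracy against training-set size in tokens.
    plt.plot(sizes, accs, marker="o")
    plt.xlabel("Training tokens")
    plt.ylabel("Tagging accuracy")
    plt.savefig("curve.png")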