Exemple #1
0
def main():
    """Train the suffix tagger model and evaluate it on the development data.

    Pipeline:
      1. Initialise suffix feature weights from ``tag_train.dat``.
      2. Generate gold tag histories (``q5_gold``) and all enumerated
         histories (``q5_histories``) with ``tagger_history_generator.py``.
      3. Run the perceptron over the training data.
      4. Write the resulting weights to ``suffix_tagger.model``.
      5. Merge in the weights from ``tagmodel_weights()``, score the
         development histories, decode them and write ``q5_output``.

    Reads ``q4_histories``, which this function does not create, and
    deletes its own intermediate files (``q5_best``, ``q5_histories``,
    ``q5_weighted``, ``q5_gold``) at the end.
    """
    generator = ["python", "tagger_history_generator.py"]

    # Step 1: Read in training data, initialize dictionary of features.
    weights = init_suffix_weights("tag_train.dat")

    # Step 2: Get the gold tag histories using tagger_history_generator.py.
    # The files are closed as soon as the child process has finished so the
    # later remove() calls never hit a still-open handle.
    with open("q5_gold", "w") as gold, open("tag_train.dat", "r") as train_data:
        call(generator + ["GOLD"], stdout=gold, stdin=train_data)

    # Step 3: Enumerate all possible histories.
    with open("tag_train.dat", "r") as train_data, \
            open("q5_histories", "w") as histories:
        call(generator + ["ENUM"], stdout=histories, stdin=train_data)

    # Step 4: Run the perceptron.
    # NOTE(review): the original comment said "k=4 times" but the call passes
    # 5 -- confirm which value is intended. The return value is not used, so
    # perceptron() is expected to update `weights` in place.
    perceptron(weights, "tag_train.dat", "q5_histories", "q5_gold", 5)

    # Step 5: Write the final model out to suffix_tagger.model.
    # open() replaces the Python 2-only file() builtin; the with-block
    # guarantees the model is flushed to disk before anything else runs.
    with open("suffix_tagger.model", "w") as final_model:
        for key in weights:
            line = key + " " + str(weights[key])
            print(line, file=final_model)

    # Run model with suffix, tag and bigram features on development data.
    tag_weights = tagmodel_weights()
    weights.update(tag_weights)
    weighted_history("tag_dev.dat", "q4_histories", weights, "q5_weighted", True, True)
    with open("q5_best", "w") as best_tag, open("q5_weighted", "r") as weighted:
        call(["python", "tagger_decoder.py", "HISTORY"], stdout=best_tag, stdin=weighted)

    # Save file with word-tag combos.
    set_tags("tag_dev.dat", "q5_best", "q5_output")

    # Clean up intermediate files.
    remove("q5_best")
    remove("q5_histories")
    remove("q5_weighted")
    remove("q5_gold")
Exemple #2
0
def decode():
    """Tag the development data and write the result to ``q4_output``.

    Steps:
      1. Enumerate all possible histories for ``tag_dev.dat`` into
         ``q4_histories`` via ``tagger_history_generator.py ENUM``.
      2. Weight each history with the weights from ``tagmodel_weights()``
         and write them to ``q4_weighted``.
      3. Pipe the weighted histories into ``tagger_decoder.py HISTORY`` to
         obtain the highest scoring tagging (``q4_best``).
      4. Combine words and tags into ``q4_output``.

    ``q4_best`` and ``q4_weighted`` are deleted afterwards; ``q4_histories``
    is left on disk.
    """
    # Feature-string -> weight map (the original comment says it is read
    # from tag.model; the loading itself happens inside tagmodel_weights()).
    weights = tagmodel_weights()

    # 1. Enumerate all possible histories. Both handles are closed once the
    #    child process has finished writing.
    with open("q4_histories", "w") as histories, open("tag_dev.dat", "r") as data:
        call(["python", "tagger_history_generator.py", "ENUM"],
             stdout=histories, stdin=data)

    # 2. Compute the features for each history and assign it a weight.
    weighted_history("tag_dev.dat", "q4_histories", weights, "q4_weighted", True, False)

    # 3. Pipe the weighted histories into the decoder to compute the
    #    highest scoring tagging.
    with open("q4_best", "w") as best_tag, open("q4_weighted", "r") as weighted:
        call(["python", "tagger_decoder.py", "HISTORY"],
             stdout=best_tag, stdin=weighted)

    # 4. Save file with word-tag combos.
    set_tags("tag_dev.dat", "q4_best", "q4_output")

    # Intermediate files are closed by now, so removal is safe everywhere.
    remove("q4_best")
    remove("q4_weighted")
Exemple #3
0
def main():
    """Evaluate four hand-tuned weight variants on the development data.

    Starts from the combined weights of ``tagmodel_weights()`` and
    ``suffix_weights("suffix_tagger.model")`` and, for each variant, scores
    the histories in ``q4_histories``, decodes them with
    ``tagger_decoder.py HISTORY`` and writes the tagged output to
    ``q6_output_combo1`` .. ``q6_output_combo4``.

    Every variant works on its own copy of the base weights. The original
    code bound ``weights_1``/``weights_2``/``weights_3`` to the same dict
    object, so each experiment silently inherited all earlier modifications
    (and combo 4 applied the suffix boosts twice).

    Removes ``q6_best``, ``q6_weighted`` and ``q4_histories`` when done.
    """

    def run_combo(combo_weights, output_path, weigh):
        """Weight the dev histories with `weigh`, decode, and save the tags."""
        weigh("tag_dev.dat", "q4_histories", combo_weights, "q6_weighted", True, True)
        # Close both handles before set_tags() reads q6_best.
        with open("q6_best", "w") as best_tag, open("q6_weighted", "r") as weighted:
            call(["python", "tagger_decoder.py", "HISTORY"],
                 stdout=best_tag, stdin=weighted)
        # Save file with word-tag combos.
        set_tags("tag_dev.dat", "q6_best", output_path)

    # Suffix, tag and bigram feature weights produced by the earlier steps.
    weights = tagmodel_weights()
    weights.update(suffix_weights("suffix_tagger.model"))

    # Combo 1: Modify certain suffix rules ====================================
    weights_1 = weights.copy()
    # suffix "ly" is usually ADV; increase weight
    weights_1["SUFFIX:ly:ADV"] = float(weights["SUFFIX:ly:ADV"]) + 3
    # suffix "ed" is usually VERB; increase weight
    weights_1["SUFFIX:ed:VERB"] = float(weights["SUFFIX:ed:VERB"]) + 3
    # suffix "ing" is usually VERB; increase weight
    weights_1["SUFFIX:ing:VERB"] = float(weights["SUFFIX:ing:VERB"]) + 0.05
    run_combo(weights_1, "q6_output_combo1", weighted_history)

    # Combo 2: Modify one bigram rule =========================================
    weights_2 = weights.copy()
    # bigram "VERB VERB" is often wrong; decrease weight
    weights_2["BIGRAM:VERB:VERB"] = -0.5
    run_combo(weights_2, "q6_output_combo2", weighted_history)

    # Combo 3: Add content rules ==============================================
    weights_3 = weights.copy()
    # If a word has a hyphen, tag as ADJ
    weights_3["CONTAINS:HYPHEN:ADJ"] = 5
    # If word has digits, tag as NUM
    weights_3["CONTAINS:DIGIT:NUM"] = 5
    run_combo(weights_3, "q6_output_combo3", weighted_history2)

    # Combo 4: All together now ===============================================
    weights_4 = weights.copy()
    # suffix "ly" is usually ADV; increase weight
    weights_4["SUFFIX:ly:ADV"] = float(weights["SUFFIX:ly:ADV"]) + 3
    # suffix "ed" is usually VERB; increase weight
    weights_4["SUFFIX:ed:VERB"] = float(weights["SUFFIX:ed:VERB"]) + 3
    # The "ing" suffix boost and the "VERB VERB" bigram penalty are
    # deliberately left out of this combination.
    # If a word has a hyphen, tag as ADJ
    weights_4["CONTAINS:HYPHEN:ADJ"] = 5
    # If word has digits, tag as NUM
    weights_4["CONTAINS:DIGIT:NUM"] = 5
    run_combo(weights_4, "q6_output_combo4", weighted_history2)

    # Clean up intermediate files.
    remove("q6_best")
    remove("q6_weighted")
    remove("q4_histories")