Example #1
from subprocess import call

def perceptron(weights, data_name, histories_name, gold_name, k):
    for i in range(k):
        # 1. Use model [weights] to assign a weight to each history.
        weighted_history(data_name,histories_name,weights,"q5_weighted",False,True)
        # 2. Find the histories of the highest-scoring tagging using tagger_decoder.py.
        tags = open("q5_best", "w")
        weighted = open("q5_weighted", "r")
        call(["python", "tagger_decoder.py", "HISTORY"], stdout=tags, stdin=weighted)
        # 3. Compare highest-scoring tagging with gold standard, update weights;
        # Update rule; increase accurate features +1, inaccurate -1.
        data = open(data_name,"r")
        gold = open(gold_name,"r")
        tags = open("q5_best","r")
        for line in data:
            if line != '\n':
                word = line.strip().split()[0]
                tag = tags.readline().strip().split()[2]
                gold_tag = gold.readline().strip().split()[2]
                increase = (tag == gold_tag)
                update_suffix_weights(word, tag, weights, increase)
            else:
                # sentence boundary: skip two lines in the decoder output and
                # one in the gold histories so the three files stay aligned
                tag = tags.readline()
                tag = tags.readline()
                gold_tag = gold.readline()
        data.close()
        gold.close()
        tags.close()
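
The update helper called above is not shown here. Below is a minimal sketch of what update_suffix_weights(word, tag, weights, increase) might do, following the update rule in the comment (+1 for a correct tagging, -1 otherwise) and assuming suffix features of lengths 1 to 3 keyed as SUFFIX:<suffix>:<tag> (the key format that appears in Example #4); the real helper may differ.

def update_suffix_weights(word, tag, weights, increase):
    # Sketch only: nudge each suffix feature of (word, tag) up or down by 1.
    # Suffix lengths 1-3 are an assumption.
    delta = 1 if increase else -1
    for n in range(1, 4):
        if len(word) >= n:
            key = "SUFFIX:" + word[-n:] + ":" + tag
            weights[key] = float(weights.get(key, 0)) + delta
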
Example #2
from subprocess import call
from os import remove

def main():
    # Step 1: Read in training data, initialize dictionary of features with weight 0.
    weights = init_suffix_weights("tag_train.dat")
    # Step 2: Get the gold tag histories using tagger history generator.py
    gold = open("q5_gold", "w")
    train_data = open("tag_train.dat", "r")
    call(["python", "tagger_history_generator.py", "GOLD"], stdout=gold, stdin=train_data)
    # Step 3: Enumerate all possible histories
    train_data = open("tag_train.dat", "r")
    histories = open("q5_histories", "w")
    call(["python", "tagger_history_generator.py", "ENUM"], stdout=histories, stdin=train_data)
    # Step 4: Run the perceptron for k=5 iterations.
    perceptron(weights, "tag_train.dat", "q5_histories", "q5_gold", 5)
    # Step 5: Write the final model out to suffix_tagger.model.
    final_model = open("suffix_tagger.model", "w")
    for key in weights:
        line = key + " " + str(weights[key])
        print(line, file=final_model)
    final_model.close()

    # Run model with suffix, tag and bigram features on development data
    tag_weights = tagmodel_weights()
    weights.update(tag_weights)
    weighted_history("tag_dev.dat","q4_histories",weights,"q5_weighted",True,True)
    best_tag = open("q5_best", "w")
    weighted = open("q5_weighted", "r")
    call(["python", "tagger_decoder.py", "HISTORY"], stdout=best_tag, stdin=weighted)
    # Save file with word-tag combos
    set_tags("tag_dev.dat","q5_best","q5_output")
    remove("q5_best")
    remove("q5_histories")
    remove("q5_weighted")
    remove("q5_gold")
Example #3
from subprocess import call
from os import remove

def decode():
    # read tag.model into a map from feature strings to weights.
    weights = tagmodel_weights()
    # For each sentence in development data (Steps 1-4):
    # 1. Enumerate all possible histories
    histories = open("q4_histories", "w")
    data = open("tag_dev.dat", "r")
    call(["python", "tagger_history_generator.py", "ENUM"], stdout=histories, stdin=data)
    # 2. Compute the features for each history and use tag.model to assign a weight to each history
    weighted_history("tag_dev.dat","q4_histories",weights,"q4_weighted",True,False)
    # 3. Call tagger_decoder.py HISTORY and pipe in the weighted histories to compute the highest scoring tagging.
    best_tag = open("q4_best", "w")
    weighted = open("q4_weighted", "r")
    call(["python", "tagger_decoder.py", "HISTORY"], stdout=best_tag, stdin=weighted)
    # 4. Save file with word-tag combos
    set_tags("tag_dev.dat","q4_best","q4_output")
    remove("q4_best")
    remove("q4_weighted")
Example #4
from subprocess import call
from os import remove

def main():
    # Get the suffix, tag and bigram feature vectors generated in question4.py and question5.py
    weights = tagmodel_weights()
    weights.update(suffix_weights("suffix_tagger.model"))

    # Combo 1: Modify certain suffix rules: ============================================
    weights_1 = dict(weights)  # copy, so the other combos start from the unmodified weights
    # suffix "ly" is usually ADV; increase weight
    weights_1["SUFFIX:ly:ADV"] = float(weights["SUFFIX:ly:ADV"]) + 3
    # suffix "ed" is usually VERB; increase weight
    weights_1["SUFFIX:ed:VERB"] = float(weights["SUFFIX:ed:VERB"]) + 3
    # suffix "ing" is usually VERB; increase weight
    weights_1["SUFFIX:ing:VERB"] = float(weights["SUFFIX:ing:VERB"]) + 0.05
    # Run model with suffix, tag and bigram features on development data
    weighted_history("tag_dev.dat","q4_histories",weights_1,"q6_weighted",True,True)
    best_tag = open("q6_best", "w")
    weighted = open("q6_weighted", "r")
    call(["python", "tagger_decoder.py", "HISTORY"], stdout=best_tag, stdin=weighted)
    # Save file with word-tag combos
    set_tags("tag_dev.dat","q6_best","q6_output_combo1")

    # Combo 2: Modify one bigram rule ==================================================
    weights_2 = dict(weights)  # copy, as above
    # bigram "VERB VERB" is often wrong; decrease weight
    weights_2["BIGRAM:VERB:VERB"] = -0.5
    # Run model with suffix, tag and bigram features on development data
    weighted_history("tag_dev.dat","q4_histories",weights_2,"q6_weighted",True,True)
    best_tag = open("q6_best", "w")
    weighted = open("q6_weighted", "r")
    call(["python", "tagger_decoder.py", "HISTORY"], stdout=best_tag, stdin=weighted)
    # Save file with word-tag combos
    set_tags("tag_dev.dat","q6_best","q6_output_combo2")


    # Combo 3: Add content rules: =====================================================
    weights_3 = dict(weights)  # copy, as above
    # If a word has a hyphen, tag as ADJ
    weights_3["CONTAINS:HYPHEN:ADJ"] = 5
    # If word has digits, tag as NUM
    weights_3["CONTAINS:DIGIT:NUM"] = 5
    weighted_history2("tag_dev.dat","q4_histories",weights_3,"q6_weighted",True,True)
    best_tag = open("q6_best", "w")
    weighted = open("q6_weighted", "r")
    call(["python", "tagger_decoder.py", "HISTORY"], stdout=best_tag, stdin=weighted)
    # Save file with word-tag combos
    set_tags("tag_dev.dat","q6_best","q6_output_combo3")
    remove("q6_best")

    # Combo 4: All together now: =====================================================
    # suffix "ly" is usually ADV; increase weight
    weights["SUFFIX:ly:ADV"] = float(weights["SUFFIX:ly:ADV"]) + 3
    # suffix "ed" is usually VERB; increase weight
    weights["SUFFIX:ed:VERB"] = float(weights["SUFFIX:ed:VERB"]) + 3
    # The SUFFIX:ing:VERB boost gave no benefit, so it is omitted here
    # The BIGRAM:VERB:VERB penalty from Combo 2 is likewise left disabled:
    # weights["BIGRAM:VERB:VERB"] = -0.5
    # If a word has a hyphen, tag as ADJ
    weights["CONTAINS:HYPHEN:ADJ"] = 5
    # If word has digits, tag as NUM
    weights["CONTAINS:DIGIT:NUM"] = 5
    weighted_history2("tag_dev.dat","q4_histories",weights,"q6_weighted",True,True)
    best_tag = open("q6_best", "w")
    weighted = open("q6_weighted", "r")
    call(["python", "tagger_decoder.py", "HISTORY"], stdout=best_tag, stdin=weighted)
    # Save file with word-tag combos
    set_tags("tag_dev.dat","q6_best","q6_output_combo4")

    remove("q6_best")
    remove("q6_weighted")
    remove("q4_histories")