Exemple #1
0
    # Tag each input word independently with the state whose emission
    # log-probability is highest; blank lines are passed through unchanged.
    for line in test_file:
        word = line.strip()

        if not word:
            # Empty input line: echo an empty line to the output.
            sys.stdout.write("\n")
        else:
            original_word = word

            # Only words present in the training vocabulary with a count of
            # at least 5 are scored as themselves; everything else is scored
            # under the _RARE_ pseudo-word.
            if not (word in counter.all_words
                    and counter.word_counts[word] >= 5):
                word = "_RARE_"

            # Map each state to the log of its emission value for this word.
            candidates = defaultdict(float)
            for tag in counter.all_states:
                prob = counter.calc_emissions(word, tag)
                # log(0) is undefined, so a zero emission is stored as -inf.
                candidates[tag] = (float("-inf") if prob == 0.0
                                   else math.log(prob))

            # Best-scoring state (first one encountered wins ties).
            pred = max(candidates, key=candidates.get)

            # Output line: original word, predicted state, log emission.
            sys.stdout.write("%s %s %s\n" %
                             (original_word, pred, str(candidates[pred])))
Exemple #2
0
                    word = "_CF_"
                elif all(c in string.punctuation or c.isdigit() for c in word):
                    word = "_NP_"
                else:
                    word = "_RARE_"

            # Fill pi[k][(u, v)] for every pair with u in K[k - 1] and
            # v in K[k].
            for u in K[k - 1]:
                for v in K[k]:

                    # Score every w in K[k - 2]; the largest score becomes
                    # the pi value for (u, v).
                    w_candidates = defaultdict(float)

                    for w in K[k - 2]:
                        # Product of the previous step's pi value for (w, u),
                        # counter.calc_mle([w, u, v]) and
                        # counter.calc_emissions(word, v).
                        w_candidates[w] = pi[k - 1][(w, u)] * counter.calc_mle(
                            [w, u, v]) * counter.calc_emissions(word, v)

                    # final_w is the (w, score) pair with the highest score.
                    final_w = max(w_candidates.iteritems(),
                                  key=operator.itemgetter(1))

                    # Store only the score; the maximising w is not recorded.
                    pi[k][(u, v)] = final_w[1]

            # ((u, v), value) entry of pi[k] with the largest value.
            final_k_idx = max(pi[k].iteritems(), key=operator.itemgetter(1))

            prob = final_k_idx[1]
            # Log probability
            # NOTE(review): math.log raises ValueError when prob is 0.0 and
            # there is no guard here -- confirm that prob cannot reach zero.
            log_prob = math.log(prob)
            # Output format: word, tag, log probability
            sys.stdout.write("%s %s %s\n" %
            # Score position k of the padded sentence: fill pi[k] for every
            # tag pair, then write the word, the v component of the
            # best-scoring (u, v) pair and that pair's log value.
            word = pad_sent[k+1]
            original_word = word

            # Words absent from the training vocabulary, or seen fewer than
            # 5 times, are scored under the _RARE_ pseudo-word; original_word
            # keeps the surface form for the output line.
            if word not in counter.all_words or counter.word_counts[word] < 5:
                word = "_RARE_"

            # Fill pi[k][(u, v)] for every pair with u in K[k-1], v in K[k].
            for u in K[k-1]:
                for v in K[k]:

                    # The emission value depends only on (word, v), so it is
                    # computed once here instead of once per w.
                    emission = counter.calc_emissions(word, v)

                    # Score every w in K[k-2]: previous pi value for (w, u)
                    # times counter.calc_mle([w, u, v]) times the emission.
                    w_candidates = defaultdict(float)

                    for w in K[k-2]:
                        w_candidates[w] = (pi[k-1][(w, u)]
                                           * counter.calc_mle([w, u, v])
                                           * emission)

                    # final_w is the (w, score) pair with the highest score.
                    final_w = max(w_candidates.iteritems(),
                                  key=operator.itemgetter(1))

                    # Assign pi value
                    pi[k][(u, v)] = final_w[1]

            # ((u, v), value) entry of pi[k] with the largest value.
            final_k_idx = max(pi[k].iteritems(), key=operator.itemgetter(1))

            prob = final_k_idx[1]
            # math.log raises ValueError for 0.0, which occurs when every
            # path scores zero or the running product of probabilities
            # underflows; report -inf instead of crashing mid-sentence.
            if prob == 0.0:
                log_prob = float("-inf")
            else:
                log_prob = math.log(prob)
            # Output format: word, tag, log probability
            sys.stdout.write("%s %s %s\n" % (original_word, final_k_idx[0][1], log_prob))