コード例 #1
0
ファイル: POSTagger.py プロジェクト: Juicechuan/POSTagger
 def __init__(self, model):
     """Set up the tagger around an existing model.

     Stores ``model`` on the instance, builds an ``SWPerceptron`` from it,
     and points the progress/error log at ``sys.stderr``.
     """
     # Progress messages go to stderr unless errlog is reassigned later.
     self.errlog = sys.stderr
     self.model = model
     self.perceptron = SWPerceptron(self.model)
コード例 #2
0
ファイル: POSTagger.py プロジェクト: Juicechuan/POSTagger
class POSTagger:
    """Greedy left-to-right part-of-speech tagger backed by a perceptron.

    Each token is first looked up in ``model.tagdict``; tokens without an
    entry there are classified by ``SWPerceptron`` from features built out of
    the surrounding (normalized) words and the two previously assigned tags.
    Progress and accuracy messages are written to ``self.errlog``.
    """

    def __init__(self, model):
        """Store ``model``, build its perceptron and log to ``sys.stderr``."""
        self.model = model
        self.perceptron = SWPerceptron(model)
        self.errlog = sys.stderr

    def tag(self, test_instances, interval=500):
        """Tag every sentence and report accuracy against the gold tags.

        :param test_instances: iterable of sentences, each a sequence of
            ``(word, gold_tag)`` pairs.
        :param interval: write a running accuracy line every ``interval``
            sentences.
        :return: overall token accuracy as a float (``0.0`` when no tokens
            were seen).
        """
        return self._run_pass(test_instances, interval, learn=False)

    def train(self, train_instances, interval=500):
        """Run one training pass, updating the perceptron on each mistake.

        :param train_instances: iterable of sentences, each a sequence of
            ``(word, gold_tag)`` pairs.
        :param interval: write a running accuracy line every ``interval``
            sentences.
        :return: overall token accuracy of this pass as a float (``0.0``
            when no tokens were seen).
        """
        return self._run_pass(train_instances, interval, learn=True)

    def _run_pass(self, instances, interval, learn):
        """Shared loop behind :meth:`tag` and :meth:`train`.

        With ``learn`` true the perceptron is asked to predict in its default
        mode and then told about the outcome (``update_weight`` on a mistake,
        ``no_update`` otherwise); with ``learn`` false it is called with
        ``train=False`` and never updated.
        """
        start_time = time.time()
        # `context` is prefixed with START, so the token at sentence position
        # k lives at context[k + len(START)].
        # NOTE(review): the look-ahead features read context[i + 2], which
        # assumes END holds at least two padding entries -- confirm.
        offset = len(START)
        correct = 0.0
        total = 0.0
        sent_id = 0  # stays 0 when `instances` is empty
        for sent_id, inst in enumerate(instances, 1):
            # Tag history must restart at every sentence boundary; otherwise
            # the last tags of one sentence leak into the next one.
            prev, prev2 = START
            context = START + [self._normalize(w) for w, _ in inst] + END
            for i, (word, gold_tag) in enumerate(inst, offset):
                best_tag = self.model.tagdict.get(word)
                if not best_tag:
                    feats = self._get_features(i, word, context, prev, prev2)
                    if learn:
                        best_tag = self.perceptron.predict(feats)
                        if best_tag != gold_tag:
                            self.perceptron.update_weight(gold_tag, best_tag, feats)
                        else:
                            self.perceptron.no_update()
                    else:
                        best_tag = self.perceptron.predict(feats, train=False)
                prev2 = prev
                prev = best_tag
                correct += best_tag == gold_tag
                total += 1.0

            if sent_id % interval == 0:
                # Two spaces before "Accuracy" reproduce the output of the
                # original two-argument print statement byte for byte.
                self._log(
                    "Over %s sentences  Accuracy:%s"
                    % (sent_id, self._ratio(correct, total))
                )

        self._log(
            "One pass on %s sentences takes %s"
            % (
                sent_id,
                datetime.timedelta(seconds=round(time.time() - start_time, 0)),
            )
        )
        accuracy = self._ratio(correct, total)
        self._log("Total Accuracy: %s" % accuracy)
        return accuracy

    @staticmethod
    def _ratio(correct, total):
        """Return ``correct / total``, or ``0.0`` when nothing was counted."""
        return correct / total if total else 0.0

    def _log(self, message):
        """Write one line to ``self.errlog``.

        Uses ``write`` rather than the ``print >>`` statement so the same
        code runs on Python 2 and Python 3.
        """
        self.errlog.write(message + "\n")

    def _normalize(self, word):
        """Normalization used in pre-processing.

        - Words containing a hyphen that is not the first character are
          represented as !HYPHEN
        - Any four-character all-digit word is represented as !YEAR
          (no range check is applied)
        - Other words starting with a digit are represented as !DIGITS
        - Everything else is lower cased; an empty word is returned as is

        :rtype: str
        """
        if not word:
            # Guard: indexing word[0] below would fail on an empty token.
            return word
        if "-" in word and word[0] != "-":
            return "!HYPHEN"
        elif word.isdigit() and len(word) == 4:
            return "!YEAR"
        elif word[0].isdigit():
            return "!DIGITS"
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        """Map a token in its context to a set of feature strings.

        :param i: index of the current token inside ``context`` (i.e. already
            shifted past the leading padding).
        :param word: the raw (un-normalized) current token.
        :param context: padded list of normalized tokens for the sentence.
        :param prev: tag assigned to the previous token.
        :param prev2: tag assigned to the token before that.
        :return: set of strings of the form ``"<name>+<value>[+<value>]"``.
        """

        def add(name, *args):
            features.add("+".join((name,) + tuple(args)))

        features = set()
        add("bias")  # This acts sort of like a prior
        add("i suffix", word[-3:])
        add("i pref1", word[:1])  # slice, so an empty token yields "" instead of raising
        add("i-1 tag", prev)
        add("i-2 tag", prev2)
        # The label says "i tag" although the values are the two previous
        # tags; it is left unchanged because the string itself is the feature
        # identity (presumably a key into stored weights -- verify).
        add("i tag+i-2 tag", prev, prev2)
        add("i word", context[i])
        add("i-1 tag+i word", prev, context[i])
        add("i-1 word", context[i - 1])
        add("i-1 suffix", context[i - 1][-3:])
        add("i-2 word", context[i - 2])
        add("i+1 word", context[i + 1])
        add("i+1 suffix", context[i + 1][-3:])
        add("i+2 word", context[i + 2])
        return features