コード例 #1
0
ファイル: labeler.py プロジェクト: ranxian/postagger
class Labeler:
    def __init__(self):
        """Create an untrained labeler with empty lookup tables."""
        # word -> role shortcut table; only _make_stw() fills it.
        self.single_tag_words = {}
        self.chk_set = set()
        # Role inventory. train() replaces it and tag2() reads it; creating it
        # here keeps tag2() on an untrained labeler from raising AttributeError.
        self.role_set = set()
        self.perceptron = Perceptron()

    def train(self, sents, niter):
        # make single_tag_words
        self.perceptron.reset()
        roled_sents = sents
        # self._make_stw(roled_sents)
        self.role_set = set(role for sent in roled_sents for (word, tag, chunk, role) in sent)
        self.perceptron.tag_set = self.role_set
        length = int(len(roled_sents))

        for iteration in range(niter):
            ncorrect = 0
            ntotal = 0
            for sent in roled_sents[:length]:
                sent = [(self._normalize(word), tag, chk, role) for (word, tag, chk, role) in sent]
                for idx, (word, tag, chk, role) in enumerate(sent):
                    # pred = self.single_tag_words.get(word)
                    pred = None
                    if not pred:
                        features = self._get_features(idx, sent)
                        pred = self.perceptron.predict(features)
                        self.perceptron.update(role, pred, features)
                    # successful prediction
                    ncorrect += pred == role
                    ntotal += 1
            random.shuffle(roled_sents)
            print "iteration #{0}, {1}/{2}=precision: {3}".format(iteration, ncorrect, ntotal, ncorrect / ntotal)

        self.perceptron.average_weights()

    def _make_stw(self, chked_sents):
        counts = defaultdict(lambda: defaultdict(int))
        for sent in chked_sents:
            sent = [(self._normalize(word), tag, chunk, role) for (word, tag, chunk, role) in sent]
            for word, tag, chunk, role in sent:
                counts[word][role] += 1

        threshold = 0.95
        freqthres = 15

        for word, tag_freqs in counts.items():
            role, freq = max(tag_freqs.items(), key=lambda item: item[1])
            total = sum(tag_freqs.values())
            if freq >= freqthres and freq / total >= threshold:  # unambiguity
                self.single_tag_words[word] = role
            elif freq == total and total >= 3:
                self.single_tag_words[word] = role

    def _normalize(self, word):
        def isnum(word):
            return word.endswith(tuple("一 二 三 四 五 六 七 八 九 十 百 千 万 亿 两".split(" ")))

        def iscal(word):
            return word.endswith(("年", "月", "日", "年代"))

        if isnum(word):
            return "NUM"

        if iscal(word):
            return "CAL"

        if word.endswith(("省", "市", "区", "州", "县", "镇", "乡", "街")):
            return "LOCATION"

        return word

    def _make_features(self, current, prev1, prev2, fword1, fword2):
        def add(name, *args):
            features["_".join((name,) + tuple(args))] = 1

        word, tag, chk, role = current
        pword1, ptag1, pchk1, prole1 = prev1
        pword2, ptag2, pchk2, prole2 = prev2
        fword1, ftag1, fchk1 = fword1
        fword2, ftag2, fchk2 = fword2

        features = defaultdict(int)

        if chk != "VP":
            add("i chunk", chk)
        add("bias")
        add("i word", word)
        add("i tag", tag)

        add("i-1 word", pword1)
        add("i-1 tag", ptag1)
        add("i-1 role", prole1)
        add("i-1 chunk", pchk1)

        add("i-2 tag", ptag2)
        add("i-2 word", pword2)
        add("i-2 role", prole2)
        add("i-2 chunk", pchk2)

        add("i+1 word", fword1)
        add("i+1 tag", ftag1)
        add("i+1 chunk", pchk1)

        add("i+2 word", fword2)
        add("i+2 tag", ftag2)
        add("i+2 chunk", pchk2)

        if prole1[0] == "E" or prole1[0] == "O":
            add("out role")
        elif prole1[0] == "I" or prole1[0] == "B":
            add("in role", prole1.split("-")[1])

        return features

    # current, prev1, prev2, after1, after2
    def _get_features(self, i, sent):
        def add(name, *args):
            features["_".join((name,) + tuple(args))] = 1

        def deletes(word):
            return word[1:] if word[0] == "*" else word

        def pretag(j):
            if j < 0:
                return "START1_TAG"
            elif j >= len(sent):
                return "END1_TAG"
            else:
                word, pos, chk, role = sent[j]
                if pos == "PP":
                    return word
                else:
                    return pos

        def prechk(j):
            if j == -2:
                return "START2_CHK"
            elif j == -1:
                return "START1_CHK"
            elif j == len(sent):
                return "END1_CHK"
            elif j == len(sent) + 1:
                return "END2_CHK"
            else:
                return sent[j][2]

        pword1, ptag1, pchk1, prole1 = (
            ("START1_WORD", "START1_TAG", "START1_CHK", "START1_ROLE") if i <= 0 else sent[i - 1]
        )
        pword2, ptag2, pchk2, prole2 = (
            ("START2_WORD", "START2_TAG", "START2_CHK", "START2_ROLE") if i <= 1 else sent[i - 2]
        )
        word, tag, chk, role = sent[i]
        fword1, ftag1, fchk1, frole1 = (
            ("END1_WORD", "END1_TAG", "END1_CHK", "END1_ROLE") if i >= len(sent) - 1 else sent[i + 1]
        )
        fword2, ftag2, fchk2, frole2 = (
            ("END2_WORD", "END2_TAG", "END2_CHK", "END2_ROLE") if i >= len(sent) - 2 else sent[i + 2]
        )

        pword1 = deletes(pword1)
        pword2 = deletes(pword2)
        word = deletes(word)
        fword1 = deletes(fword1)
        fword2 = deletes(fword2)

        sent_len = len(sent)

        features = defaultdict(int)

        hasa0 = False
        for j in range(0, i):
            if "A0" in sent[j][3]:
                hasa0 = True
                break
        add("i has-A0") if hasa0 else add("i No-A0")

        pred_pos = 0
        predicate = None

        add("i pos", str(i))

        if word[0] == "*":
            add("i is-predicate")
            word = word[:1]
            pred_pos = i
        else:
            for j in range(len(sent)):
                if sent[j][0][0] == "*":
                    pred_pos = j
                    predicate = sent[j]

            if pred_pos < i:
                add("i before")
            else:
                add("i after")

            r = range(i, pred_pos + 1) if i < pred_pos else range(pred_pos, i + 1)
            r2 = range(i + 1, pred_pos) if i < pred_pos else range(pred_pos + 1, i)

            path = []
            nbp, nvp, nnp = 0, 0, 0
            for j in r:
                if j == i:
                    path.append(word)
                elif j == pred_pos:
                    path.append(sent[j][1])
                else:
                    path.append(sent[j][2])

            for j in r2:
                if sent[j][2] != "O":
                    nbp += 1
                if sent[j][2] == "VP":
                    nvp += 1
                if sent[j][2] == "NP":
                    nnp += 1
            path = "-".join(path)
            add("i path", path)
            add("i D-BP", str(nbp))
            add("i D-VP", str(nvp))
            add("i D-NP", str(nnp))

        predicate = sent[pred_pos]
        pre_word, pre_pos, pre_chk, pre_role = predicate
        pre_word = pre_word[1:]
        pre_role = "E-V"

        add("pred", pre_word)
        add("pred-tag", pre_pos)
        add("pred-before-tag", pretag(pred_pos - 1))
        add("pred-after-tag", pretag(pred_pos + 1))
        add("pred pos", str(pred_pos))

        add("pred-1 bp", prechk(pred_pos - 1))
        add("pred-2 bp", prechk(pred_pos - 2))
        add("pred+1 bp", prechk(pred_pos + 1))
        add("pred+2 bp", prechk(pred_pos + 2))

        if i == 0:
            add("i begin")

        if i == len(sent) - 1:
            add("i end")

        add("i chunk", chk)
        add("bias")
        add("i word", word)
        add("i tag", tag)
        add("i suffix2", word[-6:])
        add("i suffix1", word[-3:])

        add("i-1 word", pword1)
        add("i-1 tag", ptag1)
        add("i-1 role", prole1)
        add("i-1 chunk", pchk1)

        add("i-2 tag", ptag2)
        add("i-2 word", pword2)
        add("i-2 role", prole2)
        add("i-2 chunk", pchk2)

        add("i+1 word", fword1)
        add("i+1 tag", ftag1)
        add("i+1 chunk", pchk1)

        add("i+2 word", fword2)
        add("i+2 tag", ftag2)
        add("i+2 chunk", pchk2)

        if prole1[0] == "E" or prole1[0] == "O":
            add("out role")
        elif prole1[0] == "I" or prole1[0] == "B":
            add("in role", prole1[2:])

        return features

    def tag(self, tagged_sent):
        roled = [[self._normalize(word), tag, chk, None] for word, tag, chk in tagged_sent]

        for idx, (word, tag, chunk, role) in enumerate(roled):
            # pred = self.single_tag_words.get(word)
            pred = None
            if not pred:
                features = self._get_features(idx, roled)
                if features["i is-predicate"] == 1:
                    pred = "E-V"
                else:
                    pred = self.perceptron.predict(features)
            roled[idx][3] = pred

        in_bracket = False
        for idx, (word, tag, chunk, role) in enumerate(roled):
            if role[0] == "B" or (role[0] == "E" and role[1] == "B"):
                if in_bracket:
                    if role[0] == "E":  # in bracket, EB
                        j = idx - 1
                        while j > 0:
                            if roled[j][3][0] == "B":
                                break
                            j -= 1
                        roled[idx][3] = "E-" + roled[j][3][2:]
                        in_bracket = False
                    else:
                        roled[idx][3] = "I-" + roled[idx - 1][3][2:]
                else:
                    if not role[0] == "E":
                        in_bracket = True
            elif role[0] == "E":
                if in_bracket:
                    j = idx - 1
                    while j > 0:
                        if roled[j][3][0] == "B":
                            break
                        j -= 1
                    roled[idx][3] = "E-" + roled[j][3][2:]
                    in_bracket = False
                else:
                    roled[idx][3] = "EB-" + roled[idx][3][2:]
            else:
                if in_bracket:
                    if idx == len(roled) - 1:
                        j = idx - 1
                        while j > 0:
                            if roled[j][3][0] == "B":
                                break
                            j -= 1
                        roled[idx][3] = "E-" + roled[j][3][2:]
                    else:
                        roled[idx][3] = "I-" + roled[idx - 1][3][2:]
                else:
                    roled[idx][3] = "O"
            if in_bracket and idx == len(roled) - 1:
                j = idx - 1
                while j > 0:
                    if roled[j][3][0] == "B":
                        break
                    j -= 1
                roled[idx][3] = "E-" + roled[j][3][2:]

        for idx, (word, tag, chunk, role) in enumerate(roled):
            if role.startswith("EB"):
                roled[idx][3] = "E" + role[2:]

        return roled

    def tag2(self, sent):
        """Label ``sent`` with a dynamic-programming search over role pairs.

        Args:
            sent: sequence of ``(word, pos, chunk)`` triples.

        Returns:
            List of ``[normalized_word, pos, chunk, role]`` lists (also passed
            to ``printc`` before returning).

        Reads ``self.role_set`` and scores candidates with
        ``self.perceptron.get_score`` on features from ``_make_features``.

        NOTE(review): the table indexing and the back-trace below look
        inconsistent (see inline notes); verify before relying on the output.
        """
        tagged = [[self._normalize(word), pos, chk, None] for word, pos, chk in sent]

        nword = len(sent)
        ntag = len(self.role_set)
        # pi[i][j][k] is a cell [best score, role, back-pointer index]; scores
        # start at 0, so a candidate is only recorded if it scores above 0.
        pi = [[[[0, None, None] for k in range(ntag)] for j in range(ntag)] for i in range(nword)]

        for i, (word, tag, chk, role) in enumerate(tagged):
            # Context windows, padded with sentinels at the sentence edges.
            pword1, ptag1, pchk1 = ("START1_WORD", "START1_TAG", "START1_CHK") if i <= 0 else tagged[i - 1][:3]
            pword2, ptag2, pchk2 = ("START2_WORD", "START2_TAG", "START2_CHK") if i <= 1 else tagged[i - 2][:3]
            word, tag, chk = tagged[i][:3]
            fword1, ftag1, fchk1 = ("END1_WORD", "END1_TAG", "END1_CHK") if i >= len(tagged) - 1 else tagged[i + 1][:3]
            fword2, ftag2, fchk2 = ("END2_WORD", "END2_TAG", "END2_CHK") if i >= len(tagged) - 2 else tagged[i + 2][:3]
            for j, u in enumerate(self.role_set):
                # NOTE(review): the START2 sentinel is chosen at i <= 0 and the
                # START1 sentinel at i <= 1, the reverse of _get_features --
                # possibly swapped; confirm.
                prole2 = "START2_ROLE" if i <= 0 else u
                for k, v in enumerate(self.role_set):
                    prole1 = "START1_ROLE" if i <= 1 else v
                    for t, role in enumerate(self.role_set):
                        # Extend the score stored for the previous position.
                        score = 0 if i <= 0 else pi[i - 1][t][j][0]
                        score += self.perceptron.get_score(
                            self._make_features(
                                (word, tag, chk, role),
                                (pword1, ptag1, pchk1, prole1),
                                (pword2, ptag2, pchk2, prole2),
                                (fword1, ftag1, fchk1),
                                (fword2, ftag2, fchk2),
                            ),
                            role,
                        )
                        if score > pi[i][j][k][0]:
                            pi[i][j][k][0] = score
                            pi[i][j][k][1] = role
                            pi[i][j][k][2] = t
        i = len(tagged) - 1
        t, j = None, None
        # NOTE(review): this loop overwrites the chosen role and back-pointer on
        # every (j, k), so the last enumerated cell wins, not the best-scoring
        # one -- confirm whether a max over cells was intended.
        for j, u in enumerate(self.role_set):
            for k, v in enumerate(self.role_set):
                tag, t = pi[i][j][k][1:3]
                tagged[i][3] = tag
        i -= 1
        # Walk the back-pointers towards the start of the sentence.
        while i >= 0:
            tagged[i][3] = pi[i][t][j][1]
            j = t
            t = pi[i][t][j][2]
            i -= 1
        printc(tagged)

        return tagged

    def evaluate(self, roled_sents):
        ntotal = 0
        ncorrect = 0
        faults = []
        likely = {}
        faults_count = defaultdict(int)

        f = open("test.props.txt", "w")
        for roled_sent in roled_sents:
            tagged_sent = [(word, tag, chunk) for (word, tag, chunk, role) in roled_sent]
            roled = self.tag(tagged_sent)

            for idx, (word, tag, chunk, role) in enumerate(roled):
                thword = tagged_sent[idx][0]
                if thword[0] == "*":
                    thword = thword[1:]
                f.write("%s\t%s\t%s\t%s\n" % (thword, tag, chunk, role))
            f.write("\n")
            has_false = False
            for idx, (word, tag, chunk, role) in enumerate(roled_sent):
                ntotal += 1
                if role == roled[idx][3]:
                    ncorrect += 1
                else:
                    has_false = True
            if has_false:
                record = []
                for idx, (word, tag, chunk, role) in enumerate(roled_sent):
                    if role == roled[idx][3]:
                        record.append((word, tag, chunk, role))
                    else:
                        record.append((word, tag, chunk, role, "【" + roled[idx][3] + "】"))
                        faults_count[role + " is roled as " + roled[idx][3]] += 1
                faults.append(record)

        print "precision:", ncorrect / ntotal * 100, "%"
        sorted_fault_count = sorted(faults_count.items(), key=lambda item: item[1], reverse=True)
        f.close()
        # for key, value in sorted_fault_count:
        # print key, value
        return faults
コード例 #2
0
ファイル: chunker.py プロジェクト: ranxian/postagger
class Chunker:
    def __init__(self):
        '''Set up an untrained chunker with empty tables and a new perceptron.'''
        self.perceptron = Perceptron()
        # Chunk label inventory; replaced by train().
        self.chk_set = set()
        # word -> chunk label shortcut table; filled by _make_stw().
        self.single_tag_words = {}

    def train(self, sents, niter):
        # make single_tag_words
        self.perceptron.reset()
        chked_sents = sents
        self._make_stw(chked_sents)
        self.chk_set = set(chunk for sent in chked_sents for (word, tag, chunk) in sent)
        self.perceptron.tag_set = self.chk_set
        length = int(len(chked_sents))

        for iteration in range(niter):
            ncorrect = 0
            ntotal = 0
            for sent in chked_sents[:length]:
                sent = [(self._normalize(word), tag, chk) for (word, tag, chk) in sent]
                for idx, (word, tag, chk) in enumerate(sent):
                    pred = self.single_tag_words.get(word)
                    if not pred:
                        features = self._get_features(idx, sent)
                        pred = self.perceptron.predict(features)
                        self.perceptron.update(chk, pred, features)
                    # successful prediction
                    ncorrect += pred == chk
                    ntotal += 1
            random.shuffle(chked_sents)
            print "iteration #{0}, {1}/{2}=precision: {3}".format(iteration,
                                                          ncorrect, ntotal, ncorrect / ntotal)

        self.perceptron.average_weights()

    def _make_stw(self, chked_sents):
        counts = defaultdict(lambda: defaultdict(int))
        for sent in chked_sents:
            sent = [(self._normalize(word), tag, chunk) for (word, tag, chunk) in sent]
            for word, tag, chunk in sent:
                counts[word][chunk] += 1

        threshold = 0.95
        freqthres = 10

        for word, tag_freqs in counts.items():
            chunk, freq = max(tag_freqs.items(), key=lambda item: item[1])
            total = sum(tag_freqs.values())
            if freq >= freqthres and freq / total >= threshold:   # unambiguity
                self.single_tag_words[word] = chunk
            elif freq == total and total >= 3:
                self.single_tag_words[word] = chunk


    def _normalize(self, word):
        def isnum(word):
            return word.endswith(tuple('一 二 三 四 五 六 七 八 九 十 百 千 万 亿 两'.split(' ')))

        def iscal(word):
            return word.endswith(('年', '月', '日', '年代'))

        if (isnum(word)):
            return 'NUM'

        if (iscal(word)):
            return 'CAL'

        if (word.endswith(('省', '市', '区', '州', '县', '镇', '乡', '街'))):
            return 'LOCATION'

        return word

    # current, prev1, prev2, after1, after2
    def _get_features(self, i, sent):
        def add(name, *args):
            features['_'.join((name, ) + tuple(args))] = 1

        pword1, ptag1, pchk1 = ('START1_WORD', 'START1_TAG', 'EB-START1_CHK') if i <= 0 else sent[i-1]
        pword2, ptag2, pchk2 = ('START2_WORD', 'START2_TAG', 'EB-START2_CHK') if i <= 1 else sent[i-2]

        pchk1_p = pchk1.split('-')[1] if pchk1 != 'O' else pchk1
        pchk2_p = pchk2.split('-')[1] if pchk2 != 'O' else pchk2
        word, tag, chk = sent[i]
        fword1, ftag1, fchk1 = ('END1_WORD', 'END1_TAG', 'EB-END1_CHK') if i >= len(sent)-1 else sent[i+1]
        fword2, ftag2, fchk2 = ('END2_WORD', 'END2_TAG', 'EB-END2_CHK') if i >= len(sent)-2 else sent[i+2]

        features = defaultdict(int)
        add('bias')
        add('i word', word)
        add('i tag', tag)
        add('i tag prefix', tag[0])
        add('i suffix1', word[-3:])
        add('i-1 suffix1', word[-3:])

        add('i-1 word', pword1)
        add('i-1 tag', ptag1)
        add('i-1 tag prefix', ptag1[0])
        add('i-1 i word', pword1, word)
        # add('i-2 i-1 chunk', pchk2, pchk1)
        # add('i-2 i-1 chunk_p', pchk2_p, pchk1_p)
        # add('i-1 chunk', pchk1)
        # add('i-2 chunk', pchk2)

        add('i-i i pos', ptag1, tag)
        add('i i+1 pos', tag, ftag1)
        add('i i+1 i+2 pos', tag, ftag1, ftag2)
        add('i+1 word i pos', fword1, tag)
        add('i-1 word i pos', pword1, tag)
        add('i-1 pos i+1 pos', ptag1, ftag1)
        add('i pos i+2 pos', tag, ftag2)

        add('i-2 word', ptag2)
        add('i-2 tag', ptag2)
        add('i-2 tag prefix', ptag2[0])

        add('i+1 word', fword1)
        add('i+1 tag', ftag1)
        add('i+1 tag prefix', ftag1[0])

        add('i+2 word', fword2)
        add('i+2 tag', ftag2)
        add('i+2 tag prefix', ftag2[0])

        add('i-1 tag i tag i+1 tag', ptag1, tag, ftag1)
        add('i-1 tag i word i+1 tag', ptag1, word, ftag1)

        if pchk1[0] == 'E' or pchk1[0] == 'O':
            add('out chunk')
        elif pchk1[0] == 'I' or pchk1[0] == 'B':
            add('in chunk', pchk1[2:])

        for j in xrange(i-1, -1, -1):
            if sent[j][2][0] == 'E':
                add('before chunk', sent[j][2].split('-')[1])
                break

        if i == 0:
            add('i begin')
        elif i == len(sent)-1:
            add('i end')

        return features

    def tag(self, tagged_sent):
        chked = [[self._normalize(word), tag, None] for word, tag in tagged_sent]

        for idx, (word, tag, chunk) in enumerate(chked):
            pred = self.single_tag_words.get(word)
            if not pred:
                features = self._get_features(idx, chked)
                pred = self.perceptron.predict(features)
            chked[idx][2] = pred

        in_bracket = False

        for idx, (word, tag, chunk) in enumerate(chked):
            if chunk[0] == 'B' or (chunk[0] == 'E' and chunk[1] == 'B'):
                if in_bracket:
                    if chunk[0] == 'E':  # in bracket, EB
                        j = idx-1
                        while j > 0:
                            if chked[j][2][0] == 'B':
                                break
                            j -= 1
                        chked[idx][2] = 'E-' + chked[j][2][2:]
                        if tag[0] == 'V' and chked[j][2][2:][0] != 'V':
                            print 1, word, tag, 'to', chked[j][2][2:]
                        in_bracket = False
                    else:
                        chked[idx][2] = 'I-' + chked[idx-1][2][2:]
                        if tag[0] == 'V' and chked[idx-1][2][2:][0] != 'V':
                            print 2, word, tag, 'to', chked[idx-1][2][2:]
                else:
                    if not chunk[0] == 'E':
                        in_bracket = True
            elif chunk[0] == 'E':
                if in_bracket:
                    j = idx-1
                    while j > 0:
                        if chked[j][2][0] == 'B':
                            break
                        j -= 1
                    chked[idx][2] = 'E-' + chked[j][2][2:]
                    if tag[0] == 'V' and chked[j][2][2:][0] != 'V':
                            print 3, word, tag, 'to', chked[j][2][2:]
                    in_bracket = False
                else:
                    chked[idx][2] = 'EB-' + chked[idx][2][2:]
            else:
                if in_bracket:
                    if idx == len(chked)-1:
                        j = idx-1
                        while j > 0:
                            if chked[j][2][0] == 'B':
                                break
                            j -= 1
                        chked[idx][2] = 'E-' + chked[j][2][2:]
                        if tag[0] == 'V' and chked[j][2][2:][0] != 'V':
                            print 4, word, tag, 'to', chked[j][2][2:]
                    else:
                        chked[idx][2] = 'I-' + chked[idx-1][2][2:]
                        if tag[0] == 'V' and chked[idx-1][2][2:][0] != 'V':
                            print 5, word, tag, 'to', chked[idx-1][2][2:]
                else:
                    chked[idx][2] = 'O'
            if in_bracket and idx == len(chked)-1:
                j = idx-1
                while j > 0:
                    if chked[j][2][0] == 'B':
                        break
                    j -= 1
                chked[idx][2] = 'E-' + chked[j][2][2:]
                if tag[0] == 'V' and chked[j][2][2:][0] != 'V':
                    print word, tag, 'to', chked[j][2][2:]

        for idx, (word, tag, chunk) in enumerate(chked):
            if tag[0] == 'V' and chunk != 'O' and chunk.split('-')[1][0] != 'V':
                if idx != 0:
                    if chked[idx-1][2][0] == 'I':
                        chked[idx-1][2] = 'E' + chked[idx-1][2][1:]
                    elif chked[idx-1][2][0] == 'B':
                        chked[idx-1][2] = 'E' + chked[idx-1][2]
                if idx != len(chked)-1:
                    if chked[idx+1][2][0] == 'I':
                        chked[idx+1][2] = 'B' + chked[idx+1][2][1:]
                    elif chked[idx+1][2][0] == 'E' and chked[idx+1][2][1] == '-':
                        chked[idx+1][2] = 'EB' + chked[idx-1][2][1:]
                chked[idx][2] = 'EB-VP'

        for idx, (word, tag, chunk) in enumerate(chked):
            if tag[0] == 'V' and chunk != 'O' and chunk.split('-')[1][0] != 'V':
                print word, tag, 'in', chunk
        return chked

    def tag2(self, sent):
        '''Label the words of ``sent`` with a dynamic-programming search.

        Args:
            sent: sequence of words.

        Returns:
            List of ``[normalized_word, label]`` lists (also passed to
            ``printc`` before returning).

        Reads ``self.chk_set`` and scores candidates with
        ``self.perceptron.get_score``.

        NOTE(review): this calls ``self._make_features``, which is not visible
        among this class's methods -- unless it is defined elsewhere the call
        raises AttributeError.  The indexing and back-trace also look
        inconsistent (see inline notes); verify before use.
        '''
        tagged = [[self._normalize(word), None] for word in sent]

        nword = len(sent)
        ntag = len(self.chk_set)
        # pi[i][j][k] is a cell [best score, label, back-pointer index]; scores
        # start at 0, so a candidate is only recorded if it scores above 0.
        pi = [[[[0, None, None] for k in range(ntag)] for j in range(ntag)] for i in range(nword)]

        for i, (word, tag) in enumerate(tagged):
            # Neighbouring words, padded with sentinels at the sentence edges.
            pword1 = 'START1_WORD' if i <= 0 else tagged[i-1][0]
            pword2 = 'START2_WORD' if i <= 1 else tagged[i-2][0]
            fword1 = 'END1_WORD' if i >= len(sent)-1 else tagged[i+1][0]
            fword2 = 'END2_WORD' if i >= len(sent)-2 else tagged[i+2][0]
            for j, u in enumerate(self.chk_set):
                # NOTE(review): ptag2 gets the START1 sentinel and ptag1 the
                # START2 sentinel -- possibly swapped; confirm.
                ptag2 = 'START1_TAG' if i <= 0 else u
                for k, v in enumerate(self.chk_set):
                    ptag1 = 'START2_TAG' if i <= 1 else v
                    for t, tag in enumerate(self.chk_set):
                        # Extend the score stored for the previous position.
                        score = 0 if i <= 0 else pi[i-1][t][j][0]
                        score += self.perceptron.get_score(self._make_features((word, tag), (pword1, ptag1), (pword2, ptag2), fword1, fword2), tag)
                        if score > pi[i][j][k][0]:
                            pi[i][j][k][0] = score
                            pi[i][j][k][1] = tag
                            pi[i][j][k][2] = t
        i = len(tagged)-1
        t, j = None, None
        # NOTE(review): this loop overwrites the chosen label and back-pointer
        # on every (j, k), so the last enumerated cell wins, not the
        # best-scoring one -- confirm whether a max over cells was intended.
        for j, u in enumerate(self.chk_set):
            for k, v in enumerate(self.chk_set):
                tag, t = pi[i][j][k][1:3]
                tagged[i][1] = tag
        i -= 1
        # Walk the back-pointers towards the start of the sentence.
        while i >= 0:
            tagged[i][1] = pi[i][t][j][1]
            j = t
            t = pi[i][t][j][2]
            i -= 1
        printc(tagged)

        return tagged

    def evaluate(self, chked_sents, log=False):
        ntotal = 0
        ncorrect = 0
        faults = []
        likely = {}
        faults_count = defaultdict(int)

        f = open('test.pos-chk.iob', 'w')
        for chked_sent in chked_sents:
            tagged_sent = [(word, tag) for (word, tag, chunk) in chked_sent]
            chked = self.tag(tagged_sent)

            for word, tag, chunk in chked:
                f.write('%s\t%s\n' % (tag, chunk))
            f.write('\n')
            has_false = False
            for idx, (word, tag, chunk) in enumerate(chked_sent):
                ntotal += 1
                if chunk == chked[idx][2]:
                    ncorrect += 1
                else:
                    has_false = True
            if has_false and log:
                record = []
                for idx, (word, tag, chunk) in enumerate(chked_sent):
                    if chunk == chked[idx][2]:
                        record.append((word, tag, chunk))
                    else:
                        record.append((word, tag, chunk, '【' + chked[idx][2] + '】'))
                        faults_count[chunk + ' is chked as ' + chked[idx][2]] += 1
                faults.append(record)
        f.close()

        print 'precision:', ncorrect / ntotal * 100, '%'
        sorted_fault_count = sorted(faults_count.items(), key=lambda item: item[1], reverse=True)
        for key, value in sorted_fault_count:
            print key, value
        return faults
コード例 #3
0
ファイル: tagger.py プロジェクト: ranxian/postagger
class Tagger:
    def __init__(self):
        """Create an untrained tagger.

        Attributes:
            single_tag_words: maps a normalized word to the tag it is given
                without asking the perceptron; filled by ``_make_stw`` and
                looked up in ``train`` and ``tag``.
            tag_set: the tags seen in the training data; assigned by
                ``train``.
            perceptron: the ``Perceptron`` instance used for every word that
                is not in ``single_tag_words``.
        """
        self.single_tag_words = dict()
        self.tag_set = set()
        self.perceptron = Perceptron()

    def train(self, sents, niter):
        """Train the perceptron on tagged sentences.

        Args:
            sents: list of sentences, each a list of ``(word, tag)`` pairs.
                The list is not reordered; shuffling happens on a copy.
            niter: number of passes over the training data.

        The perceptron is reset, ``single_tag_words`` is rebuilt with
        ``_make_stw`` and ``tag_set`` is recomputed from the data.  In each
        pass, words found in ``single_tag_words`` take that tag directly and
        do not update the model; every other word is predicted and the
        perceptron is updated with the gold tag.  After each pass the data is
        shuffled and the pass accuracy is printed.  The weights are averaged
        once all passes are done.
        """
        self.perceptron.reset()
        # Shallow copy: the per-pass shuffle below must not reorder the
        # caller's list.
        tagged_sents = list(sents)
        self._make_stw(tagged_sents)
        self.tag_set = set(tag for sent in tagged_sents for (word, tag) in sent)
        self.perceptron.tag_set = self.tag_set

        for iteration in range(niter):
            ncorrect = 0
            ntotal = 0
            for sent in tagged_sents:
                sent = [(self._normalize(word), tag) for (word, tag) in sent]
                for idx, (word, tag) in enumerate(sent):
                    pred = self.single_tag_words.get(word)
                    if not pred:
                        features = self._get_features(idx, sent)
                        pred = self.perceptron.predict(features)
                        self.perceptron.update(tag, pred, features)
                    # A bool adds as 0/1, so this counts successful predictions.
                    ncorrect += pred == tag
                    ntotal += 1
            random.shuffle(tagged_sents)
            # float() keeps the ratio fractional under Python 2 integer
            # division; the guard avoids ZeroDivisionError on empty data.
            precision = float(ncorrect) / ntotal if ntotal else 0.0
            print("iteration #{0}, {1}/{2}=precision: {3}".format(iteration,
                                                          ncorrect, ntotal, precision))

        self.perceptron.average_weights()

    def _make_stw(self, tagged_sents):
        counts = defaultdict(lambda: defaultdict(int))
        for sent in tagged_sents:
            sent = [(self._normalize(word), tag) for (word, tag) in sent]
            for word, tag in sent:
                counts[word][tag] += 1

        threshold = 0.95
        freqthres = 15

        for word, tag_freqs in counts.items():
            tag, freq = max(tag_freqs.items(), key=lambda item: item[1])
            total = sum(tag_freqs.values())
            if freq >= freqthres and freq / total >= threshold:   # unambiguity
                self.single_tag_words[word] = tag
            elif tag == 'NR':
                self.single_tag_words[word] = tag
            elif freq == total and total >= 3:
                self.single_tag_words[word] = tag


        # self.single_tag_words['的'] = 'DEG'
        self.single_tag_words['-'] = 'PU'
        self.single_tag_words['--'] = 'PU'

    def _normalize(self, word):
        def isnum(word):
            return word.endswith(tuple('一 二 三 四 五 六 七 八 九 十 百 千 万 亿 两'.split(' ')))

        def iscal(word):
            return word.endswith(('年', '月', '日', '年代'))

        if (isnum(word)):
            return 'NUM'

        if (iscal(word)):
            return 'CAL'

        if (word.endswith(('省', '市', '区', '州', '县', '镇', '乡', '街'))):
            return 'LOCATION'

        return word

    # current, prev1, prev2, after1, after2
    def _make_features(self, current, prev1, prev2, fword1, fword2):
        def add(name, *args):
            features['_'.join((name, ) + tuple(args))] = 1

        word, tag = current
        pword1, ptag1 = prev1
        pword2, ptag2 = prev2

        features = defaultdict(int)
        add('bias')
        if word[0] == '*':
            add('i is v')
        else:
            add('i not v')
        add('i suffix', word[-3:])
        add('i-1 suffix', pword1[-3:])
        add('i+1 suffix', fword1[-3:])
        add('i suffix2', word[-6:])
        add('i-1 suffix', pword1[-6:])
        add('i+1 suffix', fword2[-6:])
        # add('i prefix', word[:3])
        # add('i-1 prefix', pword1[:3])
        # add('i-2 prefix', pword2[:3])
        add('i-1 tag', ptag1)
        add('i-2 tag', ptag2)
        add('i-1 i-2 tag', ptag1, ptag2)
        add('i-2 word', pword2)
        add('i-1 word', pword1)
        add('i word', word)
        add('i-2 i-1 word', fword2, fword1)
        add('i+1 word', fword1)
        add('i+2 word', fword2)
        add('i+1 i+2 word', fword1, fword2)
        add('i-1 tag i word', ptag1, word)
        add('i-2 tag i-1 word', ptag2, pword1)
        add('i word-len', str(len(word)))
        # if word != 'NUM' and word != 'CAL' and len(word) >= 6:
        #     for i in range(int(len(word) / 3)):
        #         add(str(i), ' charactor', word[i*3:(i+1)*3])

        return features

    def _get_features(self, i, sent):
        pword1, ptag1 = ('START1_WORD', 'START1_TAG') if i <= 0 else sent[i-1]
        pword2, ptag2 = ('START2_WORD', 'START2_TAG') if i <= 1 else sent[i-2]
        word, tag = sent[i]
        fword1, ftag1 = ('END1_WORD', 'END1_TAG') if i >= len(sent)-1 else sent[i+1]
        fword2, ftag2 = ('END2_WORD', 'END2_TAG') if i >= len(sent)-2 else sent[i+2]
        return self._make_features((word, tag), (pword1, ptag1), (pword2, ptag2), fword1, fword2)

    def tag(self, sent):
        original = [[word, None] for word in sent]
        tagged = [[self._normalize(word), None] for word in sent]

        for idx, (word, tag) in enumerate(tagged):
            pred = self.single_tag_words.get(word)
            if not pred:
                features = self._get_features(idx, tagged)
                pred = self.perceptron.predict(features)
            tagged[idx][1] = pred
            original[idx][1] = pred

        return original

    def tag2(self, sent):
        """Tag a sentence with a table-based search over tag pairs.

        ``sent`` is a sequence of words.  A table ``pi`` of size
        ``len(sent) x ntag x ntag`` is filled with
        ``[best score, best tag, index of best tag]`` cells using
        ``self.perceptron.get_score``, then read back from the last position
        to the first.  The result is passed to ``printc`` and returned.

        Returns a list of ``[normalized_word, tag]`` lists.  Unlike ``tag``,
        the words in the result are the normalized forms.

        NOTE(review): this appears intended as second-order Viterbi decoding,
        but several steps below do not match that algorithm; see the inline
        notes.  Confirm the intended behavior before relying on this method.
        NOTE(review): for an empty ``sent`` with a non-empty ``tag_set``,
        ``pi`` is ``[]`` and the final scan indexes ``pi[-1]``, which raises
        IndexError.
        """
        tagged = [[self._normalize(word), None] for word in sent]

        nword = len(sent)
        ntag = len(self.tag_set)
        # pi[i][j][k] = [score, tag, tag index]; every score starts at 0.
        pi = [[[[0, None, None] for k in range(ntag)] for j in range(ntag)] for i in range(nword)]

        for i, (word, tag) in enumerate(tagged):
            # Neighbouring words, with sentinels outside the sentence.
            pword1 = 'START1_WORD' if i <= 0 else tagged[i-1][0]
            pword2 = 'START2_WORD' if i <= 1 else tagged[i-2][0]
            fword1 = 'END1_WORD' if i >= len(sent)-1 else tagged[i+1][0]
            fword2 = 'END2_WORD' if i >= len(sent)-2 else tagged[i+2][0]
            for j, u in enumerate(self.tag_set):
                # NOTE(review): _get_features pairs START1_TAG with the i-1
                # tag (i <= 0) and START2_TAG with the i-2 tag (i <= 1); here
                # the sentinel names and conditions are the other way round.
                ptag2 = 'START1_TAG' if i <= 0 else u
                for k, v in enumerate(self.tag_set):
                    ptag1 = 'START2_TAG' if i <= 1 else v
                    # The loop variable rebinds `tag` from the outer for-loop;
                    # `word` is unaffected.
                    for t, tag in enumerate(self.tag_set):
                        score = 0 if i <= 0 else pi[i-1][t][j][0]
                        score += self.perceptron.get_score(self._make_features((word, tag), (pword1, ptag1), (pword2, ptag2), fword1, fword2), tag)
                        # Cells start at 0, so only strictly positive scores
                        # are ever recorded; otherwise tag and index stay None.
                        if score > pi[i][j][k][0]:
                            pi[i][j][k][0] = score
                            pi[i][j][k][1] = tag
                            pi[i][j][k][2] = t
        i = len(tagged)-1
        t, j = None, None
        # NOTE(review): no score comparison here -- every (j, k) cell
        # overwrites the previous one, so the last cell enumerated decides the
        # final tag and the starting values of t and j.
        for j, u in enumerate(self.tag_set):
            for k, v in enumerate(self.tag_set):
                tag, t = pi[i][j][k][1:3]
                tagged[i][1] = tag
        i -= 1
        # Walk back to the first word.  If a visited cell was never updated,
        # t is None and the next pi[i][t] lookup raises TypeError.
        while i >= 0:
            tagged[i][1] = pi[i][t][j][1]
            j = t
            # NOTE(review): j was just set to t, so this reads pi[i][t][t].
            t = pi[i][t][j][2]
            i -= 1
        # printc is not defined in the visible part of this file.
        printc(tagged)

        return tagged

    def evaluate(self, tagged_sents, log=False):
        ntotal = 0
        ncorrect = 0
        faults = []
        likely = {}
        faults_count = defaultdict(int)
        file = open('test.pos', 'w')

        for tagged_sent in tagged_sents:
            sent = [word for (word, tag) in tagged_sent]
            tagged = self.tag(sent)
            for word, tag in tagged:
                # print word, tag,
                if tag == 'NR' and not word in likely:
                    likely[word] = 'NR'

        for tagged_sent in tagged_sents:
            sent = [word for (word, tag) in tagged_sent]
            tagged = self.tag(sent)
            # for idx, (word, tag) in enumerate(tagged):
            #     if word in likely:
            #         tagged[idx][1] = 'NR'
            has_false = False

            for (word, tag) in tagged:
                word = word[1:] if word[0] == '*' else word
                file.write('%s\t%s\n' % (word, tag))
                
            file.write('\n')

            if log:
                for idx, (word, tag) in enumerate(tagged_sent):
                    ntotal += 1
                    if tag == tagged[idx][1] or (tag[0] == 'N' and tagged[idx][1][0] == 'N') or (tag == 'DEC' and tagged[idx][1] == 'DEG') or \
                        (tag == 'DEG' and tagged[idx][1] == 'DEC') or ((tag[0] == 'V' and tagged[idx][1][0] == 'V')):
                        ncorrect += 1
                    else:
                        has_false = True
            if log:
                if has_false:
                    record = []
                    for idx, (word, tag) in enumerate(tagged_sent):
                        if tag == tagged[idx][1] or (tag[0] == 'N' and tagged[idx][1][0] == 'N') or (tag == 'DEC' and tagged[idx][1] == 'DEG') or \
                        (tag == 'DEG' and tagged[idx][1] == 'DEC') or (tag[0] == 'V' and tagged[idx][1][0] == 'V'):
                            record.append((word, tag, tagged[idx][1]))
                        else:
                            record.append((word, tag, '【' + tagged[idx][1] + '】'))
                            faults_count[tag + ' is tagged as ' + tagged[idx][1]] += 1
                    faults.append(record)

        if log:
            print 'precision:', ncorrect / ntotal * 100, '%'
        file.close()
        if log:
            sorted_fault_count = sorted(faults_count.items(), key=lambda item: item[1], reverse=True)
            for key, value in sorted_fault_count:
                print key, value
        return faults