Example #1
    def __init__(self):
        self.corpus = list()  # save the corpus for training
        self.tag = list()  # the tag of corpus
        self.corpus_num = 0
        self.state = ['B', 'M', 'E', 'S']
        self.perceptron = MP()
        self.dict = Dict()
        self.init_prb = {'B': 0, 'M': 0, 'E': 0, 'S': 0}
        self.trans_prb = {
            'B': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            },
            'M': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            },
            'E': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            },
            'S': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            }
        }
        self.dimension = 0
        self.unigram_feat_num = 0
        self.unigram_feat_id = {}
        self.bigram_feat_num = 0
        self.bigram_feat_id = {}
        self.dict_feat_num = 0
        self.dict_feat_id = {}
        self.type_feat_num = 5**5
        self.path = r'./'

        self.nums = []
        self.dates = [u"年", u"月", u"日"]
        self.names = []
        inputs1 = codecs.open(r'./resources/Chinese_num.txt', 'r')
        for line in inputs1.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.nums.append(w)
        inputs1.close()
        inputs2 = codecs.open(r'./resources/names.txt', 'r')
        for line in inputs2.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.names.append(w)
        inputs2.close()
Example #2
    def __init__(self):
        self.corpus = list()  # save the corpus for training
        self.tag = list()     # the tag of corpus
        self.corpus_num = 0
        self.state = ['B', 'M', 'E', 'S']
        self.perceptron = MP()
        self.dict = Dict()
        self.init_prb = {'B': 0, 'M': 0, 'E': 0, 'S': 0}
        self.trans_prb = {
            'B': {'B': 0, 'M': 0, 'E': 0, 'S': 0},
            'M': {'B': 0, 'M': 0, 'E': 0, 'S': 0},
            'E': {'B': 0, 'M': 0, 'E': 0, 'S': 0},
            'S': {'B': 0, 'M': 0, 'E': 0, 'S': 0}
        }
        self.dimension = 0
        self.unigram_feat_num = 0
        self.unigram_feat_id = {}
        self.bigram_feat_num = 0
        self.bigram_feat_id = {}
        self.trigram_feat_num = 0
        self.trigram_feat_id = {}
        self.dict_feat_num = 0
        self.dict_feat_id = {}
        self.type_feat_num = 5**5
        self.path = r'./'

        self.nums = []
        self.dates = [u"年", u"月", u"日"]
        self.names = []
        inputs1 = codecs.open(r'./resources/Chinese_num.txt', 'r')
        for line in inputs1.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.nums.append(w)
        inputs1.close()
        inputs2 = codecs.open(r'./resources/names.txt', 'r')
        for line in inputs2.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.names.append(w)
        inputs2.close()
Example #3
    def __init__(self):
        self.corpus = list()  # save the corpus for training
        self.tag = list()  # the tag of corpus
        self.corpus_num = 0
        self.state = ["B", "M", "E", "S"]
        self.perceptron = MP()
        self.dict = Dict()
        self.init_prb = {"B": 0, "M": 0, "E": 0, "S": 0}
        self.trans_prb = {
            "B": {"B": 0, "M": 0, "E": 0, "S": 0},
            "M": {"B": 0, "M": 0, "E": 0, "S": 0},
            "E": {"B": 0, "M": 0, "E": 0, "S": 0},
            "S": {"B": 0, "M": 0, "E": 0, "S": 0},
        }
        self.dimension = 0
        self.unigram_feat_num = 0
        self.unigram_feat_id = {}
        self.bigram_feat_num = 0
        self.bigram_feat_id = {}
        self.dict_feat_num = 0
        self.dict_feat_id = {}
        self.type_feat_num = 5 ** 5
        self.path = r"./"

        self.nums = []
        self.dates = [u"年", u"月", u"日"]
        self.names = []
        inputs1 = codecs.open(r"./resources/Chinese_num.txt", "r")
        for line in inputs1.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.nums.append(w)
        inputs1.close()
        inputs2 = codecs.open(r"./resources/names.txt", "r")
        for line in inputs2.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.names.append(w)
        inputs2.close()
Example #4
class CWSPerceptron:
    def __init__(self):
        self.corpus = list()  # save the corpus for training
        self.tag = list()  # the tag of corpus
        self.corpus_num = 0
        self.state = ['B', 'M', 'E', 'S']
        self.perceptron = MP()
        self.dict = Dict()
        self.init_prb = {'B': 0, 'M': 0, 'E': 0, 'S': 0}
        self.trans_prb = {
            'B': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            },
            'M': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            },
            'E': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            },
            'S': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            }
        }
        self.dimension = 0
        self.unigram_feat_num = 0
        self.unigram_feat_id = {}
        self.bigram_feat_num = 0
        self.bigram_feat_id = {}
        self.trigram_feat_num = 0
        self.trigram_feat_id = {}
        self.dict_feat_num = 0
        self.dict_feat_id = {}
        self.type_feat_num = 5**5
        self.path = r'./'

        self.nums = []
        self.dates = [u"年", u"月", u"日"]
        self.names = []
        inputs1 = codecs.open(r'./resources/Chinese_num.txt', 'r')
        for line in inputs1.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.nums.append(w)
        inputs1.close()
        inputs2 = codecs.open(r'./resources/names.txt', 'r')
        for line in inputs2.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.names.append(w)
        inputs2.close()

    def setSavePath(self, path):
        self.path = path
        self.perceptron.setSavePath(path)

    def saveModel(self):
        print "Saving the unigram&bigram infomation......"
        output1 = open(self.path + r"bigram_feat_id.pkl", 'wb')
        dump(self.bigram_feat_id, output1, -1)
        output1.close()
        output2 = open(self.path + r"unigram_feat_id.pkl", 'wb')
        dump(self.unigram_feat_id, output2, -1)
        output2.close()
        output3 = open(self.path + r"trigram_feat_id.pkl", 'wb')
        dump(self.trigram_feat_id, output3, -1)
        output3.close()
        output4 = open(self.path + r"dict_feat_id.pkl", 'wb')
        dump(self.dict_feat_id, output4, -1)
        output4.close()

        # release the memory
        self.unigram_feat_id = []
        self.bigram_feat_id = []
        self.trigram_feat_id = []
        self.corpus = []
        self.tag = []
        print "Saving the inital prb & trans prb infomation....."
        output1 = open(self.path + r"init_prb.pkl", 'wb')
        dump(self.init_prb, output1, -1)
        output1.close()
        output2 = open(self.path + r"trans_prb.pkl", 'wb')
        dump(self.trans_prb, output2, -1)
        output2.close()
        print "Saving process done."

    def loadModel(self):
        print "Loading the unigram&bigram infomation......"
        inputs = open(self.path + r"bigram_feat_id.pkl", 'rb')
        self.bigram_feat_id = load(inputs)
        self.bigram_feat_num = len(self.bigram_feat_id)
        inputs.close()

        inputs1 = open(self.path + r"unigram_feat_id.pkl", 'rb')
        self.unigram_feat_id = load(inputs1)
        self.unigram_feat_num = len(self.unigram_feat_id)
        inputs1.close()

        inputs2 = open(self.path + r"dict_feat_id.pkl", 'rb')
        self.dict_feat_id = load(inputs2)
        self.dict_feat_num = len(self.dict_feat_id)
        inputs2.close()

        inputs3 = open(self.path + r"trigram_feat_id.pkl", 'rb')
        self.trigram_feat_id = load(inputs3)
        self.trigram_feat_num = len(self.trigram_feat_id)
        inputs3.close()
        # print "Loading process done."
        print "Loading the prb infomation......"
        inputs = open(self.path + r"init_prb.pkl", 'rb')
        self.init_prb = load(inputs)
        inputs.close()
        inputs1 = open(self.path + r"trans_prb.pkl", 'rb')
        self.trans_prb = load(inputs1)
        inputs1.close()
        print "Loading process done."
        self.dimension = self.unigram_feat_num * 5 + self.bigram_feat_num * 4\
            + self.trigram_feat_num + self.dict_feat_num * 4 + self.type_feat_num

    def loadDict(self, dictfile):
        self.dict.loadDict(dictfile)

    def saveDict(self, outfile):
        self.dict.saveDict(outfile)

    def readDict(self, dictfile):
        self.dict.readDict(dictfile)

    def appendDict(self, dictfile):
        self.dict.appendDict(dictfile)

    def segmentation(self, outfile):
        output = codecs.open(outfile, 'w', 'utf-8')
        start = time.clock()
        for i in range(self.corpus_num):
            taglist = self.ViterbiDecode(self.corpus[i])
            wordlist = self.tag2word(self.corpus[i], taglist)
            for j in range(len(wordlist)):
                output.write(wordlist[j])
                output.write(' ')
            output.write("\n")
        print "Decode:", time.clock() - start
        output.close()

    def train(self,
              trainfile,
              batch_num=100,
              max_iter=200,
              learn_rate=1.0,
              delta_thrd=0.001,
              is_average=True):
        # self.makelibsvmdata(r'train.data',max_corpus)
        print "Start training process."
        self.perceptron.loadFeatSize(self.dimension, len(self.state))
        self.perceptron.read_train_file(trainfile)
        self.perceptron.printinfo()
        self.perceptron.train_sgd(max_iter, learn_rate, delta_thrd, is_average)
        self.perceptron.saveModel()
        print "Training process done."
        print "Multi-class Perceptron Model had been saved."

    def printstr(self, wordlist):
        for item in wordlist:
            print item
        print " "

    def makeLibSvmData(self, output_file, corpus_num=-1):
        print "Making training data.",
        filecount = 1
        output_data = codecs.open(output_file, 'w')
        if corpus_num == -1:
            corpus_num = self.corpus_num
        for i in range(corpus_num):
            taglist = self.tag[i]
            features = self.GetFeature(self.corpus[i])
            vec = self.Feature2Vec(features)
            for j in range(len(taglist)):
                output_data.write(str(self.state.index(taglist[j])))
                output_data.write('\t')
                keyset = list(vec[j].keys())
                keyset = sorted(keyset)
                if len(keyset) < 1:
                    output_data.write('0:1')
                for key in keyset:
                    output_data.write(str(key))
                    output_data.write(':')
                    output_data.write(str(vec[j][key]))
                    output_data.write(' ')
                output_data.write("\n")
        output_data.close()
        print "\nMaking training data finished."
        return filecount

    def classifiy_score(self, featureVec):
        tmp = self.perceptron.scoreout(featureVec)
        ans = {}
        for key in tmp.keys():
            ans[self.state[int(key)]] = tmp[key]
        # return self.perceptron.scoreout(featureVec)
        return ans
        # return self.perceptron.probout(featureVec)

    def getEmitPrb(self, score):
        """
        Get emission probabilities using the softmax function
        """
        max_score = max(score.values())
        emit_prb = {}
        expsum = 0.
        for key in score.keys():
            emit_prb[key] = math.exp(score[key] - max_score)
            expsum += emit_prb[key]
        for key in score.keys():
            emit_prb[key] /= expsum
            emit_prb[key] = math.log(emit_prb[key])
        return emit_prb

    def ViterbiDecode(self, sentence):
        N = len(sentence)  # length of the sentence
        prb = 0.
        prb_max = 0.
        toward = list()
        back = list()

        # get the feature Vector of every single character
        features = self.GetFeature(sentence)
        vec = self.Feature2Vec(features)

        for i in range(N):
            toward.append({})
            back.append({})
            for j in self.state:
                toward[i][j] = float('-inf')
                back[i][j] = ' '

        # run viterbi
        score = self.classifiy_score(vec[0])
        emit_prb = self.getEmitPrb(score)
        # print emit_prb
        for s in self.state:
            toward[0][s] = self.init_prb[s] + emit_prb[s]
            back[0][s] = 'end'
        # forward pass
        for t in range(1, N):
            score = self.classifiy_score(vec[t])
            # print score
            emit_prb = self.getEmitPrb(score)
            for s in self.state:
                prb = float('-inf')
                prb_max = float('-inf')
                state_max = 'S'
                for i in self.state:
                    prb = toward[t - 1][i] + self.trans_prb[i][s] + emit_prb[s]
                    if prb > prb_max:
                        prb_max = prb
                        state_max = i
                toward[t][s] = prb_max
                back[t][s] = state_max
        # backtrack to recover the best tag sequence
        index = N - 1
        taglist = []
        prb_max = float('-inf')
        state_max = ''
        for s in self.state:
            prb = toward[N - 1][s]
            if prb > prb_max:
                prb_max = prb
                state_max = s
        taglist.append(state_max)
        while index >= 1:
            pre_state = back[index][taglist[0]]
            taglist.insert(0, pre_state)
            index -= 1
        if taglist[-1] == 'B':
            taglist[-1] = 'S'
        elif taglist[-1] == 'M':
            taglist[-1] = 'E'
        return taglist

    def GetFeature(self, sent):
        """
        get feature for every single character
        return a list of features
        """
        features = []

        for i in range(len(sent)):
            left2 = sent[i - 2] if i - 2 >= 0 else '#'
            left1 = sent[i - 1] if i - 1 >= 0 else '#'
            mid = sent[i]
            right1 = sent[i + 1] if i + 1 < len(sent) else '#'
            right2 = sent[i + 2] if i + 2 < len(sent) else '#'
            # print self.dict.dic.has_key(mid),
            if self.dict.dic.has_key(mid):
                MWL = str(self.dict.dic[mid][0])
                t0 = self.dict.dic[mid][1]
                # print MWL,t0
            else:
                MWL = '0'
                t0 = '#'
            # print MWL,t0
            featcode = 0
            chars = [left2, left1, mid, right1, right2]
            for i in range(len(chars)):
                if chars[i].encode('utf-8') in self.nums:
                    featcode += 0
                elif chars[i] in self.dates:
                    featcode += 5**i
                elif (u"a" <= chars[i]
                      and chars[i] <= u"z") or (u"A" <= chars[i]
                                                and chars[i] <= u"Z"):
                    featcode += 5**i * 2
                elif chars[i].encode('utf-8') in self.names:
                    featcode += 5**i * 3
                else:
                    featcode += 5**i * 4
            featcode += 1
            feat = [
                left2, left1, mid, right1, right2, left2 + left1, left1 + mid,
                mid + right1, right1 + right2, left1 + right1, MWL + t0,
                left1 + t0, mid + t0, right1 + t0, featcode
            ]
            features.append(feat)

        return features

    def Feature2Vec(self, feats):
        """
        get the feature vector for each character
        the parameter feats is a list of per-character feature lists
        """
        punctuation = [
            u'。', u',', u'?', u'!', u'、', u';', u':', u'「', '」', u'『', u'』',
            u'‘', u'’', u'“', u'”', u'(', u')', u'〔', u'〕', u'【', u'】', u'——',
            u'–', u'…', u'.', u'·', u'《', u'》', u'〈', u'〉'
        ]
        featVecs = []
        for feat in feats:
            featVec = {}
            # if feat[2] in punctuation:
            #     featVec[0] = 1
            for it in range(len(feat)):
                if it < 5:
                    if self.unigram_feat_id.has_key(feat[it]):
                        key = self.unigram_feat_id[
                            feat[it]] + self.unigram_feat_num * it
                        featVec[key] = 1
                elif it < 9:
                    if self.bigram_feat_id.has_key(feat[it]):
                        key = self.bigram_feat_id[feat[it]]
                        key += self.unigram_feat_num * 5 + \
                            self.bigram_feat_num * (it - 4)
                        featVec[key] = 1
                elif it < 10:
                    if self.trigram_feat_id.has_key(feat[it]):
                        key = self.trigram_feat_id[feat[it]]
                        key += self.unigram_feat_num * 5 + self.bigram_feat_num * 4
                        featVec[key] = 1
                elif it < 14:
                    if self.dict_feat_id.has_key(feat[it]):
                        key = self.dict_feat_id[feat[it]]
                        key += self.unigram_feat_num * 5 + self.bigram_feat_num * 4 + self.trigram_feat_num + self.dict_feat_num * (
                            it - 10)
                        featVec[key] = 1
                else:
                    key = feat[it]
                    key += self.unigram_feat_num * 5 + self.bigram_feat_num * 4 + self.trigram_feat_num + self.dict_feat_num * 4
                    featVec[key] = 1
            featVecs.append(featVec)

        return featVecs

    def getTag(self, wordlist):
        """get the tag for every char in the word"""
        taglist = []
        for word in wordlist:
            if len(word) == 1:
                taglist.append('S')
            else:
                taglist.append('B')
                for w in word[1:len(word) - 1]:
                    taglist.append('M')
                taglist.append('E')
        return taglist

    def tag2word(self, sentence, taglist):
        wordlist = []
        tmp = ''
        for i in range(len(taglist)):
            if taglist[i] == 'S':
                tmp = sentence[i]
                wordlist.append(tmp)
                tmp = ''
            elif taglist[i] == 'B':
                tmp += sentence[i]
            elif taglist[i] == 'M':
                tmp += sentence[i]
            else:
                tmp += sentence[i]
                wordlist.append(tmp)
                tmp = ''
        return wordlist

    def loadCorpus(self, corpus_file):
        print "Loading Corpus data",
        input_data = codecs.open(corpus_file, 'r', 'utf-8')
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == '':
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print '.',
            wordlist = rawText.split()
            taglist = self.getTag(wordlist)
            self.tag.append(taglist)  # add to y, i.e. the tag list
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
        print "\nLoading Corpus done."

    def pretreatment(self, train_file):
        print "The process of corpus Pretreatment",
        input_data = codecs.open(train_file, 'r', 'utf-8')
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == '':
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print '.',
            wordlist = rawText.split()
            taglist = self.getTag(wordlist)
            self.tag.append(taglist)  # add to y, i.e. the tag list
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
            self.init_prb[taglist[0]] += 1
            for t in range(1, len(taglist)):
                self.trans_prb[taglist[t - 1]][taglist[t]] += 1

            feats = self.GetFeature(sentence)
            # record the feats, allocate the id of feature
            for feat in feats:
                for it in range(len(feat)):
                    if it < 5:  # unigram feature
                        if not self.unigram_feat_id.has_key(feat[it]):
                            self.unigram_feat_num += 1
                            self.unigram_feat_id[
                                feat[it]] = self.unigram_feat_num
                    elif it < 9:  # bigram feature
                        if not self.bigram_feat_id.has_key(feat[it]):
                            self.bigram_feat_num += 1
                            self.bigram_feat_id[
                                feat[it]] = self.bigram_feat_num
                    elif it < 10:  # trigram feature
                        if not self.trigram_feat_id.has_key(feat[it]):
                            self.trigram_feat_num += 1
                            self.trigram_feat_id[
                                feat[it]] = self.trigram_feat_num
                    elif it < 14:  # dictionary information feature
                        if not self.dict_feat_id.has_key(feat[it]):
                            self.dict_feat_num += 1
                            self.dict_feat_id[feat[it]] = self.dict_feat_num

        # calculate the probability of tag
        initsum = sum(self.init_prb.values())
        for key in self.init_prb.keys():
            self.init_prb[key] = float(self.init_prb[key]) / initsum
        for x in self.trans_prb.keys():
            tmpsum = sum(self.trans_prb[x].values())
            for y in self.trans_prb[x].keys():
                self.trans_prb[x][y] = float(self.trans_prb[x][y]) / tmpsum
        self.dimension = self.unigram_feat_num * 5 + \
            self.bigram_feat_num * 4 + self.trigram_feat_num + self.dict_feat_num * 4 + self.type_feat_num
        # calc the log probability
        for s in self.state:
            if self.init_prb[s] != 0.:
                self.init_prb[s] = math.log(self.init_prb[s])
            else:
                self.init_prb[s] = float('-inf')
            for j in self.state:
                if self.trans_prb[s][j] != 0.:
                    self.trans_prb[s][j] = math.log(self.trans_prb[s][j])
                else:
                    self.trans_prb[s][j] = float('-inf')
        print "\nProcess of pretreatment finished."
Example #5
class CWSPerceptron:
    def __init__(self):
        self.corpus = list()  # save the corpus for training
        self.tag = list()  # the tag of corpus
        self.corpus_num = 0
        self.state = ["B", "M", "E", "S"]
        self.perceptron = MP()
        self.dict = Dict()
        self.init_prb = {"B": 0, "M": 0, "E": 0, "S": 0}
        self.trans_prb = {
            "B": {"B": 0, "M": 0, "E": 0, "S": 0},
            "M": {"B": 0, "M": 0, "E": 0, "S": 0},
            "E": {"B": 0, "M": 0, "E": 0, "S": 0},
            "S": {"B": 0, "M": 0, "E": 0, "S": 0},
        }
        self.dimension = 0
        self.unigram_feat_num = 0
        self.unigram_feat_id = {}
        self.bigram_feat_num = 0
        self.bigram_feat_id = {}
        self.dict_feat_num = 0
        self.dict_feat_id = {}
        self.type_feat_num = 5 ** 5
        self.path = r"./"

        self.nums = []
        self.dates = [u"年", u"月", u"日"]
        self.names = []
        inputs1 = codecs.open(r"./resources/Chinese_num.txt", "r")
        for line in inputs1.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.nums.append(w)
        inputs1.close()
        inputs2 = codecs.open(r"./resources/names.txt", "r")
        for line in inputs2.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.names.append(w)
        inputs2.close()

    def setSavePath(self, path):
        self.path = path
        self.perceptron.setSavePath(path)

    def saveModel(self):
        print "Saving the unigram&bigram infomation......"
        output1 = open(self.path + r"bigram_feat_id.pkl", "wb")
        dump(self.bigram_feat_id, output1, -1)
        output1.close()
        output2 = open(self.path + r"unigram_feat_id.pkl", "wb")
        dump(self.unigram_feat_id, output2, -1)
        output2.close()
        output3 = open(self.path + r"dict_feat_id.pkl", "wb")
        dump(self.dict_feat_id, output3, -1)
        output3.close()

        # release the memory
        self.unigram_feat_id = []
        self.bigram_feat_id = []
        self.corpus = []
        self.tag = []
        print "Saving the inital prb & trans prb infomation....."
        output1 = open(self.path + r"init_prb.pkl", "wb")
        dump(self.init_prb, output1, -1)
        output1.close()
        output2 = open(self.path + r"trans_prb.pkl", "wb")
        dump(self.trans_prb, output2, -1)
        output2.close()
        print "Saving process done."

    def loadModel(self):
        print "Loading the unigram&bigram infomation......"
        inputs = open(self.path + r"bigram_feat_id.pkl", "rb")
        self.bigram_feat_id = load(inputs)
        self.bigram_feat_num = len(self.bigram_feat_id)
        inputs.close()
        inputs1 = open(self.path + r"unigram_feat_id.pkl", "rb")
        self.unigram_feat_id = load(inputs1)
        self.unigram_feat_num = len(self.unigram_feat_id)
        inputs1.close()
        inputs2 = open(self.path + r"dict_feat_id.pkl", "rb")
        self.dict_feat_id = load(inputs2)
        self.dict_feat_num = len(self.dict_feat_id)
        inputs2.close()
        # print "Loading process done."
        print "Loading the prb infomation......"
        inputs = open(self.path + r"init_prb.pkl", "rb")
        self.init_prb = load(inputs)
        inputs.close()
        inputs1 = open(self.path + r"trans_prb.pkl", "rb")
        self.trans_prb = load(inputs1)
        inputs1.close()
        print "Loading process done."
        self.dimension = (
            self.unigram_feat_num * 5 + self.bigram_feat_num * 5 + self.dict_feat_num * 4 + self.type_feat_num
        )

    def loadDict(self, dictfile):
        self.dict.loadDict(dictfile)

    def saveDict(self, outfile):
        self.dict.saveDict(outfile)

    def readDict(self, dictfile):
        self.dict.readDict(dictfile)

    def appendDict(self, dictfile):
        self.dict.appendDict(dictfile)

    def evaluate(self, corpus=200):
        error_count = 0
        tagnums = sum([len(item) for item in self.tag[0:corpus]])
        for i in range(corpus):
            tag = self.ViterbiDecode(self.corpus[i])
            # print 'y:',self.tag[i]
            # print 'p:',tag
            for index in range(len(tag)):
                pre = tag[index]
                # print self.tag[j]
                real = self.tag[i][index]
                # print pre, real
                if pre != real:
                    error_count += 1
        return 1 - float(error_count) / tagnums

    def segmentation(self, outfile):
        output = codecs.open(outfile, "w", "utf-8")
        start = time.clock()
        for i in range(self.corpus_num):
            taglist = self.ViterbiDecode(self.corpus[i])
            wordlist = self.tag2word(self.corpus[i], taglist)
            for j in range(len(wordlist)):
                output.write(wordlist[j])
                output.write(" ")
            output.write("\n")
        print "Decode:", time.clock() - start
        output.close()

    def train(self, trainfile, batch_num=100, max_iter=200, learn_rate=1.0, delta_thrd=0.001, is_average=True):
        # self.makelibsvmdata(r'train.data',max_corpus)
        print "Start training process."
        self.perceptron.loadFeatSize(self.dimension, len(self.state))
        self.perceptron.read_train_file(trainfile)
        self.perceptron.printinfo()
        self.perceptron.train_sgd(max_iter, learn_rate, delta_thrd, is_average)
        self.perceptron.saveModel()
        print "Training process done."
        print "Multi-class Perceptron Model had been saved."

    def printstr(self, wordlist):
        for item in wordlist:
            print item
        print " "

    def makeLibSvmData(self, output_file, corpus_num=-1):
        print "Making training data.",
        filecount = 1
        output_data = codecs.open(output_file, "w")
        if corpus_num == -1:
            corpus_num = self.corpus_num
        for i in range(corpus_num):
            taglist = self.tag[i]
            features = self.GetFeature(self.corpus[i])
            vec = self.Feature2Vec(features)
            for j in range(len(taglist)):
                output_data.write(str(self.state.index(taglist[j])))
                output_data.write("\t")
                keyset = list(vec[j].keys())
                keyset = sorted(keyset)
                if len(keyset) < 1:
                    output_data.write("0:1")
                for key in keyset:
                    output_data.write(str(key))
                    output_data.write(":")
                    output_data.write(str(vec[j][key]))
                    output_data.write(" ")
                output_data.write("\n")
        output_data.close()
        print "\nMaking training data finished."
        return filecount

    def classifiy_score(self, featureVec):
        tmp = self.perceptron.scoreout(featureVec)
        ans = {}
        for key in tmp.keys():
            ans[self.state[int(key)]] = tmp[key]
        # return self.perceptron.scoreout(featureVec)
        return ans
        # return self.perceptron.probout(featureVec)

    def getEmitPrb(self, score):
        """
        Get emission probabilities using the softmax function
        """
        max_score = max(score.values())
        emit_prb = {}
        expsum = 0.0
        for key in score.keys():
            emit_prb[key] = math.exp(score[key] - max_score)
            expsum += emit_prb[key]
        for key in score.keys():
            emit_prb[key] /= expsum
            emit_prb[key] = math.log(emit_prb[key])
        return emit_prb

    def ViterbiDecode(self, sentence):
        N = len(sentence)  # length of the sentence
        prb = 0.0
        prb_max = 0.0
        toward = list()
        back = list()

        # get the feature Vector of every single character
        features = self.GetFeature(sentence)
        vec = self.Feature2Vec(features)

        for i in range(N):
            toward.append({})
            back.append({})
            for j in self.state:
                toward[i][j] = float("-inf")
                back[i][j] = " "

        # run viterbi
        score = self.classifiy_score(vec[0])
        emit_prb = self.getEmitPrb(score)
        # print emit_prb
        for s in self.state:
            toward[0][s] = self.init_prb[s] + emit_prb[s]
            back[0][s] = "end"
        # forward pass
        for t in range(1, N):
            score = self.classifiy_score(vec[t])
            # print score
            emit_prb = self.getEmitPrb(score)
            for s in self.state:
                prb = float("-inf")
                prb_max = float("-inf")
                state_max = "S"
                for i in self.state:
                    prb = toward[t - 1][i] + self.trans_prb[i][s] + emit_prb[s]
                    if prb > prb_max:
                        prb_max = prb
                        state_max = i
                toward[t][s] = prb_max
                back[t][s] = state_max
        # backtrack to recover the best tag sequence
        index = N - 1
        taglist = []
        prb_max = float("-inf")
        state_max = ""
        for s in self.state:
            prb = toward[N - 1][s]
            if prb > prb_max:
                prb_max = prb
                state_max = s
        taglist.append(state_max)
        while index >= 1:
            pre_state = back[index][taglist[0]]
            taglist.insert(0, pre_state)
            index -= 1
        if taglist[-1] == "B":
            taglist[-1] = "S"
        elif taglist[-1] == "M":
            taglist[-1] == "E"
        return taglist

    def GetFeature(self, sent):
        """
        get feature for every single character
        return a list of features
        """
        features = []

        for i in range(len(sent)):
            left2 = sent[i - 2] if i - 2 >= 0 else "#"
            left1 = sent[i - 1] if i - 1 >= 0 else "#"
            mid = sent[i]
            right1 = sent[i + 1] if i + 1 < len(sent) else "#"
            right2 = sent[i + 2] if i + 2 < len(sent) else "#"
            # print self.dict.dic.has_key(mid),
            if self.dict.dic.has_key(mid):
                MWL = str(self.dict.dic[mid][0])
                t0 = self.dict.dic[mid][1]
                # print MWL,t0
            else:
                MWL = "0"
                t0 = "#"
            # print MWL,t0
            featcode = 0
            chars = [left2, left1, mid, right1, right2]
            for i in range(len(chars)):
                if chars[i].encode("utf-8") in self.nums:
                    featcode += 0
                elif chars[i] in self.dates:
                    featcode += 5 ** i
                elif (u"a" <= chars[i] and chars[i] <= u"z") or (u"A" <= chars[i] and chars[i] <= u"Z"):
                    featcode += 5 ** i * 2
                elif chars[i].encode("utf-8") in self.names:
                    featcode += 5 ** i * 3
                else:
                    featcode += 5 ** i * 4
            featcode += 1
            feat = [
                left2,
                left1,
                mid,
                right1,
                right2,
                left2 + left1,
                left1 + mid,
                mid + right1,
                right1 + right2,
                left1 + right1,
                MWL + t0,
                left1 + t0,
                mid + t0,
                right1 + t0,
                featcode,
            ]
            features.append(feat)

        return features

    def Feature2Vec(self, feats):
        """
        get the feature vector for each character
        the parameter feats is a list of per-character feature lists
        """
        punctuation = [
            u"。",
            u",",
            u"?",
            u"!",
            u"、",
            u";",
            u":",
            u"「",
            "」",
            u"『",
            u"』",
            u"‘",
            u"’",
            u"“",
            u"”",
            u"(",
            u")",
            u"〔",
            u"〕",
            u"【",
            u"】",
            u"——",
            u"–",
            u"…",
            u".",
            u"·",
            u"《",
            u"》",
            u"〈",
            u"〉",
        ]
        featVecs = []
        for feat in feats:
            featVec = {}
            # if feat[2] in punctuation:
            #     featVec[0] = 1
            for it in range(len(feat)):
                if it < 5:
                    if self.unigram_feat_id.has_key(feat[it]):
                        key = self.unigram_feat_id[feat[it]] + self.unigram_feat_num * it
                        featVec[key] = 1
                elif it < 10:
                    if self.bigram_feat_id.has_key(feat[it]):
                        key = self.bigram_feat_id[feat[it]]
                        key += self.unigram_feat_num * 5 + self.bigram_feat_num * (it - 5)
                        featVec[key] = 1
                elif it < 14:
                    if self.dict_feat_id.has_key(feat[it]):
                        key = self.dict_feat_id[feat[it]]
                        key += self.unigram_feat_num * 5 + self.bigram_feat_num * 5 + self.dict_feat_num * (it - 10)
                        featVec[key] = 1
                else:
                    key = feat[it]
                    key += self.unigram_feat_num * 5 + self.bigram_feat_num * 5 + self.dict_feat_num * 4
                    featVec[key] = 1
            featVecs.append(featVec)

        return featVecs

    def getTag(self, wordlist):
        """get the tag for every char in the word"""
        taglist = []
        for word in wordlist:
            if len(word) == 1:
                taglist.append("S")
            else:
                taglist.append("B")
                for w in word[1 : len(word) - 1]:
                    taglist.append("M")
                taglist.append("E")
        return taglist

    def tag2word(self, sentence, taglist):
        wordlist = []
        tmp = ""
        for i in range(len(taglist)):
            if taglist[i] == "S":
                tmp = sentence[i]
                wordlist.append(tmp)
                tmp = ""
            elif taglist[i] == "B":
                tmp += sentence[i]
            elif taglist[i] == "M":
                tmp += sentence[i]
            else:
                tmp += sentence[i]
                wordlist.append(tmp)
                tmp = ""
        return wordlist

    def loadTestCorpus(self, corpus_file):
        print "Loading Test Corpus data",
        input_data = codecs.open(corpus_file, "r", "utf-8")
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == "":
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print ".",
            wordlist = rawText.split()
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
        print "\nLoading Test Corpus done."

    def loadCorpus(self, corpus_file):
        print "Loading Corpus data",
        input_data = codecs.open(corpus_file, "r", "utf-8")
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == "":
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print ".",
            wordlist = rawText.split()
            taglist = self.getTag(wordlist)
            self.tag.append(taglist)  # add to y, i.e. the tag list
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
        print "\nLoading Corpus done."

    def pretreatment(self, train_file):
        print "The process of corpus Pretreatment",
        input_data = codecs.open(train_file, "r", "utf-8")
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == "":
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print ".",
            wordlist = rawText.split()
            taglist = self.getTag(wordlist)
            self.tag.append(taglist)  # add to y, i.e. the tag list
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
            self.init_prb[taglist[0]] += 1
            for t in range(1, len(taglist)):
                self.trans_prb[taglist[t - 1]][taglist[t]] += 1

            feats = self.GetFeature(sentence)
            # record the feats, allocate the id of feature
            for feat in feats:
                for it in range(len(feat)):
                    if it < 5:  # unigram feature
                        if not self.unigram_feat_id.has_key(feat[it]):
                            self.unigram_feat_num += 1
                            self.unigram_feat_id[feat[it]] = self.unigram_feat_num
                    elif it < 10:  # bigram feature
                        if not self.bigram_feat_id.has_key(feat[it]):
                            self.bigram_feat_num += 1
                            self.bigram_feat_id[feat[it]] = self.bigram_feat_num
                    elif it < 14:  # dictionary information feature
                        if not self.dict_feat_id.has_key(feat[it]):
                            self.dict_feat_num += 1
                            self.dict_feat_id[feat[it]] = self.dict_feat_num

        # calculate the probability of tag
        initsum = sum(self.init_prb.values())
        for key in self.init_prb.keys():
            self.init_prb[key] = float(self.init_prb[key]) / initsum
        for x in self.trans_prb.keys():
            tmpsum = sum(self.trans_prb[x].values())
            for y in self.trans_prb[x].keys():
                self.trans_prb[x][y] = float(self.trans_prb[x][y]) / tmpsum
        self.dimension = (
            self.unigram_feat_num * 5 + self.bigram_feat_num * 5 + self.dict_feat_num * 4 + self.type_feat_num
        )
        # calc the log probability
        for s in self.state:
            if self.init_prb[s] != 0.0:
                self.init_prb[s] = math.log(self.init_prb[s])
            else:
                self.init_prb[s] = float("-inf")
            for j in self.state:
                if self.trans_prb[s][j] != 0.0:
                    self.trans_prb[s][j] = math.log(self.trans_prb[s][j])
                else:
                    self.trans_prb[s][j] = float("-inf")
        print "\nProcess of pretreatment finished."
Example #6
class CWSPerceptron:

    def __init__(self):
        self.corpus = list()  # save the corpus for training
        self.tag = list()     # the tag of corpus
        self.corpus_num = 0
        self.state = ['B', 'M', 'E', 'S']
        self.perceptron = MP()
        self.dict = Dict()
        self.init_prb = {'B': 0, 'M': 0, 'E': 0, 'S': 0}
        self.trans_prb = {
            'B': {'B': 0, 'M': 0, 'E': 0, 'S': 0},
            'M': {'B': 0, 'M': 0, 'E': 0, 'S': 0},
            'E': {'B': 0, 'M': 0, 'E': 0, 'S': 0},
            'S': {'B': 0, 'M': 0, 'E': 0, 'S': 0}
        }
        self.dimension = 0
        self.unigram_feat_num = 0
        self.unigram_feat_id = {}
        self.bigram_feat_num = 0
        self.bigram_feat_id = {}
        self.dict_feat_num = 0
        self.dict_feat_id = {}
        self.type_feat_num = 5**5
        self.path = r'./'

        self.nums = []
        self.dates = [u"年", u"月", u"日"]
        self.names = []
        inputs1 = codecs.open(r'Chinese_num.txt', 'r')
        for line in inputs1.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.nums.append(w)
        inputs1.close()
        inputs2 = codecs.open(r'names.txt', 'r')
        for line in inputs2.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.names.append(w)
        inputs2.close()

    def setSavePath(self, path):
        self.path = path
        self.perceptron.setSavePath(path)

    def saveModel(self):
        print "Saving the unigram&bigram infomation......"
        output1 = open(self.path + r"bigram_feat_id.pkl", 'wb')
        dump(self.bigram_feat_id, output1, -1)
        output1.close()
        output2 = open(self.path + r"unigram_feat_id.pkl", 'wb')
        dump(self.unigram_feat_id, output2, -1)
        output2.close()
        output3 = open(self.path + r"dict_feat_id.pkl", 'wb')
        dump(self.dict_feat_id, output3, -1)
        output3.close()

        # release the memory
        self.unigram_feat_id = []
        self.bigram_feat_id = []
        self.corpus = []
        self.tag = []
        print "Saving the inital prb & trans prb infomation....."
        output1 = open(self.path + r"init_prb.pkl", 'wb')
        dump(self.init_prb, output1, -1)
        output1.close()
        output2 = open(self.path + r"trans_prb.pkl", 'wb')
        dump(self.trans_prb, output2, -1)
        output2.close()
        print "Saving process done."

    def loadModel(self):
        print "Loading the unigram&bigram infomation......"
        inputs = open(self.path + r"bigram_feat_id.pkl", 'rb')
        self.bigram_feat_id = load(inputs)
        self.bigram_feat_num = len(self.bigram_feat_id)
        inputs.close()
        inputs1 = open(self.path + r"unigram_feat_id.pkl", 'rb')
        self.unigram_feat_id = load(inputs1)
        self.unigram_feat_num = len(self.unigram_feat_id)
        inputs1.close()
        inputs2 = open(self.path + r"dict_feat_id.pkl", 'rb')
        self.dict_feat_id = load(inputs2)
        self.dict_feat_num = len(self.dict_feat_id)
        inputs2.close()
        # print "Loading process done."
        print "Loading the prb infomation......"
        inputs = open(self.path + r"init_prb.pkl", 'rb')
        self.init_prb = load(inputs)
        inputs.close()
        inputs1 = open(self.path + r"trans_prb.pkl", 'rb')
        self.trans_prb = load(inputs1)
        inputs1.close()
        print "Loading process done."
        self.dimension = self.unigram_feat_num * 5 + \
            self.bigram_feat_num * 5 + self.dict_feat_num * 4 + self.type_feat_num

    def loadDict(self, dictfile):
        self.dict.loadDict(dictfile)

    def saveDict(self, outfile):
        self.dict.saveDict(outfile)

    def readDict(self, dictfile):
        self.dict.readDict(dictfile)

    def appendDict(self, dictfile):
        self.dict.appendDict(dictfile)

    def evaluate(self, corpus=200):
        error_count = 0
        tagnums = sum([len(item) for item in self.tag[0:corpus]])
        for i in range(corpus):
            tag = self.ViterbiDecode(self.corpus[i])
            # print 'y:',self.tag[i]
            # print 'p:',tag
            for index in range(len(tag)):
                pre = tag[index]
                # print self.tag[j]
                real = self.tag[i][index]
                # print pre, real
                if pre != real:
                    error_count += 1
        return 1 - float(error_count) / tagnums

    def segmentation(self, outfile):
        output = codecs.open(outfile, 'w', 'utf-8')
        start = time.clock()
        for i in range(self.corpus_num):
            taglist = self.ViterbiDecode(self.corpus[i])
            wordlist = self.tag2word(self.corpus[i], taglist)
            for j in range(len(wordlist)):
                output.write(wordlist[j])
                output.write(' ')
            output.write("\n")
        print "Decode:", time.clock() - start
        output.close()

    def train(self, trainfile, batch_num=100, max_iter=200, learn_rate=1.0,
              delta_thrd=0.001, is_average=True):
        # self.makelibsvmdata(r'train.data',max_corpus)
        print "Start training process."
        self.perceptron.loadFeatSize(self.dimension, len(self.state))
        self.perceptron.read_train_file(trainfile)
        self.perceptron.printinfo()
        self.perceptron.train_sgd(max_iter, learn_rate, delta_thrd, is_average)
        self.perceptron.saveModel()
        print "Training process done."
        print "Multi-class Perceptron Model had been saved."

    def printstr(self, wordlist):
        for item in wordlist:
            print item
        print " "

    def makeLibSvmData(self, output_file, corpus_num=-1):
        print "Making training data.",
        filecount = 1
        output_data = codecs.open(output_file, 'w')
        if corpus_num == -1:
            corpus_num = self.corpus_num
        for i in range(corpus_num):
            taglist = self.tag[i]
            features = self.GetFeature(self.corpus[i])
            vec = self.Feature2Vec(features)
            for j in range(len(taglist)):
                output_data.write(str(self.state.index(taglist[j])))
                output_data.write('\t')
                keyset = list(vec[j].keys())
                keyset = sorted(keyset)
                if len(keyset) < 1:
                    output_data.write('0:1')
                for key in keyset:
                    output_data.write(str(key))
                    output_data.write(':')
                    output_data.write(str(vec[j][key]))
                    output_data.write(' ')
                output_data.write("\n")
        output_data.close()
        print "\nMaking training data finished."
        return filecount

    def classifiy_score(self, featureVec):
        return self.perceptron.scoreout(featureVec)
        # return self.perceptron.probout(featureVec)

    def getEmitPrb(self, score):
        """
        Get emission probabilities using the softmax function
        """
        max_score = max(score.values())
        emit_prb = {}
        expsum = 0.
        for key in score.keys():
            emit_prb[key] = math.exp(score[key] - max_score)
            expsum += emit_prb[key]
        for key in score.keys():
            emit_prb[key] /= expsum
            emit_prb[key] = math.log(emit_prb[key])
        return emit_prb

    def ViterbiDecode(self, sentence):
        N = len(sentence)  # length of the sentence
        prb = 0.
        prb_max = 0.
        toward = list()
        back = list()

        # get the feature Vector of every single character
        features = self.GetFeature(sentence)
        vec = self.Feature2Vec(features)

        for i in range(N):
            toward.append({})
            back.append({})
            for j in self.state:
                toward[i][j] = float('-inf')
                back[i][j] = ' '

        # run viterbi
        score = self.classifiy_score(vec[0])
        emit_prb = self.getEmitPrb(score)
        # print emit_prb
        for s in self.state:
            toward[0][s] = self.init_prb[s] + emit_prb[s]
            back[0][s] = 'end'
        # forward pass
        for t in range(1, N):
            score = self.classifiy_score(vec[t])
            # print score
            emit_prb = self.getEmitPrb(score)
            for s in self.state:
                prb = float('-inf')
                prb_max = float('-inf')
                state_max = 'S'
                for i in self.state:
                    prb = toward[t - 1][i] + self.trans_prb[i][s] + emit_prb[s]
                    if prb > prb_max:
                        prb_max = prb
                        state_max = i
                toward[t][s] = prb_max
                back[t][s] = state_max
        # backtrack to recover the best tag sequence
        index = N - 1
        taglist = []
        prb_max = float('-inf')
        state_max = ''
        for s in self.state:
            prb = toward[N - 1][s]
            if prb > prb_max:
                prb_max = prb
                state_max = s
        taglist.append(state_max)
        while index >= 1:
            pre_state = back[index][taglist[0]]
            taglist.insert(0, pre_state)
            index -= 1
        if taglist[-1] == 'B':
            taglist[-1] = 'S'
        elif taglist[-1] == 'M':
            taglist[-1] = 'E'
        return taglist

    def GetFeature(self, sent):
        """
        get feature for every single character
        return a list of features
        """
        features = []

        for i in range(len(sent)):
            left2 = sent[i - 2] if i - 2 >= 0 else '#'
            left1 = sent[i - 1] if i - 1 >= 0 else '#'
            mid = sent[i]
            right1 = sent[i + 1] if i + 1 < len(sent) else '#'
            right2 = sent[i + 2] if i + 2 < len(sent) else '#'
            # print self.dict.dic.has_key(mid),
            if self.dict.dic.has_key(mid):
                MWL = str(self.dict.dic[mid][0])
                t0 = self.dict.dic[mid][1]
                # print MWL,t0
            else:
                MWL = '0'
                t0 = '#'
            # print MWL,t0
            featcode = 0
            chars = [left2, left1, mid, right1, right2]
            for i in range(len(chars)):
                if chars[i].encode('utf-8') in self.nums:
                    featcode += 0
                elif chars[i] in self.dates:
                    featcode += 5**i
                elif (u"a" <= chars[i] and chars[i] <= u"z") or (u"A" <= chars[i] and chars[i] <= u"Z"):
                    featcode += 5**i * 2
                elif chars[i].encode('utf-8') in self.names:
                    featcode += 5**i * 3
                else:
                    featcode += 5**i * 4
            featcode += 1
            feat = [left2, left1, mid, right1, right2, left2 + left1, left1 + mid, mid + right1,
                    right1 + right2, left1 + right1, MWL + t0, left1 + t0, mid + t0, right1 + t0, featcode]
            features.append(feat)

        return features

    def Feature2Vec(self, feats):
        """
        get the feature vector for each character
        the parameter feats is a list of per-character feature lists
        """
        punctuation = [u'。', u',', u'?', u'!', u'、', u';', u':', u'「', '」',
                       u'『', u'』', u'‘', u'’', u'“', u'”', u'(', u')', u'〔',
                       u'〕', u'【', u'】', u'——', u'–', u'…', u'.', u'·', u'《',
                       u'》', u'〈', u'〉']
        featVecs = []
        for feat in feats:
            featVec = {}
            # if feat[2] in punctuation:
            #     featVec[0] = 1
            for it in range(len(feat)):
                if it < 5:
                    if self.unigram_feat_id.has_key(feat[it]):
                        key = self.unigram_feat_id[feat[it]]+self.unigram_feat_num*it
                        featVec[key] = 1
                elif it < 10:
                    if self.bigram_feat_id.has_key(feat[it]):
                        key = self.bigram_feat_id[feat[it]]
                        key += self.unigram_feat_num*5 + self.bigram_feat_num*(it-5)
                        featVec[key] = 1
                elif it < 14:
                    if self.dict_feat_id.has_key(feat[it]):
                        key = self.dict_feat_id[feat[it]]
                        key += self.unigram_feat_num*5 + self.bigram_feat_num*5 + self.dict_feat_num*(it-10)
                        featVec[key] = 1
                else:
                    key = feat[it]
                    key += self.unigram_feat_num*5 + self.bigram_feat_num*5 + self.dict_feat_num*4
                    featVec[key] = 1
            featVecs.append(featVec)

        return featVecs

    def getTag(self, wordlist):
        """get the tag for every char in the word"""
        taglist = []
        for word in wordlist:
            if len(word) == 1:
                taglist.append('S')
            else:
                taglist.append('B')
                for w in word[1:len(word) - 1]:
                    taglist.append('M')
                taglist.append('E')
        return taglist

    def tag2word(self, sentence, taglist):
        wordlist = []
        tmp = ''
        for i in range(len(taglist)):
            if taglist[i] == 'S':
                tmp = sentence[i]
                wordlist.append(tmp)
                tmp = ''
            elif taglist[i] == 'B':
                tmp += sentence[i]
            elif taglist[i] == 'M':
                tmp += sentence[i]
            else:
                tmp += sentence[i]
                wordlist.append(tmp)
                tmp = ''
        return wordlist

    def loadTestCorpus(self, corpus_file):
        print "Loading Test Corpus data",
        input_data = codecs.open(corpus_file, 'r', 'utf-8')
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == '':
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print '.',
            wordlist = rawText.split()
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
        print "\nLoading Test Corpus done."

    def loadCorpus(self, corpus_file):
        print "Loading Corpus data",
        input_data = codecs.open(corpus_file, 'r', 'utf-8')
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == '':
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print '.',
            wordlist = rawText.split()
            taglist = self.getTag(wordlist)
            self.tag.append(taglist)  # add to y, i.e. the tag list
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
        print "\nLoading Corpus done."

    def pretreatment(self, train_file):
        print "The process of corpus Pretreatment",
        input_data = codecs.open(train_file, 'r', 'utf-8')
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == '':
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print '.',
            wordlist = rawText.split()
            taglist = self.getTag(wordlist)
            self.tag.append(taglist)  # add to y, i.e. the tag list
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
            self.init_prb[taglist[0]] += 1
            for t in range(1, len(taglist)):
                self.trans_prb[taglist[t - 1]][taglist[t]] += 1

            feats = self.GetFeature(sentence)
            # record the feats, allocate the id of feature
            for feat in feats:
                for it in range(len(feat)):
                    if it < 5:  # unigram feature
                        if not self.unigram_feat_id.has_key(feat[it]):
                            self.unigram_feat_num += 1
                            self.unigram_feat_id[
                                feat[it]] = self.unigram_feat_num
                    elif it < 10:  # bigram feature
                        if not self.bigram_feat_id.has_key(feat[it]):
                            self.bigram_feat_num += 1
                            self.bigram_feat_id[
                                feat[it]] = self.bigram_feat_num
                    elif it < 14:  # dictionary information feature
                        if not self.dict_feat_id.has_key(feat[it]):
                            self.dict_feat_num += 1
                            self.dict_feat_id[feat[it]] = self.dict_feat_num

        # calculate the probability of tag
        initsum = sum(self.init_prb.values())
        for key in self.init_prb.keys():
            self.init_prb[key] = float(self.init_prb[key]) / initsum
        for x in self.trans_prb.keys():
            tmpsum = sum(self.trans_prb[x].values())
            for y in self.trans_prb[x].keys():
                self.trans_prb[x][y] = float(self.trans_prb[x][y]) / tmpsum
        self.dimension = self.unigram_feat_num * 5 + \
            self.bigram_feat_num * 5 + self.dict_feat_num * 4 + self.type_feat_num
        # calc the log probability
        for s in self.state:
            if self.init_prb[s] != 0.:
                self.init_prb[s] = math.log(self.init_prb[s])
            else:
                self.init_prb[s] = float('-inf')
            for j in self.state:
                if self.trans_prb[s][j] != 0.:
                    self.trans_prb[s][j] = math.log(self.trans_prb[s][j])
                else:
                    self.trans_prb[s][j] = float('-inf')
        print "\nProcess of pretreatment finished."
Example #7
def main():
    iris = datasets.load_iris()
    irisData = iris.data[:, [2, 3]]
    irisClass = iris.target
    dataTrainingSet, dataTestSet, classTrainingSet, classTestSet = train_test_split(
        irisData, irisClass, test_size=0.3, random_state=1, stratify=irisClass)
    #     =============== Perceptron ====================
    # Perceptron 1
    classTrainingSubset1 = np.copy(classTrainingSet)
    classTrainingSubset1 = classTrainingSubset1[(classTrainingSubset1 != 2)]
    dataTrainingSubset1 = np.copy(dataTrainingSet)
    dataTrainingSubset1 = dataTrainingSubset1[(classTrainingSet != 2)]

    classTrainingSubset1[(classTrainingSubset1 != 0)] = -1
    classTrainingSubset1[(classTrainingSubset1 != -1)] = 1
    perceptron1 = Perceptron(learningRate=0.1, iterationsToStop=10)
    perceptron1.learn(dataTrainingSubset1, classTrainingSubset1)

    # Perceptron 2
    classTrainingSubset2 = np.copy(classTrainingSet)
    classTrainingSubset2 = classTrainingSubset2[(classTrainingSubset2 != 1)]
    dataTrainingSubset2 = np.copy(dataTrainingSet)
    dataTrainingSubset2 = dataTrainingSubset2[(classTrainingSet != 1)]

    classTrainingSubset2[(classTrainingSubset2 != 2)] = -1
    classTrainingSubset2[(classTrainingSubset2 != -1)] = 1

    perceptron2 = Perceptron(learningRate=0.1, iterationsToStop=10)
    perceptron2.learn(dataTrainingSubset2, classTrainingSubset2)

    # Perceptron 3
    classTrainingSubset3 = np.copy(classTrainingSet)
    classTrainingSubset3 = classTrainingSubset3[(classTrainingSubset3 != 0)]
    dataTrainingSubset3 = np.copy(dataTrainingSet)
    dataTrainingSubset3 = dataTrainingSubset3[(classTrainingSet != 0)]

    classTrainingSubset3[(classTrainingSubset3 != 1)] = -1

    perceptron3 = Perceptron(learningRate=0.35, iterationsToStop=850)
    perceptron3.learn(dataTrainingSubset3, classTrainingSubset3)

    multiPerceptron = MultiPerceptron(perceptron1, perceptron2, perceptron3)

    plot_decision_regions(X=dataTestSet,
                          y=classTestSet,
                          classifier=multiPerceptron)
    plt.xlabel(r'$x_1$')
    plt.ylabel(r'$x_2$')
    plt.title('Perceptron')
    plt.legend(loc='upper left')
    plt.show()

    #     =============== Logistic regression ====================

    classTrainingSubset1[(classTrainingSubset1 != 1)] = 0
    logisticRegression1 = LogisticRegression(learningRate=0.05,
                                             iterationsToStop=1000,
                                             random_state=1)
    logisticRegression1.learn(dataTrainingSubset1, classTrainingSubset1)
    logisticRegression1.printProbability(dataTrainingSubset1)

    classTrainingSubset2[(classTrainingSubset2 != 1)] = 0
    logisticRegression2 = LogisticRegression(learningRate=0.05,
                                             iterationsToStop=1000,
                                             random_state=1)
    logisticRegression2.learn(dataTrainingSubset2, classTrainingSubset2)
    logisticRegression2.printProbability(dataTrainingSubset2)

    classTrainingSubset3[(classTrainingSubset3 != 1)] = 0
    logisticRegression3 = LogisticRegression(learningRate=0.15,
                                             iterationsToStop=1500,
                                             random_state=1)
    logisticRegression3.learn(dataTrainingSubset3, classTrainingSubset3)
    logisticRegression3.printProbability(dataTrainingSubset3)

    multiLogisticRegression = MultiLogisticRegression(logisticRegression1,
                                                      logisticRegression2,
                                                      logisticRegression3)

    plot_decision_regions(X=dataTestSet,
                          y=classTestSet,
                          classifier=multiLogisticRegression)
    plt.xlabel(r'$x_1$')
    plt.ylabel(r'$x_2$')
    plt.title('Logistic regression')
    plt.legend(loc='lower right')
    plt.show()
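
Example #7 assumes a preamble that the snippet does not show. A hedged reconstruction is sketched below: the NumPy, matplotlib, and scikit-learn imports follow from the calls in main(), while Perceptron, LogisticRegression, MultiPerceptron, MultiLogisticRegression, and plot_decision_regions are project-specific names whose modules are unknown, so they are only indicated as comments.

# Assumed preamble and entry point for Example #7 (a sketch, not from the original source).
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split  # sklearn.cross_validation on older versions
# The following are defined elsewhere in that project; module names are hypothetical:
# from perceptron import Perceptron, MultiPerceptron
# from logistic_regression import LogisticRegression, MultiLogisticRegression
# from plotting import plot_decision_regions

if __name__ == '__main__':
    main()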