Example #1
    def __init__(self):
        self.corpus = list()  # save the corpus for training
        self.tag = list()  # the tag of corpus
        self.corpus_num = 0
        self.state = ['B', 'M', 'E', 'S']
        self.perceptron = MP()
        self.dict = Dict()
        self.init_prb = {'B': 0, 'M': 0, 'E': 0, 'S': 0}
        self.trans_prb = {
            'B': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            },
            'M': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            },
            'E': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            },
            'S': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            }
        }
        self.dimension = 0
        self.unigram_feat_num = 0
        self.unigram_feat_id = {}
        self.bigram_feat_num = 0
        self.bigram_feat_id = {}
        self.dict_feat_num = 0
        self.dict_feat_id = {}
        self.type_feat_num = 5**5
        self.path = r'./'

        self.nums = []
        self.dates = [u"年", u"月", u"日"]
        self.names = []
        inputs1 = codecs.open(r'./resources/Chinese_num.txt', 'r')
        for line in inputs1.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.nums.append(w)
        inputs1.close()
        inputs2 = codecs.open(r'./resources/names.txt', 'r')
        for line in inputs2.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.names.append(w)
        inputs2.close()
Example #2
    def __init__(self):
        self.corpus = list()  # save the corpus for training
        self.tag = list()     # the tag of corpus
        self.corpus_num = 0
        self.state = ['B', 'M', 'E', 'S']
        self.perceptron = MP()
        self.dict = Dict()
        self.init_prb = {'B': 0, 'M': 0, 'E': 0, 'S': 0}
        self.trans_prb = {
            'B': {'B': 0, 'M': 0, 'E': 0, 'S': 0},
            'M': {'B': 0, 'M': 0, 'E': 0, 'S': 0},
            'E': {'B': 0, 'M': 0, 'E': 0, 'S': 0},
            'S': {'B': 0, 'M': 0, 'E': 0, 'S': 0}
        }
        self.dimension = 0
        self.unigram_feat_num = 0
        self.unigram_feat_id = {}
        self.bigram_feat_num = 0
        self.bigram_feat_id = {}
        self.trigram_feat_num = 0
        self.trigram_feat_id = {}
        self.dict_feat_num = 0
        self.dict_feat_id = {}
        self.type_feat_num = 5**5
        self.path = r'./'

        self.nums = []
        self.dates = [u"年", u"月", u"日"]
        self.names = []
        inputs1 = codecs.open(r'./resources/Chinese_num.txt', 'r')
        for line in inputs1.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.nums.append(w)
        inputs1.close()
        inputs2 = codecs.open(r'./resources/names.txt', 'r')
        for line in inputs2.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.names.append(w)
        inputs2.close()
Example #3
    def __init__(self):
        self.corpus = list()  # save the corpus for training
        self.tag = list()  # the tag of corpus
        self.corpus_num = 0
        self.state = ["B", "M", "E", "S"]
        self.perceptron = MP()
        self.dict = Dict()
        self.init_prb = {"B": 0, "M": 0, "E": 0, "S": 0}
        self.trans_prb = {
            "B": {"B": 0, "M": 0, "E": 0, "S": 0},
            "M": {"B": 0, "M": 0, "E": 0, "S": 0},
            "E": {"B": 0, "M": 0, "E": 0, "S": 0},
            "S": {"B": 0, "M": 0, "E": 0, "S": 0},
        }
        self.dimension = 0
        self.unigram_feat_num = 0
        self.unigram_feat_id = {}
        self.bigram_feat_num = 0
        self.bigram_feat_id = {}
        self.dict_feat_num = 0
        self.dict_feat_id = {}
        self.type_feat_num = 5 ** 5
        self.path = r"./"

        self.nums = []
        self.dates = [u"年", u"月", u"日"]
        self.names = []
        inputs1 = codecs.open(r"./resources/Chinese_num.txt", "r")
        for line in inputs1.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.nums.append(w)
        inputs1.close()
        inputs2 = codecs.open(r"./resources/names.txt", "r")
        for line in inputs2.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.names.append(w)
        inputs2.close()
Example #4
class CWSPerceptron:
    def __init__(self):
        self.corpus = list()  # save the corpus for training
        self.tag = list()  # the tag of corpus
        self.corpus_num = 0
        self.state = ['B', 'M', 'E', 'S']
        self.perceptron = MP()
        self.dict = Dict()
        self.init_prb = {'B': 0, 'M': 0, 'E': 0, 'S': 0}
        self.trans_prb = {
            'B': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            },
            'M': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            },
            'E': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            },
            'S': {
                'B': 0,
                'M': 0,
                'E': 0,
                'S': 0
            }
        }
        self.dimension = 0
        self.unigram_feat_num = 0
        self.unigram_feat_id = {}
        self.bigram_feat_num = 0
        self.bigram_feat_id = {}
        self.trigram_feat_num = 0
        self.trigram_feat_id = {}
        self.dict_feat_num = 0
        self.dict_feat_id = {}
        self.type_feat_num = 5**5
        self.path = r'./'

        self.nums = []
        self.dates = [u"年", u"月", u"日"]
        self.names = []
        inputs1 = codecs.open(r'./resources/Chinese_num.txt', 'r')
        for line in inputs1.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.nums.append(w)
        inputs1.close()
        inputs2 = codecs.open(r'./resources/names.txt', 'r')
        for line in inputs2.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.names.append(w)
        inputs2.close()

    def setSavePath(self, path):
        self.path = path
        self.perceptron.setSavePath(path)

    def saveModel(self):
        print "Saving the unigram&bigram infomation......"
        output1 = open(self.path + r"bigram_feat_id.pkl", 'wb')
        dump(self.bigram_feat_id, output1, -1)
        output1.close()
        output2 = open(self.path + r"unigram_feat_id.pkl", 'wb')
        dump(self.unigram_feat_id, output2, -1)
        output2.close()
        output3 = open(self.path + r"trigram_feat_id.pkl", 'wb')
        dump(self.trigram_feat_id, output3, -1)
        output3.close()
        output4 = open(self.path + r"dict_feat_id.pkl", 'wb')
        dump(self.dict_feat_id, output4, -1)
        output4.close()

        # release the memory
        self.unigram_feat_id = []
        self.bigram_feat_id = []
        self.trigram_feat_id = []
        self.corpus = []
        self.tag = []
        print "Saving the inital prb & trans prb infomation....."
        output1 = open(self.path + r"init_prb.pkl", 'wb')
        dump(self.init_prb, output1, -1)
        output1.close()
        output2 = open(self.path + r"trans_prb.pkl", 'wb')
        dump(self.trans_prb, output2, -1)
        output2.close()
        print "Saving process done."

    def loadModel(self):
        print "Loading the unigram&bigram infomation......"
        inputs = open(self.path + r"bigram_feat_id.pkl", 'rb')
        self.bigram_feat_id = load(inputs)
        self.bigram_feat_num = len(self.bigram_feat_id)
        inputs.close()

        inputs1 = open(self.path + r"unigram_feat_id.pkl", 'rb')
        self.unigram_feat_id = load(inputs1)
        self.unigram_feat_num = len(self.unigram_feat_id)
        inputs1.close()

        inputs2 = open(self.path + r"dict_feat_id.pkl", 'rb')
        self.dict_feat_id = load(inputs2)
        self.dict_feat_num = len(self.dict_feat_id)
        inputs2.close()

        inputs3 = open(self.path + r"trigram_feat_id.pkl", 'rb')
        self.trigram_feat_id = load(inputs3)
        self.trigram_feat_num = len(self.trigram_feat_id)
        inputs3.close()
        # print "Loading process done."
        print "Loading the prb infomation......"
        inputs = open(self.path + r"init_prb.pkl", 'rb')
        self.init_prb = load(inputs)
        inputs.close()
        inputs1 = open(self.path + r"trans_prb.pkl", 'rb')
        self.trans_prb = load(inputs1)
        inputs1.close()
        print "Loading process done."
        self.dimension = self.unigram_feat_num * 5 + self.bigram_feat_num * 4\
            + self.trigram_feat_num + self.dict_feat_num * 4 + self.type_feat_num

    def loadDict(self, dictfile):
        self.dict.loadDict(dictfile)

    def saveDict(self, outfile):
        self.dict.saveDict(outfile)

    def readDict(self, dictfile):
        self.dict.readDict(dictfile)

    def appendDict(self, dictfile):
        self.dict.appendDict(dictfile)

    def segmentation(self, outfile):
        output = codecs.open(outfile, 'w', 'utf-8')
        start = time.clock()
        for i in range(self.corpus_num):
            taglist = self.ViterbiDecode(self.corpus[i])
            wordlist = self.tag2word(self.corpus[i], taglist)
            for j in range(len(wordlist)):
                output.write(wordlist[j])
                output.write(' ')
            output.write("\n")
        print "Decode:", time.clock() - start
        output.close()

    def train(self,
              trainfile,
              batch_num=100,
              max_iter=200,
              learn_rate=1.0,
              delta_thrd=0.001,
              is_average=True):
        # self.makelibsvmdata(r'train.data',max_corpus)
        print "Start training process."
        self.perceptron.loadFeatSize(self.dimension, len(self.state))
        self.perceptron.read_train_file(trainfile)
        self.perceptron.printinfo()
        self.perceptron.train_sgd(max_iter, learn_rate, delta_thrd, is_average)
        self.perceptron.saveModel()
        print "Training process done."
        print "Multi-class Perceptron Model had been saved."

    def printstr(self, wordlist):
        for item in wordlist:
            print item
        print " "

    def makeLibSvmData(self, output_file, corpus_num=-1):
        print "Making training data.",
        filecount = 1
        output_data = codecs.open(output_file, 'w')
        if corpus_num == -1:
            corpus_num = self.corpus_num
        for i in range(corpus_num):
            taglist = self.tag[i]
            features = self.GetFeature(self.corpus[i])
            vec = self.Feature2Vec(features)
            for j in range(len(taglist)):
                output_data.write(str(self.state.index(taglist[j])))
                output_data.write('\t')
                keyset = list(vec[j].keys())
                keyset = sorted(keyset)
                if len(keyset) < 1:
                    output_data.write('0:1')
                for key in keyset:
                    output_data.write(str(key))
                    output_data.write(':')
                    output_data.write(str(vec[j][key]))
                    output_data.write(' ')
                output_data.write("\n")
        output_data.close()
        print "\nMaking training data finished."
        return filecount

    def classifiy_score(self, featureVec):
        tmp = self.perceptron.scoreout(featureVec)
        ans = {}
        for key in tmp.keys():
            ans[self.state[int(key)]] = tmp[key]
        # return self.perceptron.scoreout(featureVec)
        return ans
        # return self.perceptron.probout(featureVec)

    def getEmitPrb(self, score):
        """
        Get emission probabilities using the softmax function
        """
        max_score = max(score.values())
        emit_prb = {}
        expsum = 0.
        for key in score.keys():
            emit_prb[key] = math.exp(score[key] - max_score)
            expsum += emit_prb[key]
        for key in score.keys():
            emit_prb[key] /= expsum
            emit_prb[key] = math.log(emit_prb[key])
        return emit_prb

    def ViterbiDecode(self, sentence):
        N = len(sentence)  # length of the sentence
        prb = 0.
        prb_max = 0.
        toward = list()
        back = list()

        # get the feature Vector of every single character
        features = self.GetFeature(sentence)
        vec = self.Feature2Vec(features)

        for i in range(N):
            toward.append({})
            back.append({})
            for j in self.state:
                toward[i][j] = float('-inf')
                back[i][j] = ' '

        # run viterbi
        score = self.classifiy_score(vec[0])
        emit_prb = self.getEmitPrb(score)
        # print emit_prb
        for s in self.state:
            toward[0][s] = self.init_prb[s] + emit_prb[s]
            back[0][s] = 'end'
        # forward pass
        for t in range(1, N):
            score = self.classifiy_score(vec[t])
            # print score
            emit_prb = self.getEmitPrb(score)
            for s in self.state:
                prb = float('-inf')
                prb_max = float('-inf')
                state_max = 'S'
                for i in self.state:
                    prb = toward[t - 1][i] + self.trans_prb[i][s] + emit_prb[s]
                    if prb > prb_max:
                        prb_max = prb
                        state_max = i
                toward[t][s] = prb_max
                back[t][s] = state_max
        # backtrack to recover the best tag sequence
        index = N - 1
        taglist = []
        prb_max = float('-inf')
        state_max = ''
        for s in self.state:
            prb = toward[N - 1][s]
            if prb > prb_max:
                prb_max = prb
                state_max = s
        taglist.append(state_max)
        while index >= 1:
            pre_state = back[index][taglist[0]]
            taglist.insert(0, pre_state)
            index -= 1
        if taglist[-1] == 'B':
            taglist[-1] = 'S'
        elif taglist[-1] == 'M':
            taglist[-1] = 'E'
        return taglist

    def GetFeature(self, sent):
        """
        get feature for every single character
        return a list of features
        """
        features = []

        for i in range(len(sent)):
            left2 = sent[i - 2] if i - 2 >= 0 else '#'
            left1 = sent[i - 1] if i - 1 >= 0 else '#'
            mid = sent[i]
            right1 = sent[i + 1] if i + 1 < len(sent) else '#'
            right2 = sent[i + 2] if i + 2 < len(sent) else '#'
            # print self.dict.dic.has_key(mid),
            if self.dict.dic.has_key(mid):
                MWL = str(self.dict.dic[mid][0])
                t0 = self.dict.dic[mid][1]
                # print MWL,t0
            else:
                MWL = '0'
                t0 = '#'
            # print MWL,t0
            featcode = 0
            chars = [left2, left1, mid, right1, right2]
            for i in range(len(chars)):
                if chars[i].encode('utf-8') in self.nums:
                    featcode += 0
                elif chars[i] in self.dates:
                    featcode += 5**i
                elif (u"a" <= chars[i]
                      and chars[i] <= u"z") or (u"A" <= chars[i]
                                                and chars[i] <= u"Z"):
                    featcode += 5**i * 2
                elif chars[i].encode('utf-8') in self.names:
                    featcode += 5**i * 3
                else:
                    featcode += 5**i * 4
            featcode += 1
            feat = [
                left2, left1, mid, right1, right2, left2 + left1, left1 + mid,
                mid + right1, right1 + right2, left1 + right1, MWL + t0,
                left1 + t0, mid + t0, right1 + t0, featcode
            ]
            features.append(feat)

        return features

    def Feature2Vec(self, feats):
        """
        get the feature vector for each character
        the parameter feats is a list of per-character feature lists
        """
        punctuation = [
            u'。', u',', u'?', u'!', u'、', u';', u':', u'「', '」', u'『', u'』',
            u'‘', u'’', u'“', u'”', u'(', u')', u'〔', u'〕', u'【', u'】', u'——',
            u'–', u'…', u'.', u'·', u'《', u'》', u'〈', u'〉'
        ]
        featVecs = []
        for feat in feats:
            featVec = {}
            # if feat[2] in punctuation:
            #     featVec[0] = 1
            for it in range(len(feat)):
                if it < 5:
                    if self.unigram_feat_id.has_key(feat[it]):
                        key = self.unigram_feat_id[
                            feat[it]] + self.unigram_feat_num * it
                        featVec[key] = 1
                elif it < 9:
                    if self.bigram_feat_id.has_key(feat[it]):
                        key = self.bigram_feat_id[feat[it]]
                        key += self.unigram_feat_num * 5 + \
                            self.bigram_feat_num * (it - 4)
                        featVec[key] = 1
                elif it < 10:
                    if self.trigram_feat_id.has_key(feat[it]):
                        key = self.trigram_feat_id[feat[it]]
                        key += self.unigram_feat_num * 5 + self.bigram_feat_num * 4
                        featVec[key] = 1
                elif it < 14:
                    if self.dict_feat_id.has_key(feat[it]):
                        key = self.dict_feat_id[feat[it]]
                        key += self.unigram_feat_num * 5 + self.bigram_feat_num * 4 + self.trigram_feat_num + self.dict_feat_num * (
                            it - 10)
                        featVec[key] = 1
                else:
                    key = feat[it]
                    key += self.unigram_feat_num * 5 + self.bigram_feat_num * 4 + self.trigram_feat_num + self.dict_feat_num * 4
                    featVec[key] = 1
            featVecs.append(featVec)

        return featVecs

    def getTag(self, wordlist):
        """get the tag for every char in the word"""
        taglist = []
        for word in wordlist:
            if len(word) == 1:
                taglist.append('S')
            else:
                taglist.append('B')
                for w in word[1:len(word) - 1]:
                    taglist.append('M')
                taglist.append('E')
        return taglist

    def tag2word(self, sentence, taglist):
        wordlist = []
        tmp = ''
        for i in range(len(taglist)):
            if taglist[i] == 'S':
                tmp = sentence[i]
                wordlist.append(tmp)
                tmp = ''
            elif taglist[i] == 'B':
                tmp += sentence[i]
            elif taglist[i] == 'M':
                tmp += sentence[i]
            else:
                tmp += sentence[i]
                wordlist.append(tmp)
                tmp = ''
        return wordlist

    def loadCorpus(self, corpus_file):
        print "Loading Corpus data",
        input_data = codecs.open(corpus_file, 'r', 'utf-8')
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == '':
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print '.',
            wordlist = rawText.split()
            taglist = self.getTag(wordlist)
            self.tag.append(taglist)  # add to y, i.e. the tag list
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
        print "\nLoading Corpus done."

    def pretreatment(self, train_file):
        print "The process of corpus Pretreatment",
        input_data = codecs.open(train_file, 'r', 'utf-8')
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == '':
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print '.',
            wordlist = rawText.split()
            taglist = self.getTag(wordlist)
            self.tag.append(taglist)  # add to y, i.e. the tag list
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
            self.init_prb[taglist[0]] += 1
            for t in range(1, len(taglist)):
                self.trans_prb[taglist[t - 1]][taglist[t]] += 1

            feats = self.GetFeature(sentence)
            # record the feats, allocate the id of feature
            for feat in feats:
                for it in range(len(feat)):
                    if it < 5:  # unigram feature
                        if not self.unigram_feat_id.has_key(feat[it]):
                            self.unigram_feat_num += 1
                            self.unigram_feat_id[
                                feat[it]] = self.unigram_feat_num
                    elif it < 9:  # bigram feature
                        if not self.bigram_feat_id.has_key(feat[it]):
                            self.bigram_feat_num += 1
                            self.bigram_feat_id[
                                feat[it]] = self.bigram_feat_num
                    elif it < 10:  # trigram feature
                        if not self.trigram_feat_id.has_key(feat[it]):
                            self.trigram_feat_num += 1
                            self.trigram_feat_id[
                                feat[it]] = self.trigram_feat_num
                    elif it < 14:  # dictionary information feature
                        if not self.dict_feat_id.has_key(feat[it]):
                            self.dict_feat_num += 1
                            self.dict_feat_id[feat[it]] = self.dict_feat_num

        # calculate the probability of tag
        initsum = sum(self.init_prb.values())
        for key in self.init_prb.keys():
            self.init_prb[key] = float(self.init_prb[key]) / initsum
        for x in self.trans_prb.keys():
            tmpsum = sum(self.trans_prb[x].values())
            for y in self.trans_prb[x].keys():
                self.trans_prb[x][y] = float(self.trans_prb[x][y]) / tmpsum
        self.dimension = self.unigram_feat_num * 5 + \
            self.bigram_feat_num * 4 + self.trigram_feat_num + self.dict_feat_num * 4 + self.type_feat_num
        # calc the log probability
        for s in self.state:
            if self.init_prb[s] != 0.:
                self.init_prb[s] = math.log(self.init_prb[s])
            else:
                self.init_prb[s] = float('-inf')
            for j in self.state:
                if self.trans_prb[s][j] != 0.:
                    self.trans_prb[s][j] = math.log(self.trans_prb[s][j])
                else:
                    self.trans_prb[s][j] = float('-inf')
        print "\nProcess of pretreatment finished."
Example #5
class CWSPerceptron:
    def __init__(self):
        self.corpus = list()  # save the corpus for training
        self.tag = list()  # the tag of corpus
        self.corpus_num = 0
        self.state = ["B", "M", "E", "S"]
        self.perceptron = MP()
        self.dict = Dict()
        self.init_prb = {"B": 0, "M": 0, "E": 0, "S": 0}
        self.trans_prb = {
            "B": {"B": 0, "M": 0, "E": 0, "S": 0},
            "M": {"B": 0, "M": 0, "E": 0, "S": 0},
            "E": {"B": 0, "M": 0, "E": 0, "S": 0},
            "S": {"B": 0, "M": 0, "E": 0, "S": 0},
        }
        self.dimension = 0
        self.unigram_feat_num = 0
        self.unigram_feat_id = {}
        self.bigram_feat_num = 0
        self.bigram_feat_id = {}
        self.dict_feat_num = 0
        self.dict_feat_id = {}
        self.type_feat_num = 5 ** 5
        self.path = r"./"

        self.nums = []
        self.dates = [u"年", u"月", u"日"]
        self.names = []
        inputs1 = codecs.open(r"./resources/Chinese_num.txt", "r")
        for line in inputs1.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.nums.append(w)
        inputs1.close()
        inputs2 = codecs.open(r"./resources/names.txt", "r")
        for line in inputs2.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.names.append(w)
        inputs2.close()

    def setSavePath(self, path):
        self.path = path
        self.perceptron.setSavePath(path)

    def saveModel(self):
        print "Saving the unigram&bigram infomation......"
        output1 = open(self.path + r"bigram_feat_id.pkl", "wb")
        dump(self.bigram_feat_id, output1, -1)
        output1.close()
        output2 = open(self.path + r"unigram_feat_id.pkl", "wb")
        dump(self.unigram_feat_id, output2, -1)
        output2.close()
        output3 = open(self.path + r"dict_feat_id.pkl", "wb")
        dump(self.dict_feat_id, output3, -1)
        output3.close()

        # release the memory
        self.unigram_feat_id = []
        self.bigram_feat_id = []
        self.corpus = []
        self.tag = []
        print "Saving the inital prb & trans prb infomation....."
        output1 = open(self.path + r"init_prb.pkl", "wb")
        dump(self.init_prb, output1, -1)
        output1.close()
        output2 = open(self.path + r"trans_prb.pkl", "wb")
        dump(self.trans_prb, output2, -1)
        output2.close()
        print "Saving process done."

    def loadModel(self):
        print "Loading the unigram&bigram infomation......"
        inputs = open(self.path + r"bigram_feat_id.pkl", "rb")
        self.bigram_feat_id = load(inputs)
        self.bigram_feat_num = len(self.bigram_feat_id)
        inputs.close()
        inputs1 = open(self.path + r"unigram_feat_id.pkl", "rb")
        self.unigram_feat_id = load(inputs1)
        self.unigram_feat_num = len(self.unigram_feat_id)
        inputs1.close()
        inputs2 = open(self.path + r"dict_feat_id.pkl", "rb")
        self.dict_feat_id = load(inputs2)
        self.dict_feat_num = len(self.dict_feat_id)
        inputs2.close()
        # print "Loading process done."
        print "Loading the prb infomation......"
        inputs = open(self.path + r"init_prb.pkl", "rb")
        self.init_prb = load(inputs)
        inputs.close()
        inputs1 = open(self.path + r"trans_prb.pkl", "rb")
        self.trans_prb = load(inputs1)
        inputs1.close()
        print "Loading process done."
        self.dimension = (
            self.unigram_feat_num * 5 + self.bigram_feat_num * 5 + self.dict_feat_num * 4 + self.type_feat_num
        )

    def loadDict(self, dictfile):
        self.dict.loadDict(dictfile)

    def saveDict(self, outfile):
        self.dict.saveDict(outfile)

    def readDict(self, dictfile):
        self.dict.readDict(dictfile)

    def appendDict(self, dictfile):
        self.dict.appendDict(dictfile)

    def evaluate(self, corpus=200):
        error_count = 0
        tagnums = sum([len(item) for item in self.tag[0:corpus]])
        for i in range(corpus):
            tag = self.ViterbiDecode(self.corpus[i])
            # print 'y:',self.tag[i]
            # print 'p:',tag
            for index in range(len(tag)):
                pre = tag[index]
                # print self.tag[j]
                real = self.tag[i][index]
                # print pre, real
                if pre != real:
                    error_count += 1
        return 1 - float(error_count) / tagnums

    def segmentation(self, outfile):
        output = codecs.open(outfile, "w", "utf-8")
        start = time.clock()
        for i in range(self.corpus_num):
            taglist = self.ViterbiDecode(self.corpus[i])
            wordlist = self.tag2word(self.corpus[i], taglist)
            for j in range(len(wordlist)):
                output.write(wordlist[j])
                output.write(" ")
            output.write("\n")
        print "Decode:", time.clock() - start
        output.close()

    def train(self, trainfile, batch_num=100, max_iter=200, learn_rate=1.0, delta_thrd=0.001, is_average=True):
        # self.makelibsvmdata(r'train.data',max_corpus)
        print "Start training process."
        self.perceptron.loadFeatSize(self.dimension, len(self.state))
        self.perceptron.read_train_file(trainfile)
        self.perceptron.printinfo()
        self.perceptron.train_sgd(max_iter, learn_rate, delta_thrd, is_average)
        self.perceptron.saveModel()
        print "Training process done."
        print "Multi-class Perceptron Model had been saved."

    def printstr(self, wordlist):
        for item in wordlist:
            print item
        print " "

    def makeLibSvmData(self, output_file, corpus_num=-1):
        print "Making training data.",
        filecount = 1
        output_data = codecs.open(output_file, "w")
        if corpus_num == -1:
            corpus_num = self.corpus_num
        for i in range(corpus_num):
            taglist = self.tag[i]
            features = self.GetFeature(self.corpus[i])
            vec = self.Feature2Vec(features)
            for j in range(len(taglist)):
                output_data.write(str(self.state.index(taglist[j])))
                output_data.write("\t")
                keyset = list(vec[j].keys())
                keyset = sorted(keyset)
                if len(keyset) < 1:
                    output_data.write("0:1")
                for key in keyset:
                    output_data.write(str(key))
                    output_data.write(":")
                    output_data.write(str(vec[j][key]))
                    output_data.write(" ")
                output_data.write("\n")
        output_data.close()
        print "\nMaking training data finished."
        return filecount

    def classifiy_score(self, featureVec):
        tmp = self.perceptron.scoreout(featureVec)
        ans = {}
        for key in tmp.keys():
            ans[self.state[int(key)]] = tmp[key]
        # return self.perceptron.scoreout(featureVec)
        return ans
        # return self.perceptron.probout(featureVec)

    def getEmitPrb(self, score):
        """
        Get emission probabilities using the softmax function
        """
        max_score = max(score.values())
        emit_prb = {}
        expsum = 0.0
        for key in score.keys():
            emit_prb[key] = math.exp(score[key] - max_score)
            expsum += emit_prb[key]
        for key in score.keys():
            emit_prb[key] /= expsum
            emit_prb[key] = math.log(emit_prb[key])
        return emit_prb

    def ViterbiDecode(self, sentence):
        N = len(sentence)  # length of the sentence
        prb = 0.0
        prb_max = 0.0
        toward = list()
        back = list()

        # get the feature Vector of every single character
        features = self.GetFeature(sentence)
        vec = self.Feature2Vec(features)

        for i in range(N):
            toward.append({})
            back.append({})
            for j in self.state:
                toward[i][j] = float("-inf")
                back[i][j] = " "

        # run viterbi
        score = self.classifiy_score(vec[0])
        emit_prb = self.getEmitPrb(score)
        # print emit_prb
        for s in self.state:
            toward[0][s] = self.init_prb[s] + emit_prb[s]
            back[0][s] = "end"
        # forward pass
        for t in range(1, N):
            score = self.classifiy_score(vec[t])
            # print score
            emit_prb = self.getEmitPrb(score)
            for s in self.state:
                prb = float("-inf")
                prb_max = float("-inf")
                state_max = "S"
                for i in self.state:
                    prb = toward[t - 1][i] + self.trans_prb[i][s] + emit_prb[s]
                    if prb > prb_max:
                        prb_max = prb
                        state_max = i
                toward[t][s] = prb_max
                back[t][s] = state_max
        # backtrack to recover the best tag sequence
        index = N - 1
        taglist = []
        prb_max = float("-inf")
        state_max = ""
        for s in self.state:
            prb = toward[N - 1][s]
            if prb > prb_max:
                prb_max = prb
                state_max = s
        taglist.append(state_max)
        while index >= 1:
            pre_state = back[index][taglist[0]]
            taglist.insert(0, pre_state)
            index -= 1
        if taglist[-1] == "B":
            taglist[-1] = "S"
        elif taglist[-1] == "M":
            taglist[-1] == "E"
        return taglist

    def GetFeature(self, sent):
        """
        get feature for every single character
        return a list of features
        """
        features = []

        for i in range(len(sent)):
            left2 = sent[i - 2] if i - 2 >= 0 else "#"
            left1 = sent[i - 1] if i - 1 >= 0 else "#"
            mid = sent[i]
            right1 = sent[i + 1] if i + 1 < len(sent) else "#"
            right2 = sent[i + 2] if i + 2 < len(sent) else "#"
            # print self.dict.dic.has_key(mid),
            if self.dict.dic.has_key(mid):
                MWL = str(self.dict.dic[mid][0])
                t0 = self.dict.dic[mid][1]
                # print MWL,t0
            else:
                MWL = "0"
                t0 = "#"
            # print MWL,t0
            featcode = 0
            chars = [left2, left1, mid, right1, right2]
            for i in range(len(chars)):
                if chars[i].encode("utf-8") in self.nums:
                    featcode += 0
                elif chars[i] in self.dates:
                    featcode += 5 ** i
                elif (u"a" <= chars[i] and chars[i] <= u"z") or (u"A" <= chars[i] and chars[i] <= u"Z"):
                    featcode += 5 ** i * 2
                elif chars[i].encode("utf-8") in self.names:
                    featcode += 5 ** i * 3
                else:
                    featcode += 5 ** i * 4
            featcode += 1
            feat = [
                left2,
                left1,
                mid,
                right1,
                right2,
                left2 + left1,
                left1 + mid,
                mid + right1,
                right1 + right2,
                left1 + right1,
                MWL + t0,
                left1 + t0,
                mid + t0,
                right1 + t0,
                featcode,
            ]
            features.append(feat)

        return features

    def Feature2Vec(self, feats):
        """
        get the feature vector for each character
        the parameter feats is a list of per-character feature lists
        """
        punctuation = [
            u"。",
            u",",
            u"?",
            u"!",
            u"、",
            u";",
            u":",
            u"「",
            "」",
            u"『",
            u"』",
            u"‘",
            u"’",
            u"“",
            u"”",
            u"(",
            u")",
            u"〔",
            u"〕",
            u"【",
            u"】",
            u"——",
            u"–",
            u"…",
            u".",
            u"·",
            u"《",
            u"》",
            u"〈",
            u"〉",
        ]
        featVecs = []
        for feat in feats:
            featVec = {}
            # if feat[2] in punctuation:
            #     featVec[0] = 1
            for it in range(len(feat)):
                if it < 5:
                    if self.unigram_feat_id.has_key(feat[it]):
                        key = self.unigram_feat_id[feat[it]] + self.unigram_feat_num * it
                        featVec[key] = 1
                elif it < 10:
                    if self.bigram_feat_id.has_key(feat[it]):
                        key = self.bigram_feat_id[feat[it]]
                        key += self.unigram_feat_num * 5 + self.bigram_feat_num * (it - 5)
                        featVec[key] = 1
                elif it < 14:
                    if self.dict_feat_id.has_key(feat[it]):
                        key = self.dict_feat_id[feat[it]]
                        key += self.unigram_feat_num * 5 + self.bigram_feat_num * 5 + self.dict_feat_num * (it - 10)
                        featVec[key] = 1
                else:
                    key = feat[it]
                    key += self.unigram_feat_num * 5 + self.bigram_feat_num * 5 + self.dict_feat_num * 4
                    featVec[key] = 1
            featVecs.append(featVec)

        return featVecs

    def getTag(self, wordlist):
        """get the tag for every char in the word"""
        taglist = []
        for word in wordlist:
            if len(word) == 1:
                taglist.append("S")
            else:
                taglist.append("B")
                for w in word[1 : len(word) - 1]:
                    taglist.append("M")
                taglist.append("E")
        return taglist

    def tag2word(self, sentence, taglist):
        wordlist = []
        tmp = ""
        for i in range(len(taglist)):
            if taglist[i] == "S":
                tmp = sentence[i]
                wordlist.append(tmp)
                tmp = ""
            elif taglist[i] == "B":
                tmp += sentence[i]
            elif taglist[i] == "M":
                tmp += sentence[i]
            else:
                tmp += sentence[i]
                wordlist.append(tmp)
                tmp = ""
        return wordlist

    def loadTestCorpus(self, corpus_file):
        print "Loading Test Corpus data",
        input_data = codecs.open(corpus_file, "r", "utf-8")
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == "":
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print ".",
            wordlist = rawText.split()
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
        print "\nLoading Test Corpus done."

    def loadCorpus(self, corpus_file):
        print "Loading Corpus data",
        input_data = codecs.open(corpus_file, "r", "utf-8")
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == "":
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print ".",
            wordlist = rawText.split()
            taglist = self.getTag(wordlist)
            self.tag.append(taglist)  # add to y, i.e. the tag list
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
        print "\nLoading Corpus done."

    def pretreatment(self, train_file):
        print "The process of corpus Pretreatment",
        input_data = codecs.open(train_file, "r", "utf-8")
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == "":
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print ".",
            wordlist = rawText.split()
            taglist = self.getTag(wordlist)
            self.tag.append(taglist)  # add to y, i.e. the tag list
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
            self.init_prb[taglist[0]] += 1
            for t in range(1, len(taglist)):
                self.trans_prb[taglist[t - 1]][taglist[t]] += 1

            feats = self.GetFeature(sentence)
            # record the feats, allocate the id of feature
            for feat in feats:
                for it in range(len(feat)):
                    if it < 5:  # unigram feature
                        if not self.unigram_feat_id.has_key(feat[it]):
                            self.unigram_feat_num += 1
                            self.unigram_feat_id[feat[it]] = self.unigram_feat_num
                    elif it < 10:  # bigram feature
                        if not self.bigram_feat_id.has_key(feat[it]):
                            self.bigram_feat_num += 1
                            self.bigram_feat_id[feat[it]] = self.bigram_feat_num
                    elif it < 14:  # dictionary information feature
                        if not self.dict_feat_id.has_key(feat[it]):
                            self.dict_feat_num += 1
                            self.dict_feat_id[feat[it]] = self.dict_feat_num

        # calculate the probability of tag
        initsum = sum(self.init_prb.values())
        for key in self.init_prb.keys():
            self.init_prb[key] = float(self.init_prb[key]) / initsum
        for x in self.trans_prb.keys():
            tmpsum = sum(self.trans_prb[x].values())
            for y in self.trans_prb[x].keys():
                self.trans_prb[x][y] = float(self.trans_prb[x][y]) / tmpsum
        self.dimension = (
            self.unigram_feat_num * 5 + self.bigram_feat_num * 5 + self.dict_feat_num * 4 + self.type_feat_num
        )
        # calc the log probability
        for s in self.state:
            if self.init_prb[s] != 0.0:
                self.init_prb[s] = math.log(self.init_prb[s])
            else:
                self.init_prb[s] = float("-inf")
            for j in self.state:
                if self.trans_prb[s][j] != 0.0:
                    self.trans_prb[s][j] = math.log(self.trans_prb[s][j])
                else:
                    self.trans_prb[s][j] = float("-inf")
        print "\nProcess of pretreatment finished."
Example #6
class CWSPerceptron:

    def __init__(self):
        self.corpus = list()  # save the corpus for training
        self.tag = list()     # the tag of corpus
        self.corpus_num = 0
        self.state = ['B', 'M', 'E', 'S']
        self.perceptron = MP()
        self.dict = Dict()
        self.init_prb = {'B': 0, 'M': 0, 'E': 0, 'S': 0}
        self.trans_prb = {
            'B': {'B': 0, 'M': 0, 'E': 0, 'S': 0},
            'M': {'B': 0, 'M': 0, 'E': 0, 'S': 0},
            'E': {'B': 0, 'M': 0, 'E': 0, 'S': 0},
            'S': {'B': 0, 'M': 0, 'E': 0, 'S': 0}
        }
        self.dimension = 0
        self.unigram_feat_num = 0
        self.unigram_feat_id = {}
        self.bigram_feat_num = 0
        self.bigram_feat_id = {}
        self.dict_feat_num = 0
        self.dict_feat_id = {}
        self.type_feat_num = 5**5
        self.path = r'./'

        self.nums = []
        self.dates = [u"年", u"月", u"日"]
        self.names = []
        inputs1 = codecs.open(r'Chinese_num.txt', 'r')
        for line in inputs1.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.nums.append(w)
        inputs1.close()
        inputs2 = codecs.open(r'names.txt', 'r')
        for line in inputs2.readlines():
            rawText = line.strip().split()
            for w in rawText:
                self.names.append(w)
        inputs2.close()

    def setSavePath(self, path):
        self.path = path
        self.perceptron.setSavePath(path)

    def saveModel(self):
        print "Saving the unigram&bigram infomation......"
        output1 = open(self.path + r"bigram_feat_id.pkl", 'wb')
        dump(self.bigram_feat_id, output1, -1)
        output1.close()
        output2 = open(self.path + r"unigram_feat_id.pkl", 'wb')
        dump(self.unigram_feat_id, output2, -1)
        output2.close()
        output3 = open(self.path + r"dict_feat_id.pkl", 'wb')
        dump(self.dict_feat_id, output3, -1)
        output3.close()

        # release the memory
        self.unigram_feat_id = []
        self.bigram_feat_id = []
        self.corpus = []
        self.tag = []
        print "Saving the inital prb & trans prb infomation....."
        output1 = open(self.path + r"init_prb.pkl", 'wb')
        dump(self.init_prb, output1, -1)
        output1.close()
        output2 = open(self.path + r"trans_prb.pkl", 'wb')
        dump(self.trans_prb, output2, -1)
        output2.close()
        print "Saving process done."

    def loadModel(self):
        print "Loading the unigram&bigram infomation......"
        inputs = open(self.path + r"bigram_feat_id.pkl", 'rb')
        self.bigram_feat_id = load(inputs)
        self.bigram_feat_num = len(self.bigram_feat_id)
        inputs.close()
        inputs1 = open(self.path + r"unigram_feat_id.pkl", 'rb')
        self.unigram_feat_id = load(inputs1)
        self.unigram_feat_num = len(self.unigram_feat_id)
        inputs1.close()
        inputs2 = open(self.path + r"dict_feat_id.pkl", 'rb')
        self.dict_feat_id = load(inputs2)
        self.dict_feat_num = len(self.dict_feat_id)
        inputs2.close()
        # print "Loading process done."
        print "Loading the prb infomation......"
        inputs = open(self.path + r"init_prb.pkl", 'rb')
        self.init_prb = load(inputs)
        inputs.close()
        inputs1 = open(self.path + r"trans_prb.pkl", 'rb')
        self.trans_prb = load(inputs1)
        inputs1.close()
        print "Loading process done."
        self.dimension = self.unigram_feat_num * 5 + \
            self.bigram_feat_num * 5 + self.dict_feat_num * 4 + self.type_feat_num

    def loadDict(self, dictfile):
        self.dict.loadDict(dictfile)

    def saveDict(self, outfile):
        self.dict.saveDict(outfile)

    def readDict(self, dictfile):
        self.dict.readDict(dictfile)

    def appendDict(self, dictfile):
        self.dict.appendDict(dictfile)

    def evaluate(self, corpus=200):
        error_count = 0
        tagnums = sum([len(item) for item in self.tag[0:corpus]])
        for i in range(corpus):
            tag = self.ViterbiDecode(self.corpus[i])
            # print 'y:',self.tag[i]
            # print 'p:',tag
            for index in range(len(tag)):
                pre = tag[index]
                # print self.tag[j]
                real = self.tag[i][index]
                # print pre, real
                if pre != real:
                    error_count += 1
        return 1 - float(error_count) / tagnums

    def segmentation(self, outfile):
        output = codecs.open(outfile, 'w', 'utf-8')
        start = time.clock()
        for i in range(self.corpus_num):
            taglist = self.ViterbiDecode(self.corpus[i])
            wordlist = self.tag2word(self.corpus[i], taglist)
            for j in range(len(wordlist)):
                output.write(wordlist[j])
                output.write(' ')
            output.write("\n")
        print "Decode:", time.clock() - start
        output.close()

    def train(self, trainfile, batch_num=100, max_iter=200, learn_rate=1.0,
              delta_thrd=0.001, is_average=True):
        # self.makelibsvmdata(r'train.data',max_corpus)
        print "Start training process."
        self.perceptron.loadFeatSize(self.dimension, len(self.state))
        self.perceptron.read_train_file(trainfile)
        self.perceptron.printinfo()
        self.perceptron.train_sgd(max_iter, learn_rate, delta_thrd, is_average)
        self.perceptron.saveModel()
        print "Training process done."
        print "Multi-class Perceptron Model had been saved."

    def printstr(self, wordlist):
        for item in wordlist:
            print item
        print " "

    def makeLibSvmData(self, output_file, corpus_num=-1):
        print "Making training data.",
        filecount = 1
        output_data = codecs.open(output_file, 'w')
        if corpus_num == -1:
            corpus_num = self.corpus_num
        for i in range(corpus_num):
            taglist = self.tag[i]
            features = self.GetFeature(self.corpus[i])
            vec = self.Feature2Vec(features)
            for j in range(len(taglist)):
                output_data.write(str(self.state.index(taglist[j])))
                output_data.write('\t')
                keyset = list(vec[j].keys())
                keyset = sorted(keyset)
                if len(keyset) < 1:
                    output_data.write('0:1')
                for key in keyset:
                    output_data.write(str(key))
                    output_data.write(':')
                    output_data.write(str(vec[j][key]))
                    output_data.write(' ')
                output_data.write("\n")
        output_data.close()
        print "\nMaking training data finished."
        return filecount

    def classifiy_score(self, featureVec):
        return self.perceptron.scoreout(featureVec)
        # return self.perceptron.probout(featureVec)

    def getEmitPrb(self, score):
        """
        Get emission probabilities using the softmax function
        """
        max_score = max(score.values())
        emit_prb = {}
        expsum = 0.
        for key in score.keys():
            emit_prb[key] = math.exp(score[key] - max_score)
            expsum += emit_prb[key]
        for key in score.keys():
            emit_prb[key] /= expsum
            emit_prb[key] = math.log(emit_prb[key])
        return emit_prb

    def ViterbiDecode(self, sentence):
        N = len(sentence)  # length of the sentence
        prb = 0.
        prb_max = 0.
        toward = list()
        back = list()

        # get the feature Vector of every single character
        features = self.GetFeature(sentence)
        vec = self.Feature2Vec(features)

        for i in range(N):
            toward.append({})
            back.append({})
            for j in self.state:
                toward[i][j] = float('-inf')
                back[i][j] = ' '

        # run viterbi
        score = self.classifiy_score(vec[0])
        emit_prb = self.getEmitPrb(score)
        # print emit_prb
        for s in self.state:
            toward[0][s] = self.init_prb[s] + emit_prb[s]
            back[0][s] = 'end'
        # forward pass
        for t in range(1, N):
            score = self.classifiy_score(vec[t])
            # print score
            emit_prb = self.getEmitPrb(score)
            for s in self.state:
                prb = float('-inf')
                prb_max = float('-inf')
                state_max = 'S'
                for i in self.state:
                    prb = toward[t - 1][i] + self.trans_prb[i][s] + emit_prb[s]
                    if prb > prb_max:
                        prb_max = prb
                        state_max = i
                toward[t][s] = prb_max
                back[t][s] = state_max
        # backtrack to recover the best tag sequence
        index = N - 1
        taglist = []
        prb_max = float('-inf')
        state_max = ''
        for s in self.state:
            prb = toward[N - 1][s]
            if prb > prb_max:
                prb_max = prb
                state_max = s
        taglist.append(state_max)
        while index >= 1:
            pre_state = back[index][taglist[0]]
            taglist.insert(0, pre_state)
            index -= 1
        if taglist[-1] == 'B':
            taglist[-1] = 'S'
        elif taglist[-1] == 'M':
            taglist[-1] = 'E'
        return taglist

    def GetFeature(self, sent):
        """
        get feature for every single character
        return a list of features
        """
        features = []

        for i in range(len(sent)):
            left2 = sent[i - 2] if i - 2 >= 0 else '#'
            left1 = sent[i - 1] if i - 1 >= 0 else '#'
            mid = sent[i]
            right1 = sent[i + 1] if i + 1 < len(sent) else '#'
            right2 = sent[i + 2] if i + 2 < len(sent) else '#'
            # print self.dict.dic.has_key(mid),
            if self.dict.dic.has_key(mid):
                MWL = str(self.dict.dic[mid][0])
                t0 = self.dict.dic[mid][1]
                # print MWL,t0
            else:
                MWL = '0'
                t0 = '#'
            # print MWL,t0
            featcode = 0
            chars = [left2, left1, mid, right1, right2]
            for i in range(len(chars)):
                if chars[i].encode('utf-8') in self.nums:
                    featcode += 0
                elif chars[i] in self.dates:
                    featcode += 5**i
                elif (u"a" <= chars[i] and chars[i] <= u"z") or (u"A" <= chars[i] and chars[i] <= u"Z"):
                    featcode += 5**i * 2
                elif chars[i].encode('utf-8') in self.names:
                    featcode += 5**i * 3
                else:
                    featcode += 5**i * 4
            featcode += 1
            feat = [left2, left1, mid, right1, right2, left2 + left1, left1 + mid, mid + right1,
                    right1 + right2, left1 + right1, MWL + t0, left1 + t0, mid + t0, right1 + t0, featcode]
            features.append(feat)

        return features

    def Feature2Vec(self, feats):
        """
        get the feature vector for each character
        the parameter feats is a list of per-character feature lists
        """
        punctuation = [u'。', u',', u'?', u'!', u'、', u';', u':', u'「', '」',
                       u'『', u'』', u'‘', u'’', u'“', u'”', u'(', u')', u'〔',
                       u'〕', u'【', u'】', u'——', u'–', u'…', u'.', u'·', u'《',
                       u'》', u'〈', u'〉']
        featVecs = []
        for feat in feats:
            featVec = {}
            # if feat[2] in punctuation:
            #     featVec[0] = 1
            for it in range(len(feat)):
                if it < 5:
                    if self.unigram_feat_id.has_key(feat[it]):
                        key = self.unigram_feat_id[feat[it]]+self.unigram_feat_num*it
                        featVec[key] = 1
                elif it < 10:
                    if self.bigram_feat_id.has_key(feat[it]):
                        key = self.bigram_feat_id[feat[it]]
                        key += self.unigram_feat_num*5 + self.bigram_feat_num*(it-5)
                        featVec[key] = 1
                elif it < 14:
                    if self.dict_feat_id.has_key(feat[it]):
                        key = self.dict_feat_id[feat[it]]
                        key += self.unigram_feat_num*5 + self.bigram_feat_num*5 + self.dict_feat_num*(it-10)
                        featVec[key] = 1
                else:
                    key = feat[it]
                    key += self.unigram_feat_num*5 + self.bigram_feat_num*5 + self.dict_feat_num*4
                    featVec[key] = 1
            featVecs.append(featVec)

        return featVecs

    def getTag(self, wordlist):
        """get the tag for every char in the word"""
        taglist = []
        for word in wordlist:
            if len(word) == 1:
                taglist.append('S')
            else:
                taglist.append('B')
                for w in word[1:len(word) - 1]:
                    taglist.append('M')
                taglist.append('E')
        return taglist

    def tag2word(self, sentence, taglist):
        wordlist = []
        tmp = ''
        for i in range(len(taglist)):
            if taglist[i] == 'S':
                tmp = sentence[i]
                wordlist.append(tmp)
                tmp = ''
            elif taglist[i] == 'B':
                tmp += sentence[i]
            elif taglist[i] == 'M':
                tmp += sentence[i]
            else:
                tmp += sentence[i]
                wordlist.append(tmp)
                tmp = ''
        return wordlist

    def loadTestCorpus(self, corpus_file):
        print "Loading Test Corpus data",
        input_data = codecs.open(corpus_file, 'r', 'utf-8')
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == '':
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print '.',
            wordlist = rawText.split()
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
        print "\nLoading Test Corpus done."

    def loadCorpus(self, corpus_file):
        print "Loading Corpus data",
        input_data = codecs.open(corpus_file, 'r', 'utf-8')
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == '':
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print '.',
            wordlist = rawText.split()
            taglist = self.getTag(wordlist)
            self.tag.append(taglist)  # add to y, i.e. the tag list
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
        print "\nLoading Corpus done."

    def pretreatment(self, train_file):
        print "The process of corpus Pretreatment",
        input_data = codecs.open(train_file, 'r', 'utf-8')
        for line in input_data.readlines():
            rawText = line.strip()
            if rawText == '':
                continue
            else:
                self.corpus_num += 1
            if self.corpus_num % 1000 == 0 and self.corpus_num != 0:
                print '.',
            wordlist = rawText.split()
            taglist = self.getTag(wordlist)
            self.tag.append(taglist)  # add to y, i.e. the tag list
            sentence = "".join(wordlist)
            self.corpus.append(sentence)  # add to x, i.e. the corpus
            self.init_prb[taglist[0]] += 1
            for t in range(1, len(taglist)):
                self.trans_prb[taglist[t - 1]][taglist[t]] += 1

            feats = self.GetFeature(sentence)
            # record the feats, allocate the id of feature
            for feat in feats:
                for it in range(len(feat)):
                    if it < 5:  # unigram feature
                        if not self.unigram_feat_id.has_key(feat[it]):
                            self.unigram_feat_num += 1
                            self.unigram_feat_id[
                                feat[it]] = self.unigram_feat_num
                    elif it < 10:  # bigram feature
                        if not self.bigram_feat_id.has_key(feat[it]):
                            self.bigram_feat_num += 1
                            self.bigram_feat_id[
                                feat[it]] = self.bigram_feat_num
                    elif it < 14:  # dictionary information feature
                        if not self.dict_feat_id.has_key(feat[it]):
                            self.dict_feat_num += 1
                            self.dict_feat_id[feat[it]] = self.dict_feat_num

        # calculate the probability of tag
        initsum = sum(self.init_prb.values())
        for key in self.init_prb.keys():
            self.init_prb[key] = float(self.init_prb[key]) / initsum
        for x in self.trans_prb.keys():
            tmpsum = sum(self.trans_prb[x].values())
            for y in self.trans_prb[x].keys():
                self.trans_prb[x][y] = float(self.trans_prb[x][y]) / tmpsum
        self.dimension = self.unigram_feat_num * 5 + \
            self.bigram_feat_num * 5 + self.dict_feat_num * 4 + self.type_feat_num
        # calc the log probability
        for s in self.state:
            if self.init_prb[s] != 0.:
                self.init_prb[s] = math.log(self.init_prb[s])
            else:
                self.init_prb[s] = float('-inf')
            for j in self.state:
                if self.trans_prb[s][j] != 0.:
                    self.trans_prb[s][j] = math.log(self.trans_prb[s][j])
                else:
                    self.trans_prb[s][j] = float('-inf')
        print "\nProcess of pretreatment finished."
Example #7
def main():
    iris = datasets.load_iris()
    irisData = iris.data[:, [2, 3]]
    irisClass = iris.target
    dataTrainingSet, dataTestSet, classTrainingSet, classTestSet = train_test_split(
        irisData, irisClass, test_size=0.3, random_state=1, stratify=irisClass)
    #     =============== Perceptron ====================
    # Perceptron 1
    classTrainingSubset1 = np.copy(classTrainingSet)
    classTrainingSubset1 = classTrainingSubset1[(classTrainingSubset1 != 2)]
    dataTrainingSubset1 = np.copy(dataTrainingSet)
    dataTrainingSubset1 = dataTrainingSubset1[(classTrainingSet != 2)]

    classTrainingSubset1[(classTrainingSubset1 != 0)] = -1
    classTrainingSubset1[(classTrainingSubset1 != -1)] = 1
    perceptron1 = Perceptron(learningRate=0.1, iterationsToStop=10)
    perceptron1.learn(dataTrainingSubset1, classTrainingSubset1)

    # Perceptron 2
    classTrainingSubset2 = np.copy(classTrainingSet)
    classTrainingSubset2 = classTrainingSubset2[(classTrainingSubset2 != 1)]
    dataTrainingSubset2 = np.copy(dataTrainingSet)
    dataTrainingSubset2 = dataTrainingSubset2[(classTrainingSet != 1)]

    classTrainingSubset2[(classTrainingSubset2 != 2)] = -1
    classTrainingSubset2[(classTrainingSubset2 != -1)] = 1

    perceptron2 = Perceptron(learningRate=0.1, iterationsToStop=10)
    perceptron2.learn(dataTrainingSubset2, classTrainingSubset2)

    # Perceptron 3
    classTrainingSubset3 = np.copy(classTrainingSet)
    classTrainingSubset3 = classTrainingSubset3[(classTrainingSubset3 != 0)]
    dataTrainingSubset3 = np.copy(dataTrainingSet)
    dataTrainingSubset3 = dataTrainingSubset3[(classTrainingSet != 0)]

    classTrainingSubset3[(classTrainingSubset3 != 1)] = -1

    perceptron3 = Perceptron(learningRate=0.35, iterationsToStop=850)
    perceptron3.learn(dataTrainingSubset3, classTrainingSubset3)

    multiPerceptron = MultiPerceptron(perceptron1, perceptron2, perceptron3)

    plot_decision_regions(X=dataTestSet,
                          y=classTestSet,
                          classifier=multiPerceptron)
    plt.xlabel(r'$x_1$')
    plt.ylabel(r'$x_2$')
    plt.title('Perceptron')
    plt.legend(loc='upper left')
    plt.show()

    #     =============== Logistic regression ====================

    classTrainingSubset1[(classTrainingSubset1 != 1)] = 0
    logisticRegression1 = LogisticRegression(learningRate=0.05,
                                             iterationsToStop=1000,
                                             random_state=1)
    logisticRegression1.learn(dataTrainingSubset1, classTrainingSubset1)
    logisticRegression1.printProbability(dataTrainingSubset1)

    classTrainingSubset2[(classTrainingSubset2 != 1)] = 0
    logisticRegression2 = LogisticRegression(learningRate=0.05,
                                             iterationsToStop=1000,
                                             random_state=1)
    logisticRegression2.learn(dataTrainingSubset2, classTrainingSubset2)
    logisticRegression2.printProbability(dataTrainingSubset2)

    classTrainingSubset3[(classTrainingSubset3 != 1)] = 0
    logisticRegression3 = LogisticRegression(learningRate=0.15,
                                             iterationsToStop=1500,
                                             random_state=1)
    logisticRegression3.learn(dataTrainingSubset3, classTrainingSubset3)
    logisticRegression3.printProbability(dataTrainingSubset3)

    multiLogisticRegression = MultiLogisticRegression(logisticRegression1,
                                                      logisticRegression2,
                                                      logisticRegression3)

    plot_decision_regions(X=dataTestSet,
                          y=classTestSet,
                          classifier=multiLogisticRegression)
    plt.xlabel(r'$x_1$')
    plt.ylabel(r'$x_2$')
    plt.title('Logistic regression')
    plt.legend(loc='lower right')
    plt.show()
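
Example #7 assumes a preamble that the snippet does not show. A hedged reconstruction is sketched below: the NumPy, matplotlib, and scikit-learn imports follow from the calls in main(), while Perceptron, LogisticRegression, MultiPerceptron, MultiLogisticRegression, and plot_decision_regions are project-specific names whose modules are unknown, so they are only indicated as comments.

# Assumed preamble and entry point for Example #7 (a sketch, not from the original source).
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split  # sklearn.cross_validation on older versions
# The following are defined elsewhere in that project; module names are hypothetical:
# from perceptron import Perceptron, MultiPerceptron
# from logistic_regression import LogisticRegression, MultiLogisticRegression
# from plotting import plot_decision_regions

if __name__ == '__main__':
    main()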