def __init__(self):
    """Initialise all model state and load the numeral / surname lexicons."""
    self.corpus = list()  # sentences used for training
    self.tag = list()     # gold tag sequence of each sentence
    self.corpus_num = 0
    self.state = ['B', 'M', 'E', 'S']
    self.perceptron = MP()
    self.dict = Dict()
    # raw tag counts; later normalised elsewhere
    self.init_prb = dict.fromkeys(self.state, 0)
    self.trans_prb = {}
    for s in self.state:
        self.trans_prb[s] = dict.fromkeys(self.state, 0)
    self.dimension = 0
    self.unigram_feat_num = 0
    self.unigram_feat_id = {}
    self.bigram_feat_num = 0
    self.bigram_feat_id = {}
    self.dict_feat_num = 0
    self.dict_feat_id = {}
    # one base-5 digit per character of a 5-char window
    self.type_feat_num = 5 ** 5
    self.path = r'./'
    self.nums = []
    self.dates = [u"年", u"月", u"日"]
    self.names = []
    lexicon = codecs.open(r'./resources/Chinese_num.txt', 'r')
    for line in lexicon:
        self.nums.extend(line.strip().split())
    lexicon.close()
    lexicon = codecs.open(r'./resources/names.txt', 'r')
    for line in lexicon:
        self.names.extend(line.strip().split())
    lexicon.close()
def __init__(self):
    """Set up segmenter state (trigram variant) and read the lexicon files."""
    self.corpus = list()  # save the corpus for training
    self.tag = list()     # the tag of corpus
    self.corpus_num = 0
    self.state = ['B', 'M', 'E', 'S']
    self.perceptron = MP()
    self.dict = Dict()
    # tag counts, one bucket per B/M/E/S state
    self.init_prb = dict.fromkeys(self.state, 0)
    self.trans_prb = dict(
        (src, dict.fromkeys(self.state, 0)) for src in self.state)
    self.dimension = 0
    self.unigram_feat_num = 0
    self.unigram_feat_id = {}
    self.bigram_feat_num = 0
    self.bigram_feat_id = {}
    self.trigram_feat_num = 0
    self.trigram_feat_id = {}
    self.dict_feat_num = 0
    self.dict_feat_id = {}
    # 5 character classes over a 5-char window
    self.type_feat_num = 5 ** 5
    self.path = r'./'
    self.nums = []
    self.dates = [u"年", u"月", u"日"]
    self.names = []
    num_file = codecs.open(r'./resources/Chinese_num.txt', 'r')
    for raw in num_file:
        for token in raw.strip().split():
            self.nums.append(token)
    num_file.close()
    name_file = codecs.open(r'./resources/names.txt', 'r')
    for raw in name_file:
        for token in raw.strip().split():
            self.names.append(token)
    name_file.close()
def __init__(self):
    """Create an empty segmenter and load the number/name word lists."""
    self.corpus = list()  # training sentences
    self.tag = list()     # gold tags for each sentence
    self.corpus_num = 0
    self.state = ["B", "M", "E", "S"]
    self.perceptron = MP()
    self.dict = Dict()
    self.init_prb = {"B": 0, "M": 0, "E": 0, "S": 0}
    self.trans_prb = {
        "B": {"B": 0, "M": 0, "E": 0, "S": 0},
        "M": {"B": 0, "M": 0, "E": 0, "S": 0},
        "E": {"B": 0, "M": 0, "E": 0, "S": 0},
        "S": {"B": 0, "M": 0, "E": 0, "S": 0},
    }
    self.dimension = 0
    self.unigram_feat_num = 0
    self.unigram_feat_id = {}
    self.bigram_feat_num = 0
    self.bigram_feat_id = {}
    self.dict_feat_num = 0
    self.dict_feat_id = {}
    self.type_feat_num = 5 ** 5  # base-5 window type code space
    self.path = r"./"
    self.nums = []
    self.dates = [u"年", u"月", u"日"]
    self.names = []
    for path, bucket in ((r"./resources/Chinese_num.txt", self.nums),
                         (r"./resources/names.txt", self.names)):
        handle = codecs.open(path, "r")
        for raw in handle.readlines():
            bucket.extend(raw.strip().split())
        handle.close()
class CWSPerceptron: def __init__(self): self.corpus = list() # save the corpus for training self.tag = list() # the tag of corpus self.corpus_num = 0 self.state = ['B', 'M', 'E', 'S'] self.perceptron = MP() self.dict = Dict() self.init_prb = {'B': 0, 'M': 0, 'E': 0, 'S': 0} self.trans_prb = { 'B': { 'B': 0, 'M': 0, 'E': 0, 'S': 0 }, 'M': { 'B': 0, 'M': 0, 'E': 0, 'S': 0 }, 'E': { 'B': 0, 'M': 0, 'E': 0, 'S': 0 }, 'S': { 'B': 0, 'M': 0, 'E': 0, 'S': 0 } } self.dimension = 0 self.unigram_feat_num = 0 self.unigram_feat_id = {} self.bigram_feat_num = 0 self.bigram_feat_id = {} self.trigram_feat_num = 0 self.trigram_feat_id = {} self.dict_feat_num = 0 self.dict_feat_id = {} self.type_feat_num = 5**5 self.path = r'./' self.nums = [] self.dates = [u"年", u"月", u"日"] self.names = [] inputs1 = codecs.open(r'./resources/Chinese_num.txt', 'r') for line in inputs1.readlines(): rawText = line.strip().split() for w in rawText: self.nums.append(w) inputs1.close() inputs2 = codecs.open(r'./resources/names.txt', 'r') for line in inputs2.readlines(): rawText = line.strip().split() for w in rawText: self.names.append(w) inputs2.close() def setSavePath(self, path): self.path = path self.perceptron.setSavePath(path) def saveModel(self): print "Saving the unigram&bigram infomation......" output1 = open(self.path + r"bigram_feat_id.pkl", 'wb') dump(self.bigram_feat_id, output1, -1) output1.close() output2 = open(self.path + r"unigram_feat_id.pkl", 'wb') dump(self.unigram_feat_id, output2, -1) output2.close() output3 = open(self.path + r"trigram_feat_id.pkl", 'wb') dump(self.trigram_feat_id, output3, -1) output3.close() output4 = open(self.path + r"dict_feat_id.pkl", 'wb') dump(self.dict_feat_id, output4, -1) output4.close() # release the memory self.unigram_feat_id = [] self.bigram_feat_id = [] self.trigram_feat_id = [] self.corpus = [] self.tag = [] print "Saving the inital prb & trans prb infomation....." 
output1 = open(self.path + r"init_prb.pkl", 'wb') dump(self.init_prb, output1, -1) output1.close() output2 = open(self.path + r"trans_prb.pkl", 'wb') dump(self.trans_prb, output2, -1) output2.close() print "Saving process done." def loadModel(self): print "Loading the unigram&bigram infomation......" inputs = open(self.path + r"bigram_feat_id.pkl", 'rb') self.bigram_feat_id = load(inputs) self.bigram_feat_num = len(self.bigram_feat_id) inputs.close() inputs1 = open(self.path + r"unigram_feat_id.pkl", 'rb') self.unigram_feat_id = load(inputs1) self.unigram_feat_num = len(self.unigram_feat_id) inputs1.close() inputs2 = open(self.path + r"dict_feat_id.pkl", 'rb') self.dict_feat_id = load(inputs2) self.dict_feat_num = len(self.dict_feat_id) inputs2.close() inputs3 = open(self.path + r"trigram_feat_id.pkl", 'rb') self.trigram_feat_id = load(inputs3) self.trigram_feat_num = len(self.trigram_feat_id) inputs3.close() # print "Loading process done." print "Loading the prb infomation......" inputs = open(self.path + r"init_prb.pkl", 'rb') self.init_prb = load(inputs) inputs.close() inputs1 = open(self.path + r"trans_prb.pkl", 'rb') self.trans_prb = load(inputs1) inputs1.close() print "Loading process done." 
self.dimension = self.unigram_feat_num * 5 + self.bigram_feat_num * 4\ + self.trigram_feat_num + self.dict_feat_num * 4 + self.type_feat_num def loadDict(self, dictfile): self.dict.loadDict(dictfile) def saveDict(self, outfile): self.dict.saveDict(outfile) def readDict(self, dictfile): self.dict.readDict(dictfile) def appendDict(self, dictfile): self.dict.appendDict(dictfile) def segmentation(self, outfile): output = codecs.open(outfile, 'w', 'utf-8') start = time.clock() for i in range(self.corpus_num): taglist = self.ViterbiDecode(self.corpus[i]) wordlist = self.tag2word(self.corpus[i], taglist) for j in range(len(wordlist)): output.write(wordlist[j]) output.write(' ') output.write("\n") print "Decode:", time.clock() - start output.close() def train(self, trainfile, batch_num=100, max_iter=200, learn_rate=1.0, delta_thrd=0.001, is_average=True): # self.makelibsvmdata(r'train.data',max_corpus) print "Start training process." self.perceptron.loadFeatSize(self.dimension, len(self.state)) self.perceptron.read_train_file(trainfile) self.perceptron.printinfo() self.perceptron.train_sgd(max_iter, learn_rate, delta_thrd, is_average) self.perceptron.saveModel() print "Training process done." print "Multi-class Perceptron Model had been saved." 
def printstr(self, wordlist): for item in wordlist: print item print " " def makeLibSvmData(self, output_file, corpus_num=-1): print "Making training data.", filecount = 1 output_data = codecs.open(output_file, 'w') if corpus_num == -1: corpus_num = self.corpus_num for i in range(corpus_num): taglist = self.tag[i] features = self.GetFeature(self.corpus[i]) vec = self.Feature2Vec(features) for j in range(len(taglist)): output_data.write(str(self.state.index(taglist[j]))) output_data.write('\t') keyset = list(vec[j].keys()) keyset = sorted(keyset) if len(keyset) < 1: output_data.write('0:1') for key in keyset: output_data.write(str(key)) output_data.write(':') output_data.write(str(vec[j][key])) output_data.write(' ') output_data.write("\n") output_data.close() print "\nMaking training data finished." return filecount def classifiy_score(self, featureVec): tmp = self.perceptron.scoreout(featureVec) ans = {} for key in tmp.keys(): ans[self.state[int(key)]] = tmp[key] # return self.perceptron.scoreout(featureVec) return ans # return self.perceptron.probout(featureVec) def getEmitPrb(self, score): """ Get emits_prb use softmax function """ max_score = max(score.values()) emit_prb = {} expsum = 0. for key in score.keys(): emit_prb[key] = math.exp(score[key] - max_score) expsum += emit_prb[key] for key in score.keys(): emit_prb[key] /= expsum emit_prb[key] = math.log(emit_prb[key]) return emit_prb def ViterbiDecode(self, sentence): N = len(sentence) # length of the sentence prb = 0. prb_max = 0. 
toward = list() back = list() # get the feature Vector of every single character features = self.GetFeature(sentence) vec = self.Feature2Vec(features) for i in range(N): toward.append({}) back.append({}) for j in self.state: toward[i][j] = float('-inf') back[i][j] = ' ' # run viterbi score = self.classifiy_score(vec[0]) emit_prb = self.getEmitPrb(score) # print emit_prb for s in self.state: toward[0][s] = self.init_prb[s] + emit_prb[s] back[0][s] = 'end' # toward algorithm for t in range(1, N): score = self.classifiy_score(vec[t]) # print score emit_prb = self.getEmitPrb(score) for s in self.state: prb = float('-inf') prb_max = float('-inf') state_max = 'S' for i in self.state: prb = toward[t - 1][i] + self.trans_prb[i][s] + emit_prb[s] if prb > prb_max: prb_max = prb state_max = i toward[t][s] = prb_max back[t][s] = state_max # backward algorithm to get the best tag sequence index = N - 1 taglist = [] prb_max = float('-inf') state_max = '' for s in self.state: prb = toward[N - 1][s] if prb > prb_max: prb_max = prb state_max = s taglist.append(state_max) while index >= 1: pre_state = back[index][taglist[0]] taglist.insert(0, pre_state) index -= 1 if taglist[-1] == 'B': taglist[-1] = 'S' elif taglist[-1] == 'M': taglist[-1] == 'E' return taglist def GetFeature(self, sent): """ get feature for every single character return a list of features """ features = [] for i in range(len(sent)): left2 = sent[i - 2] if i - 2 >= 0 else '#' left1 = sent[i - 1] if i - 1 >= 0 else '#' mid = sent[i] right1 = sent[i + 1] if i + 1 < len(sent) else '#' right2 = sent[i + 2] if i + 2 < len(sent) else '#' # print self.dict.dic.has_key(mid), if self.dict.dic.has_key(mid): MWL = str(self.dict.dic[mid][0]) t0 = self.dict.dic[mid][1] # print MWL,t0 else: MWL = '0' t0 = '#' # print MWL,t0 featcode = 0 chars = [left2, left1, mid, right1, right2] for i in range(len(chars)): if chars[i].encode('utf-8') in self.nums: featcode += 0 elif chars[i] in self.dates: featcode += 5**i elif (u"a" <= 
chars[i] and chars[i] <= u"z") or (u"A" <= chars[i] and chars[i] <= u"Z"): featcode += 5**i * 2 elif chars[i].encode('utf-8') in self.names: featcode += 5**i * 3 else: featcode += 5**i * 4 featcode += 1 feat = [ left2, left1, mid, right1, right2, left2 + left1, left1 + mid, mid + right1, right1 + right2, left1 + right1, MWL + t0, left1 + t0, mid + t0, right1 + t0, featcode ] features.append(feat) return features def Feature2Vec(self, feats): """ get feature vector from feature the paramters feats mean is a list of features of every character """ punctuation = [ u'。', u',', u'?', u'!', u'、', u';', u':', u'「', '」', u'『', u'』', u'‘', u'’', u'“', u'”', u'(', u')', u'〔', u'〕', u'【', u'】', u'——', u'–', u'…', u'.', u'·', u'《', u'》', u'〈', u'〉' ] featVecs = [] for feat in feats: featVec = {} # if feat[2] in punctuation: # featVec[0] = 1 for it in range(len(feat)): if it < 5: if self.unigram_feat_id.has_key(feat[it]): key = self.unigram_feat_id[ feat[it]] + self.unigram_feat_num * it featVec[key] = 1 elif it < 9: if self.bigram_feat_id.has_key(feat[it]): key = self.bigram_feat_id[feat[it]] key += self.unigram_feat_num * 5 + \ self.bigram_feat_num * (it - 4) featVec[key] = 1 elif it < 10: if self.trigram_feat_id.has_key(feat[it]): key = self.trigram_feat_id[feat[it]] key += self.unigram_feat_num * 5 + self.bigram_feat_num * 4 elif it < 14: if self.dict_feat_id.has_key(feat[it]): key = self.dict_feat_id[feat[it]] key += self.unigram_feat_num * 5 + self.bigram_feat_num * 4 + self.trigram_feat_num + self.dict_feat_num * ( it - 10) featVec[key] = 1 else: key = feat[it] key += self.unigram_feat_num * 5 + self.bigram_feat_num * 4 + self.trigram_feat_num + self.dict_feat_num * 4 featVec[key] = 1 featVecs.append(featVec) return featVecs def getTag(self, wordlist): """get the tag for every char in the word""" taglist = [] for word in wordlist: if len(word) == 1: taglist.append('S') else: taglist.append('B') for w in word[1:len(word) - 1]: taglist.append('M') taglist.append('E') 
return taglist def tag2word(self, sentence, taglist): wordlist = [] tmp = '' for i in range(len(taglist)): if taglist[i] == 'S': tmp = sentence[i] wordlist.append(tmp) tmp = '' elif taglist[i] == 'B': tmp += sentence[i] elif taglist[i] == 'M': tmp += sentence[i] else: tmp += sentence[i] wordlist.append(tmp) tmp = '' return wordlist def loadCorpus(self, corpus_file): print "Loading Corpus data", input_data = codecs.open(corpus_file, 'r', 'utf-8') for line in input_data.readlines(): rawText = line.strip() if rawText == '': continue else: self.corpus_num += 1 if self.corpus_num % 1000 == 0 and self.corpus_num != 0: print '.', wordlist = rawText.split() taglist = self.getTag(wordlist) self.tag.append(taglist) # add to y, i.d. the tags list sentence = "".join(wordlist) self.corpus.append(sentence) # add to x, i.d. the corpus print "\nLoading Corpus done." def pretreatment(self, train_file): print "The process of corpus Pretreatment", input_data = codecs.open(train_file, 'r', 'utf-8') for line in input_data.readlines(): rawText = line.strip() if rawText == '': continue else: self.corpus_num += 1 if self.corpus_num % 1000 == 0 and self.corpus_num != 0: print '.', wordlist = rawText.split() taglist = self.getTag(wordlist) self.tag.append(taglist) # add to y, i.d. the tags list sentence = "".join(wordlist) self.corpus.append(sentence) # add to x, i.d. 
the corpus self.init_prb[taglist[0]] += 1 for t in range(1, len(taglist)): self.trans_prb[taglist[t - 1]][taglist[t]] += 1 feats = self.GetFeature(sentence) # record the feats, allocate the id of feature for feat in feats: for it in range(len(feat)): if it < 5: # unigram feature if not self.unigram_feat_id.has_key(feat[it]): self.unigram_feat_num += 1 self.unigram_feat_id[ feat[it]] = self.unigram_feat_num elif it < 9: # bigram feature if not self.bigram_feat_id.has_key(feat[it]): self.bigram_feat_num += 1 self.bigram_feat_id[ feat[it]] = self.bigram_feat_num elif it < 10: # trigram feature if not self.trigram_feat_id.has_key(feat[it]): self.trigram_feat_num += 1 self.trigram_feat_id[ feat[it]] = self.trigram_feat_num elif it < 14: # dictionary information feature if not self.dict_feat_id.has_key(feat[it]): self.dict_feat_num += 1 self.dict_feat_id[feat[it]] = self.dict_feat_num # calculate the probability of tag initsum = sum(self.init_prb.values()) for key in self.init_prb.keys(): self.init_prb[key] = float(self.init_prb[key]) / initsum for x in self.trans_prb.keys(): tmpsum = sum(self.trans_prb[x].values()) for y in self.trans_prb[x].keys(): self.trans_prb[x][y] = float(self.trans_prb[x][y]) / tmpsum self.dimension = self.unigram_feat_num * 5 + \ self.bigram_feat_num * 4 + self.trigram_feat_num + self.dict_feat_num * 4 + self.type_feat_num # calc the log probability for s in self.state: if self.init_prb[s] != 0.: self.init_prb[s] = math.log(self.init_prb[s]) else: self.init_prb[s] = float('-inf') for j in self.state: if self.trans_prb[s][j] != 0.: self.trans_prb[s][j] = math.log(self.trans_prb[s][j]) else: self.trans_prb[s][j] = float('-inf') print "\nProcess of pretreatment finished."
class CWSPerceptron: def __init__(self): self.corpus = list() # save the corpus for training self.tag = list() # the tag of corpus self.corpus_num = 0 self.state = ["B", "M", "E", "S"] self.perceptron = MP() self.dict = Dict() self.init_prb = {"B": 0, "M": 0, "E": 0, "S": 0} self.trans_prb = { "B": {"B": 0, "M": 0, "E": 0, "S": 0}, "M": {"B": 0, "M": 0, "E": 0, "S": 0}, "E": {"B": 0, "M": 0, "E": 0, "S": 0}, "S": {"B": 0, "M": 0, "E": 0, "S": 0}, } self.dimension = 0 self.unigram_feat_num = 0 self.unigram_feat_id = {} self.bigram_feat_num = 0 self.bigram_feat_id = {} self.dict_feat_num = 0 self.dict_feat_id = {} self.type_feat_num = 5 ** 5 self.path = r"./" self.nums = [] self.dates = [u"年", u"月", u"日"] self.names = [] inputs1 = codecs.open(r"./resources/Chinese_num.txt", "r") for line in inputs1.readlines(): rawText = line.strip().split() for w in rawText: self.nums.append(w) inputs1.close() inputs2 = codecs.open(r"./resources/names.txt", "r") for line in inputs2.readlines(): rawText = line.strip().split() for w in rawText: self.names.append(w) inputs2.close() def setSavePath(self, path): self.path = path self.perceptron.setSavePath(path) def saveModel(self): print "Saving the unigram&bigram infomation......" output1 = open(self.path + r"bigram_feat_id.pkl", "wb") dump(self.bigram_feat_id, output1, -1) output1.close() output2 = open(self.path + r"unigram_feat_id.pkl", "wb") dump(self.unigram_feat_id, output2, -1) output2.close() output3 = open(self.path + r"dict_feat_id.pkl", "wb") dump(self.dict_feat_id, output3, -1) output3.close() # release the memory self.unigram_feat_id = [] self.bigram_feat_id = [] self.corpus = [] self.tag = [] print "Saving the inital prb & trans prb infomation....." output1 = open(self.path + r"init_prb.pkl", "wb") dump(self.init_prb, output1, -1) output1.close() output2 = open(self.path + r"trans_prb.pkl", "wb") dump(self.trans_prb, output2, -1) output2.close() print "Saving process done." 
def loadModel(self): print "Loading the unigram&bigram infomation......" inputs = open(self.path + r"bigram_feat_id.pkl", "rb") self.bigram_feat_id = load(inputs) self.bigram_feat_num = len(self.bigram_feat_id) inputs.close() inputs1 = open(self.path + r"unigram_feat_id.pkl", "rb") self.unigram_feat_id = load(inputs1) self.unigram_feat_num = len(self.unigram_feat_id) inputs1.close() inputs2 = open(self.path + r"dict_feat_id.pkl", "rb") self.dict_feat_id = load(inputs2) self.dict_feat_num = len(self.dict_feat_id) # print "Loading process done." print "Loading the prb infomation......" inputs = open(self.path + r"init_prb.pkl", "rb") self.init_prb = load(inputs) inputs.close() inputs1 = open(self.path + r"trans_prb.pkl", "rb") self.trans_prb = load(inputs1) inputs1.close() print "Loading process done." self.dimension = ( self.unigram_feat_num * 5 + self.bigram_feat_num * 5 + self.dict_feat_num * 4 + self.type_feat_num ) def loadDict(self, dictfile): self.dict.loadDict(dictfile) def saveDict(self, outfile): self.dict.saveDict(outfile) def readDict(self, dictfile): self.dict.readDict(dictfile) def appendDict(self, dictfile): self.dict.appendDict(dictfile) def evaluate(self, corpus=200): error_count = 0 tagnums = sum([len(item) for item in self.tag[0:corpus]]) for i in range(corpus): tag = self.ViterbiDecode(self.corpus[i]) # print 'y:',self.tag[i] # print 'p:',tag for index in range(len(tag)): pre = tag[index] # print self.tag[j] real = self.tag[i][index] # print pre, real if pre != real: error_count += 1 return 1 - float(error_count) / tagnums def segmentation(self, outfile): output = codecs.open(outfile, "w", "utf-8") start = time.clock() for i in range(self.corpus_num): taglist = self.ViterbiDecode(self.corpus[i]) wordlist = self.tag2word(self.corpus[i], taglist) for j in range(len(wordlist)): output.write(wordlist[j]) output.write(" ") output.write("\n") print "Decode:", time.clock() - start output.close() def train(self, trainfile, batch_num=100, max_iter=200, 
learn_rate=1.0, delta_thrd=0.001, is_average=True): # self.makelibsvmdata(r'train.data',max_corpus) print "Start training process." self.perceptron.loadFeatSize(self.dimension, len(self.state)) self.perceptron.read_train_file(trainfile) self.perceptron.printinfo() self.perceptron.train_sgd(max_iter, learn_rate, delta_thrd, is_average) self.perceptron.saveModel() print "Training process done." print "Multi-class Perceptron Model had been saved." def printstr(self, wordlist): for item in wordlist: print item print " " def makeLibSvmData(self, output_file, corpus_num=-1): print "Making training data.", filecount = 1 output_data = codecs.open(output_file, "w") if corpus_num == -1: corpus_num = self.corpus_num for i in range(corpus_num): taglist = self.tag[i] features = self.GetFeature(self.corpus[i]) vec = self.Feature2Vec(features) for j in range(len(taglist)): output_data.write(str(self.state.index(taglist[j]))) output_data.write("\t") keyset = list(vec[j].keys()) keyset = sorted(keyset) if len(keyset) < 1: output_data.write("0:1") for key in keyset: output_data.write(str(key)) output_data.write(":") output_data.write(str(vec[j][key])) output_data.write(" ") output_data.write("\n") output_data.close() print "\nMaking training data finished." 
return filecount def classifiy_score(self, featureVec): tmp = self.perceptron.scoreout(featureVec) ans = {} for key in tmp.keys(): ans[self.state[int(key)]] = tmp[key] # return self.perceptron.scoreout(featureVec) return ans # return self.perceptron.probout(featureVec) def getEmitPrb(self, score): """ Get emits_prb use softmax function """ max_score = max(score.values()) emit_prb = {} expsum = 0.0 for key in score.keys(): emit_prb[key] = math.exp(score[key] - max_score) expsum += emit_prb[key] for key in score.keys(): emit_prb[key] /= expsum emit_prb[key] = math.log(emit_prb[key]) return emit_prb def ViterbiDecode(self, sentence): N = len(sentence) # length of the sentence prb = 0.0 prb_max = 0.0 toward = list() back = list() # get the feature Vector of every single character features = self.GetFeature(sentence) vec = self.Feature2Vec(features) for i in range(N): toward.append({}) back.append({}) for j in self.state: toward[i][j] = float("-inf") back[i][j] = " " # run viterbi score = self.classifiy_score(vec[0]) emit_prb = self.getEmitPrb(score) # print emit_prb for s in self.state: toward[0][s] = self.init_prb[s] + emit_prb[s] back[0][s] = "end" # toward algorithm for t in range(1, N): score = self.classifiy_score(vec[t]) # print score emit_prb = self.getEmitPrb(score) for s in self.state: prb = float("-inf") prb_max = float("-inf") state_max = "S" for i in self.state: prb = toward[t - 1][i] + self.trans_prb[i][s] + emit_prb[s] if prb > prb_max: prb_max = prb state_max = i toward[t][s] = prb_max back[t][s] = state_max # backward algorithm to get the best tag sequence index = N - 1 taglist = [] prb_max = float("-inf") state_max = "" for s in self.state: prb = toward[N - 1][s] if prb > prb_max: prb_max = prb state_max = s taglist.append(state_max) while index >= 1: pre_state = back[index][taglist[0]] taglist.insert(0, pre_state) index -= 1 if taglist[-1] == "B": taglist[-1] = "S" elif taglist[-1] == "M": taglist[-1] == "E" return taglist def GetFeature(self, sent): 
""" get feature for every single character return a list of features """ features = [] for i in range(len(sent)): left2 = sent[i - 2] if i - 2 >= 0 else "#" left1 = sent[i - 1] if i - 1 >= 0 else "#" mid = sent[i] right1 = sent[i + 1] if i + 1 < len(sent) else "#" right2 = sent[i + 2] if i + 2 < len(sent) else "#" # print self.dict.dic.has_key(mid), if self.dict.dic.has_key(mid): MWL = str(self.dict.dic[mid][0]) t0 = self.dict.dic[mid][1] # print MWL,t0 else: MWL = "0" t0 = "#" # print MWL,t0 featcode = 0 chars = [left2, left1, mid, right1, right2] for i in range(len(chars)): if chars[i].encode("utf-8") in self.nums: featcode += 0 elif chars[i] in self.dates: featcode += 5 ** i elif (u"a" <= chars[i] and chars[i] <= u"z") or (u"A" <= chars[i] and chars[i] <= u"Z"): featcode += 5 ** i * 2 elif chars[i].encode("utf-8") in self.names: featcode += 5 ** i * 3 else: featcode += 5 ** i * 4 featcode += 1 feat = [ left2, left1, mid, right1, right2, left2 + left1, left1 + mid, mid + right1, right1 + right2, left1 + right1, MWL + t0, left1 + t0, mid + t0, right1 + t0, featcode, ] features.append(feat) return features def Feature2Vec(self, feats): """ get feature vector from feature the paramters feats mean is a list of features of every character """ punctuation = [ u"。", u",", u"?", u"!", u"、", u";", u":", u"「", "」", u"『", u"』", u"‘", u"’", u"“", u"”", u"(", u")", u"〔", u"〕", u"【", u"】", u"——", u"–", u"…", u".", u"·", u"《", u"》", u"〈", u"〉", ] featVecs = [] for feat in feats: featVec = {} # if feat[2] in punctuation: # featVec[0] = 1 for it in range(len(feat)): if it < 5: if self.unigram_feat_id.has_key(feat[it]): key = self.unigram_feat_id[feat[it]] + self.unigram_feat_num * it featVec[key] = 1 elif it < 10: if self.bigram_feat_id.has_key(feat[it]): key = self.bigram_feat_id[feat[it]] key += self.unigram_feat_num * 5 + self.bigram_feat_num * (it - 5) featVec[key] = 1 elif it < 14: if self.dict_feat_id.has_key(feat[it]): key = self.dict_feat_id[feat[it]] key += 
self.unigram_feat_num * 5 + self.bigram_feat_num * 5 + self.dict_feat_num * (it - 10) featVec[key] = 1 else: key = feat[it] key += self.unigram_feat_num * 5 + self.bigram_feat_num * 5 + self.dict_feat_num * 4 featVec[key] = 1 featVecs.append(featVec) return featVecs def getTag(self, wordlist): """get the tag for every char in the word""" taglist = [] for word in wordlist: if len(word) == 1: taglist.append("S") else: taglist.append("B") for w in word[1 : len(word) - 1]: taglist.append("M") taglist.append("E") return taglist def tag2word(self, sentence, taglist): wordlist = [] tmp = "" for i in range(len(taglist)): if taglist[i] == "S": tmp = sentence[i] wordlist.append(tmp) tmp = "" elif taglist[i] == "B": tmp += sentence[i] elif taglist[i] == "M": tmp += sentence[i] else: tmp += sentence[i] wordlist.append(tmp) tmp = "" return wordlist def loadTestCorpus(self, corpus_file): print "Loading Test Corpus data", input_data = codecs.open(corpus_file, "r", "utf-8") for line in input_data.readlines(): rawText = line.strip() if rawText == "": continue else: self.corpus_num += 1 if self.corpus_num % 1000 == 0 and self.corpus_num != 0: print ".", wordlist = rawText.split() sentence = "".join(wordlist) self.corpus.append(sentence) # add to x, i.d. the corpus print "\nLoading Test Corpus done." def loadCorpus(self, corpus_file): print "Loading Corpus data", input_data = codecs.open(corpus_file, "r", "utf-8") for line in input_data.readlines(): rawText = line.strip() if rawText == "": continue else: self.corpus_num += 1 if self.corpus_num % 1000 == 0 and self.corpus_num != 0: print ".", wordlist = rawText.split() taglist = self.getTag(wordlist) self.tag.append(taglist) # add to y, i.d. the tags list sentence = "".join(wordlist) self.corpus.append(sentence) # add to x, i.d. the corpus print "\nLoading Corpus done." 
def pretreatment(self, train_file): print "The process of corpus Pretreatment", input_data = codecs.open(train_file, "r", "utf-8") for line in input_data.readlines(): rawText = line.strip() if rawText == "": continue else: self.corpus_num += 1 if self.corpus_num % 1000 == 0 and self.corpus_num != 0: print ".", wordlist = rawText.split() taglist = self.getTag(wordlist) self.tag.append(taglist) # add to y, i.d. the tags list sentence = "".join(wordlist) self.corpus.append(sentence) # add to x, i.d. the corpus self.init_prb[taglist[0]] += 1 for t in range(1, len(taglist)): self.trans_prb[taglist[t - 1]][taglist[t]] += 1 feats = self.GetFeature(sentence) # record the feats, allocate the id of feature for feat in feats: for it in range(len(feat)): if it < 5: # unigram feature if not self.unigram_feat_id.has_key(feat[it]): self.unigram_feat_num += 1 self.unigram_feat_id[feat[it]] = self.unigram_feat_num elif it < 10: # bigram feature if not self.bigram_feat_id.has_key(feat[it]): self.bigram_feat_num += 1 self.bigram_feat_id[feat[it]] = self.bigram_feat_num elif it < 14: # dictionary information feature if not self.dict_feat_id.has_key(feat[it]): self.dict_feat_num += 1 self.dict_feat_id[feat[it]] = self.dict_feat_num # calculate the probability of tag initsum = sum(self.init_prb.values()) for key in self.init_prb.keys(): self.init_prb[key] = float(self.init_prb[key]) / initsum for x in self.trans_prb.keys(): tmpsum = sum(self.trans_prb[x].values()) for y in self.trans_prb[x].keys(): self.trans_prb[x][y] = float(self.trans_prb[x][y]) / tmpsum self.dimension = ( self.unigram_feat_num * 5 + self.bigram_feat_num * 5 + self.dict_feat_num * 4 + self.type_feat_num ) # calc the log probability for s in self.state: if self.init_prb[s] != 0.0: self.init_prb[s] = math.log(self.init_prb[s]) else: self.init_prb[s] = float("-inf") for j in self.state: if self.trans_prb[s][j] != 0.0: self.trans_prb[s][j] = math.log(self.trans_prb[s][j]) else: self.trans_prb[s][j] = float("-inf") print 
"\nProcess of pretreatment finished."
class CWSPerceptron: def __init__(self): self.corpus = list() # save the corpus for training self.tag = list() # the tag of corpus self.corpus_num = 0 self.state = ['B', 'M', 'E', 'S'] self.perceptron = MP() self.dict = Dict() self.init_prb = {'B': 0, 'M': 0, 'E': 0, 'S': 0} self.trans_prb = { 'B': {'B': 0, 'M': 0, 'E': 0, 'S': 0}, 'M': {'B': 0, 'M': 0, 'E': 0, 'S': 0}, 'E': {'B': 0, 'M': 0, 'E': 0, 'S': 0}, 'S': {'B': 0, 'M': 0, 'E': 0, 'S': 0} } self.dimension = 0 self.unigram_feat_num = 0 self.unigram_feat_id = {} self.bigram_feat_num = 0 self.bigram_feat_id = {} self.dict_feat_num = 0 self.dict_feat_id = {} self.type_feat_num = 5**5 self.path = r'./' self.nums = [] self.dates = [u"年", u"月", u"日"] self.names = [] inputs1 = codecs.open(r'Chinese_num.txt', 'r') for line in inputs1.readlines(): rawText = line.strip().split() for w in rawText: self.nums.append(w) inputs1.close() inputs2 = codecs.open(r'names.txt', 'r') for line in inputs2.readlines(): rawText = line.strip().split() for w in rawText: self.names.append(w) inputs2.close() def setSavePath(self, path): self.path = path self.perceptron.setSavePath(path) def saveModel(self): print "Saving the unigram&bigram infomation......" output1 = open(self.path + r"bigram_feat_id.pkl", 'wb') dump(self.bigram_feat_id, output1, -1) output1.close() output2 = open(self.path + r"unigram_feat_id.pkl", 'wb') dump(self.unigram_feat_id, output2, -1) output2.close() output3 = open(self.path + r"dict_feat_id.pkl", 'wb') dump(self.dict_feat_id, output3, -1) output3.close() # release the memory self.unigram_feat_id = [] self.bigram_feat_id = [] self.corpus = [] self.tag = [] print "Saving the inital prb & trans prb infomation....." output1 = open(self.path + r"init_prb.pkl", 'wb') dump(self.init_prb, output1, -1) output1.close() output2 = open(self.path + r"trans_prb.pkl", 'wb') dump(self.trans_prb, output2, -1) output2.close() print "Saving process done." def loadModel(self): print "Loading the unigram&bigram infomation......" 
inputs = open(self.path + r"bigram_feat_id.pkl", 'rb') self.bigram_feat_id = load(inputs) self.bigram_feat_num = len(self.bigram_feat_id) inputs.close() inputs1 = open(self.path + r"unigram_feat_id.pkl", 'rb') self.unigram_feat_id = load(inputs1) self.unigram_feat_num = len(self.unigram_feat_id) inputs1.close() inputs2 = open(self.path + r"dict_feat_id.pkl", 'rb') self.dict_feat_id = load(inputs2) self.dict_feat_num = len(self.dict_feat_id) # print "Loading process done." print "Loading the prb infomation......" inputs = open(self.path + r"init_prb.pkl", 'rb') self.init_prb = load(inputs) inputs.close() inputs1 = open(self.path + r"trans_prb.pkl", 'rb') self.trans_prb = load(inputs1) inputs1.close() print "Loading process done." self.dimension = self.unigram_feat_num * 5 + \ self.bigram_feat_num * 5 + self.dict_feat_num * 4 + self.type_feat_num def loadDict(self, dictfile): self.dict.loadDict(dictfile) def saveDict(self, outfile): self.dict.saveDict(outfile) def readDict(self, dictfile): self.dict.readDict(dictfile) def appendDict(self, dictfile): self.dict.appendDict(dictfile) def evaluate(self, corpus=200): error_count = 0 tagnums = sum([len(item) for item in self.tag[0:corpus]]) for i in range(corpus): tag = self.ViterbiDecode(self.corpus[i]) # print 'y:',self.tag[i] # print 'p:',tag for index in range(len(tag)): pre = tag[index] # print self.tag[j] real = self.tag[i][index] # print pre, real if pre != real: error_count += 1 return 1 - float(error_count) / tagnums def segmentation(self, outfile): output = codecs.open(outfile, 'w', 'utf-8') start = time.clock() for i in range(self.corpus_num): taglist = self.ViterbiDecode(self.corpus[i]) wordlist = self.tag2word(self.corpus[i], taglist) for j in range(len(wordlist)): output.write(wordlist[j]) output.write(' ') output.write("\n") print "Decode:", time.clock() - start output.close() def train(self, trainfile, batch_num=100, max_iter=200, learn_rate=1.0, delta_thrd=0.001, is_average=True): # 
self.makelibsvmdata(r'train.data',max_corpus) print "Start training process." self.perceptron.loadFeatSize(self.dimension, len(self.state)) self.perceptron.read_train_file(trainfile) self.perceptron.printinfo() self.perceptron.train_sgd(max_iter, learn_rate, delta_thrd, is_average) self.perceptron.saveModel() print "Training process done." print "Multi-class Perceptron Model had been saved." def printstr(self, wordlist): for item in wordlist: print item print " " def makeLibSvmData(self, output_file, corpus_num=-1): print "Making training data.", filecount = 1 output_data = codecs.open(output_file, 'w') if corpus_num == -1: corpus_num = self.corpus_num for i in range(corpus_num): taglist = self.tag[i] features = self.GetFeature(self.corpus[i]) vec = self.Feature2Vec(features) for j in range(len(taglist)): output_data.write(str(self.state.index(taglist[j]))) output_data.write('\t') keyset = list(vec[j].keys()) keyset = sorted(keyset) if len(keyset) < 1: output_data.write('0:1') for key in keyset: output_data.write(str(key)) output_data.write(':') output_data.write(str(vec[j][key])) output_data.write(' ') output_data.write("\n") output_data.close() print "\nMaking training data finished." return filecount def classifiy_score(self, featureVec): return self.perceptron.scoreout(featureVec) # return self.perceptron.probout(featureVec) def getEmitPrb(self, score): """ Get emits_prb use softmax function """ max_score = max(score.values()) emit_prb = {} expsum = 0. for key in score.keys(): emit_prb[key] = math.exp(score[key] - max_score) expsum += emit_prb[key] for key in score.keys(): emit_prb[key] /= expsum emit_prb[key] = math.log(emit_prb[key]) return emit_prb def ViterbiDecode(self, sentence): N = len(sentence) # length of the sentence prb = 0. prb_max = 0. 
toward = list() back = list() # get the feature Vector of every single character features = self.GetFeature(sentence) vec = self.Feature2Vec(features) for i in range(N): toward.append({}) back.append({}) for j in self.state: toward[i][j] = float('-inf') back[i][j] = ' ' # run viterbi score = self.classifiy_score(vec[0]) emit_prb = self.getEmitPrb(score) # print emit_prb for s in self.state: toward[0][s] = self.init_prb[s] + emit_prb[s] back[0][s] = 'end' # toward algorithm for t in range(1, N): score = self.classifiy_score(vec[t]) # print score emit_prb = self.getEmitPrb(score) for s in self.state: prb = float('-inf') prb_max = float('-inf') state_max = 'S' for i in self.state: prb = toward[t - 1][i] + self.trans_prb[i][s] + emit_prb[s] if prb > prb_max: prb_max = prb state_max = i toward[t][s] = prb_max back[t][s] = state_max # backward algorithm to get the best tag sequence index = N - 1 taglist = [] prb_max = float('-inf') state_max = '' for s in self.state: prb = toward[N - 1][s] if prb > prb_max: prb_max = prb state_max = s taglist.append(state_max) while index >= 1: pre_state = back[index][taglist[0]] taglist.insert(0, pre_state) index -= 1 if taglist[-1] == 'B': taglist[-1] = 'S' elif taglist[-1] == 'M': taglist[-1] == 'E' return taglist def GetFeature(self, sent): """ get feature for every single character return a list of features """ features = [] for i in range(len(sent)): left2 = sent[i - 2] if i - 2 >= 0 else '#' left1 = sent[i - 1] if i - 1 >= 0 else '#' mid = sent[i] right1 = sent[i + 1] if i + 1 < len(sent) else '#' right2 = sent[i + 2] if i + 2 < len(sent) else '#' # print self.dict.dic.has_key(mid), if self.dict.dic.has_key(mid): MWL = str(self.dict.dic[mid][0]) t0 = self.dict.dic[mid][1] # print MWL,t0 else: MWL = '0' t0 = '#' # print MWL,t0 featcode = 0 chars = [left2, left1, mid, right1, right2] for i in range(len(chars)): if chars[i].encode('utf-8') in self.nums: featcode += 0 elif chars[i] in self.dates: featcode += 5**i elif (u"a" <= 
chars[i] and chars[i] <= u"z") or (u"A" <= chars[i] and chars[i] <= u"Z"): featcode += 5**i * 2 elif chars[i].encode('utf-8') in self.names: featcode += 5**i * 3 else: featcode += 5**i * 4 featcode += 1 feat = [left2, left1, mid, right1, right2, left2 + left1, left1 + mid, mid + right1, right1 + right2, left1 + right1, MWL + t0, left1 + t0, mid + t0, right1 + t0, featcode] features.append(feat) return features def Feature2Vec(self, feats): """ get feature vector from feature the paramters feats mean is a list of features of every character """ punctuation = [u'。', u',', u'?', u'!', u'、', u';', u':', u'「', '」', u'『', u'』', u'‘', u'’', u'“', u'”', u'(', u')', u'〔', u'〕', u'【', u'】', u'——', u'–', u'…', u'.', u'·', u'《', u'》', u'〈', u'〉'] featVecs = [] for feat in feats: featVec = {} # if feat[2] in punctuation: # featVec[0] = 1 for it in range(len(feat)): if it < 5: if self.unigram_feat_id.has_key(feat[it]): key = self.unigram_feat_id[feat[it]]+self.unigram_feat_num*it featVec[key] = 1 elif it < 10: if self.bigram_feat_id.has_key(feat[it]): key = self.bigram_feat_id[feat[it]] key += self.unigram_feat_num*5 + self.bigram_feat_num*(it-5) featVec[key] = 1 elif it < 14: if self.dict_feat_id.has_key(feat[it]): key = self.dict_feat_id[feat[it]] key += self.unigram_feat_num*5 + self.bigram_feat_num*5 + self.dict_feat_num*(it-10) featVec[key] = 1 else: key = feat[it] key += self.unigram_feat_num*5 + self.bigram_feat_num*5 + self.dict_feat_num*4 featVec[key] = 1 featVecs.append(featVec) return featVecs def getTag(self, wordlist): """get the tag for every char in the word""" taglist = [] for word in wordlist: if len(word) == 1: taglist.append('S') else: taglist.append('B') for w in word[1:len(word) - 1]: taglist.append('M') taglist.append('E') return taglist def tag2word(self, sentence, taglist): wordlist = [] tmp = '' for i in range(len(taglist)): if taglist[i] == 'S': tmp = sentence[i] wordlist.append(tmp) tmp = '' elif taglist[i] == 'B': tmp += sentence[i] elif taglist[i] == 
'M': tmp += sentence[i] else: tmp += sentence[i] wordlist.append(tmp) tmp = '' return wordlist def loadTestCorpus(self, corpus_file): print "Loading Test Corpus data", input_data = codecs.open(corpus_file, 'r', 'utf-8') for line in input_data.readlines(): rawText = line.strip() if rawText == '': continue else: self.corpus_num += 1 if self.corpus_num % 1000 == 0 and self.corpus_num != 0: print '.', wordlist = rawText.split() sentence = "".join(wordlist) self.corpus.append(sentence) # add to x, i.d. the corpus print "\nLoading Test Corpus done." def loadCorpus(self, corpus_file): print "Loading Corpus data", input_data = codecs.open(corpus_file, 'r', 'utf-8') for line in input_data.readlines(): rawText = line.strip() if rawText == '': continue else: self.corpus_num += 1 if self.corpus_num % 1000 == 0 and self.corpus_num != 0: print '.', wordlist = rawText.split() taglist = self.getTag(wordlist) self.tag.append(taglist) # add to y, i.d. the tags list sentence = "".join(wordlist) self.corpus.append(sentence) # add to x, i.d. the corpus print "\nLoading Corpus done." def pretreatment(self, train_file): print "The process of corpus Pretreatment", input_data = codecs.open(train_file, 'r', 'utf-8') for line in input_data.readlines(): rawText = line.strip() if rawText == '': continue else: self.corpus_num += 1 if self.corpus_num % 1000 == 0 and self.corpus_num != 0: print '.', wordlist = rawText.split() taglist = self.getTag(wordlist) self.tag.append(taglist) # add to y, i.d. the tags list sentence = "".join(wordlist) self.corpus.append(sentence) # add to x, i.d. 
the corpus self.init_prb[taglist[0]] += 1 for t in range(1, len(taglist)): self.trans_prb[taglist[t - 1]][taglist[t]] += 1 feats = self.GetFeature(sentence) # record the feats, allocate the id of feature for feat in feats: for it in range(len(feat)): if it < 5: # unigram feature if not self.unigram_feat_id.has_key(feat[it]): self.unigram_feat_num += 1 self.unigram_feat_id[ feat[it]] = self.unigram_feat_num elif it < 10: # bigram feature if not self.bigram_feat_id.has_key(feat[it]): self.bigram_feat_num += 1 self.bigram_feat_id[ feat[it]] = self.bigram_feat_num elif it < 14: # dictionary information feature if not self.dict_feat_id.has_key(feat[it]): self.dict_feat_num += 1 self.dict_feat_id[feat[it]] = self.dict_feat_num # calculate the probability of tag initsum = sum(self.init_prb.values()) for key in self.init_prb.keys(): self.init_prb[key] = float(self.init_prb[key]) / initsum for x in self.trans_prb.keys(): tmpsum = sum(self.trans_prb[x].values()) for y in self.trans_prb[x].keys(): self.trans_prb[x][y] = float(self.trans_prb[x][y]) / tmpsum self.dimension = self.unigram_feat_num * 5 + \ self.bigram_feat_num * 5 + self.dict_feat_num * 4 + self.type_feat_num # calc the log probability for s in self.state: if self.init_prb[s] != 0.: self.init_prb[s] = math.log(self.init_prb[s]) else: self.init_prb[s] = float('-inf') for j in self.state: if self.trans_prb[s][j] != 0.: self.trans_prb[s][j] = math.log(self.trans_prb[s][j]) else: self.trans_prb[s][j] = float('-inf') print "\nProcess of pretreatment finished."
def main():
    """Train three one-vs-one perceptrons and logistic regressions on two
    iris features (petal length/width) and plot each combined classifier's
    decision regions on the held-out test set."""
    iris = datasets.load_iris()
    features = iris.data[:, [2, 3]]
    targets = iris.target
    x_train, x_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.3, random_state=1, stratify=targets)

    # =============== Perceptron ====================
    # Classifier 1: trained on classes {0, 1}; class 0 -> +1, class 1 -> -1.
    y1 = np.copy(y_train)
    y1 = y1[(y1 != 2)]
    x1 = np.copy(x_train)
    x1 = x1[(y_train != 2)]
    y1[(y1 != 0)] = -1
    y1[(y1 != -1)] = 1
    perceptron1 = Perceptron(learningRate=0.1, iterationsToStop=10)
    perceptron1.learn(x1, y1)

    # Classifier 2: trained on classes {0, 2}; class 2 -> +1, class 0 -> -1.
    y2 = np.copy(y_train)
    y2 = y2[(y2 != 1)]
    x2 = np.copy(x_train)
    x2 = x2[(y_train != 1)]
    y2[(y2 != 2)] = -1
    y2[(y2 != -1)] = 1
    perceptron2 = Perceptron(learningRate=0.1, iterationsToStop=10)
    perceptron2.learn(x2, y2)

    # Classifier 3: trained on classes {1, 2}; class 1 stays +1, class 2 -> -1.
    y3 = np.copy(y_train)
    y3 = y3[(y3 != 0)]
    x3 = np.copy(x_train)
    x3 = x3[(y_train != 0)]
    y3[(y3 != 1)] = -1
    perceptron3 = Perceptron(learningRate=0.35, iterationsToStop=850)
    perceptron3.learn(x3, y3)

    multiPerceptron = MultiPerceptron(perceptron1, perceptron2, perceptron3)
    plot_decision_regions(X=x_test, y=y_test, classifier=multiPerceptron)
    plt.xlabel(r'$x_1$')
    plt.ylabel(r'$x_2$')
    plt.title('Perceptron')
    plt.legend(loc='upper left')
    plt.show()

    # =============== Logistic regression ====================
    # Reuse the perceptron subsets, remapping -1 labels to 0 for the
    # sigmoid-based models.
    y1[(y1 != 1)] = 0
    logisticRegression1 = LogisticRegression(learningRate=0.05,
                                             iterationsToStop=1000,
                                             random_state=1)
    logisticRegression1.learn(x1, y1)
    logisticRegression1.printProbability(x1)

    y2[(y2 != 1)] = 0
    logisticRegression2 = LogisticRegression(learningRate=0.05,
                                             iterationsToStop=1000,
                                             random_state=1)
    logisticRegression2.learn(x2, y2)
    logisticRegression2.printProbability(x2)

    y3[(y3 != 1)] = 0
    logisticRegression3 = LogisticRegression(learningRate=0.15,
                                             iterationsToStop=1500,
                                             random_state=1)
    logisticRegression3.learn(x3, y3)
    logisticRegression3.printProbability(x3)

    multiLogisticRegression = MultiLogisticRegression(logisticRegression1,
                                                      logisticRegression2,
                                                      logisticRegression3)
    plot_decision_regions(X=x_test, y=y_test,
                          classifier=multiLogisticRegression)
    plt.xlabel(r'$x_1$')
    plt.ylabel(r'$x_2$')
    plt.title('Logistic regression')
    plt.legend(loc='lower right')
    plt.show()