def demo():
    """Run a small end-to-end demonstration on the pickled passage set.

    Loads the passages stored in ``ustcpassages_503.pkl``, prints how many
    there are, sorts them in place by ``score`` (highest first), trains an
    ``EssayModel`` on them and prints its ``triGramDicts``.  For every
    passage it then prints, space separated: the score, the number of
    trigrams, the model's ``wordCombScore`` and that score divided by the
    number of trigrams.  Finally the language, content and structure
    extractors are run on the last passage of the sorted list; their return
    values are discarded.
    """
    # Pickles must be read in binary mode; the context manager also closes
    # the file if unpickling raises.
    # NOTE: pickle.load can execute arbitrary code -- only use a trusted file.
    with open('ustcpassages_503.pkl', 'rb') as pkfile:
        passages = pickle.load(pkfile)
    print(len(passages))

    # Same ordering as the former cmp-based sort (descending, stable).
    passages.sort(key=lambda passage: passage.score, reverse=True)

    model = EssayModel()
    model.train(passages)
    print(model.triGramDicts)

    for p in passages:
        c = model.wordCombScore(p)
        trigram_count = len(p.trigrams)
        # Guard against passages without trigrams instead of raising
        # ZeroDivisionError in the middle of the report.
        per_trigram = c * 1.0 / trigram_count if trigram_count else 0.0
        # %-formatting keeps the output identical to the Python 2 print
        # statement while remaining valid syntax on Python 3.
        print("%s %s %s %s" % (p.score, trigram_count, c, per_trigram))

    extractor = FeatherExtractor(model)
    last_passage = passages[-1]
    extractor.extractLangFeather(last_passage)
    extractor.extractContentFeather(last_passage)
    extractor.extractStructureFeather(last_passage)
def demo():
    """Run a small end-to-end demonstration on the pickled passage set.

    Loads the passages stored in ``ustcpassages_503.pkl``, prints how many
    there are, sorts them in place by ``score`` (highest first), trains an
    ``EssayModel`` on them and prints its ``triGramDicts``.  For every
    passage it then prints, space separated: the score, the number of
    trigrams, the model's ``wordCombScore`` and that score divided by the
    number of trigrams.  Finally the language, content and structure
    extractors are run on the last passage of the sorted list; their return
    values are discarded.
    """
    # Pickles must be read in binary mode; the context manager also closes
    # the file if unpickling raises.
    # NOTE: pickle.load can execute arbitrary code -- only use a trusted file.
    with open('ustcpassages_503.pkl', 'rb') as pkfile:
        passages = pickle.load(pkfile)
    print(len(passages))

    # Same ordering as the former cmp-based sort (descending, stable).
    passages.sort(key=lambda passage: passage.score, reverse=True)

    model = EssayModel()
    model.train(passages)
    print(model.triGramDicts)

    for p in passages:
        c = model.wordCombScore(p)
        trigram_count = len(p.trigrams)
        # Guard against passages without trigrams instead of raising
        # ZeroDivisionError in the middle of the report.
        per_trigram = c * 1.0 / trigram_count if trigram_count else 0.0
        # %-formatting keeps the output identical to the Python 2 print
        # statement while remaining valid syntax on Python 3.
        print("%s %s %s %s" % (p.score, trigram_count, c, per_trigram))

    extractor = FeatherExtractor(model)
    last_passage = passages[-1]
    extractor.extractLangFeather(last_passage)
    extractor.extractContentFeather(last_passage)
    extractor.extractStructureFeather(last_passage)
def train(self, passages):
    """Train the essay model and fit a GLS regression on passage features.

    Steps, in order:

    1. Run ``essayprepare.processPassage`` on every passage whose
       ``preprocessed`` flag is false.
    2. Sort ``passages`` in place by ``score`` (highest first), train an
       ``EssayModel`` on them and store it as ``self.models['1']``.
    3. Create ``self.extractor`` and attach the language, content and
       structure features to each passage as ``lf``, ``cf`` and ``sf``.
    4. Write one line per passage to ``fs_train.txt``:
       ``<id> <score> <feature> <feature> ...``.
    5. Fit ``sm.GLS`` with the integer scores (raised to a minimum of 40)
       as the dependent variable and the feature lists as regressors; the
       model is stored as ``self.gls_model``.

    Args:
        passages: mutable list of passage objects.  It is reordered and the
            passages are annotated in place.
    """
    # Preprocess passages that have not been processed yet.
    for p in passages:
        if not p.preprocessed:
            essayprepare.processPassage(p)

    # Train the essay model.  Same ordering as the former cmp-based sort
    # (descending, stable).
    # NOTE(review): scores are compared as stored; if they are strings the
    # order is lexicographic -- confirm against the passage loader.
    passages.sort(key=lambda passage: passage.score, reverse=True)
    model = EssayModel()
    model.train(passages)
    self.models['1'] = model

    # Extract the features of every passage.
    self.extractor = FeatherExtractor(model)
    for p in passages:
        p.lf = self.extractor.extractLangFeather(p)
        p.cf = self.extractor.extractContentFeather(p)
        p.sf = self.extractor.extractStructureFeather(p)

    # Build the regression inputs and dump the feature values to disk.
    endog = []
    exog = []
    # Only consumed by the disabled SVM classifier experiment
    # (svmutil.svm_train(labels, exog, '-c 3')).
    labels = []
    # The context manager closes the dump even if a passage fails mid-loop.
    with open('fs_train.txt', 'w') as f:
        for p in passages:
            score = int(p.score)
            # Lower clamp on the regression target; the matching upper
            # clamp (95) is disabled.  The file still gets the raw score.
            if score < 40:
                score = 40
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)
            labels.append(p.label)
            f.write(p.id + ' ' + str(p.score)
                    + ''.join(' ' + str(v) for v in x) + '\n')

    # Linear regression model training.  An earlier experiment projected
    # exog through PCA (PCA_svd) before fitting; it is disabled.
    endog = np.array(endog)
    exog = np.array(exog)
    self.gls_model = sm.GLS(endog, exog)
    # NOTE(review): the value returned by fit() is discarded -- confirm the
    # installed library keeps the fitted results on the model object.
    self.gls_model.fit()
def train(self, passages):
    """Train the essay model and fit a GLS regression on passage features.

    Steps, in order:

    1. Run ``essayprepare.processPassage`` on every passage whose
       ``preprocessed`` flag is false.
    2. Sort ``passages`` in place by ``score`` (highest first), train an
       ``EssayModel`` on them and store it as ``self.models['1']``.
    3. Create ``self.extractor`` and attach the language, content and
       structure features to each passage as ``lf``, ``cf`` and ``sf``.
    4. Write one line per passage to ``fs_train.txt``:
       ``<id> <score> <feature> <feature> ...``.
    5. Fit ``sm.GLS`` with the integer scores as the dependent variable and
       the feature lists as regressors; the model is stored as
       ``self.gls_model``.

    Args:
        passages: mutable list of passage objects.  It is reordered and the
            passages are annotated in place.
    """
    # Preprocess passages that have not been processed yet.
    for p in passages:
        if not p.preprocessed:
            essayprepare.processPassage(p)

    # Train the essay model.  Same ordering as the former cmp-based sort
    # (descending, stable).
    # NOTE(review): scores are compared as stored; if they are strings the
    # order is lexicographic -- confirm against the passage loader.
    passages.sort(key=lambda passage: passage.score, reverse=True)
    model = EssayModel()
    model.train(passages)
    self.models['1'] = model

    # Extract the features of every passage.
    self.extractor = FeatherExtractor(model)
    for p in passages:
        p.lf = self.extractor.extractLangFeather(p)
        p.cf = self.extractor.extractContentFeather(p)
        p.sf = self.extractor.extractStructureFeather(p)

    # Build the feature vectors and dump the feature values to disk.
    endog = []
    exog = []
    # Only consumed by the disabled SVM classifier experiment
    # (svmutil.svm_train(labels, exog, '-c 3')).
    labels = []
    # The context manager closes the dump even if a passage fails mid-loop.
    with open('fs_train.txt', 'w') as f:
        for p in passages:
            # Score clamping (to the 35..90 range) is disabled here, so the
            # regression target is the plain integer score.
            score = int(p.score)
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)
            labels.append(p.label)
            f.write(p.id + ' ' + str(p.score)
                    + ''.join(' ' + str(v) for v in x) + '\n')

    # Linear regression model training.  An earlier experiment projected
    # exog through PCA (PCA_svd) before fitting; it is disabled.
    endog = np.array(endog)
    exog = np.array(exog)
    self.gls_model = sm.GLS(endog, exog)
    # NOTE(review): the value returned by fit() is discarded -- confirm the
    # installed library keeps the fitted results on the model object.
    self.gls_model.fit()