Example #1
0
def demo():
    """Run the essay model end to end on a pickled set of passages.

    Steps, in order:

    1. Unpickle the passages from ``ustcpassages_503.pkl`` and print how
       many were loaded.
    2. Sort them in place by ``score``, highest first.
    3. Train an ``EssayModel`` on them and print ``model.triGramDicts``.
    4. For every passage print its score, its trigram count, the value of
       ``model.wordCombScore`` and that value divided by the trigram count.
    5. Run the three ``FeatherExtractor`` extraction methods on the last
       (lowest-scored) passage; their return values are not used here.
    """
    # NOTE(review): unpickling executes code embedded in the file, so this
    # must only ever be pointed at a trusted pickle.
    # NOTE(review): the text mode 'r' is kept from the original; a pickle
    # written with a binary protocol needs 'rb' -- confirm how it was dumped.
    # The ``with`` block guarantees the handle is closed even if loading fails.
    with open('ustcpassages_503.pkl', 'r') as pkfile:
        passages = pickle.load(pkfile)

    print(len(passages))

    # Descending by score; same ordering (and stability) as the previous
    # cmp-based sort, but also valid on interpreters without ``cmp=``.
    passages.sort(key=lambda passage: passage.score, reverse=True)

    model = EssayModel()
    model.train(passages)
    print(model.triGramDicts)

    for p in passages:
        comb_score = model.wordCombScore(p)
        trigram_count = len(p.trigrams)
        # A passage without trigrams used to raise ZeroDivisionError here.
        ratio = comb_score * 1.0 / trigram_count if trigram_count else 0.0
        print('%s %s %s %s' % (p.score, trigram_count, comb_score, ratio))

    extractor = FeatherExtractor(model)
    lowest_scored = passages[-1]
    extractor.extractLangFeather(lowest_scored)
    extractor.extractContentFeather(lowest_scored)
    extractor.extractStructureFeather(lowest_scored)
Example #2
0
def demo():
    """Smoke-test the essay model against ``ustcpassages_503.pkl``.

    Loads the pickled passages, prints their count, sorts them in place by
    descending ``score``, trains an ``EssayModel`` and prints its
    ``triGramDicts``.  One line is then printed per passage with: score,
    number of trigrams, ``model.wordCombScore(p)`` and that score divided by
    the number of trigrams.  Finally the language, content and structure
    extraction methods of ``FeatherExtractor`` are called on the last passage
    of the sorted list (the one with the lowest score); the values they
    return are discarded.
    """
    # NOTE(review): only load pickles from a trusted source -- unpickling
    # can run arbitrary code.
    # NOTE(review): mode 'r' is unchanged from the original; binary pickle
    # protocols require 'rb' -- confirm how the file was produced.
    # Using ``with`` closes the file even when ``pickle.load`` raises.
    with open('ustcpassages_503.pkl', 'r') as pkfile:
        passages = pickle.load(pkfile)

    print(len(passages))

    # Highest score first.  ``key=`` yields the same order as the former
    # ``cmp=`` comparator and keeps equal-score passages in input order.
    passages.sort(key=lambda passage: passage.score, reverse=True)

    model = EssayModel()
    model.train(passages)
    print(model.triGramDicts)

    for p in passages:
        comb_score = model.wordCombScore(p)
        trigram_count = len(p.trigrams)
        # Guard the average: an empty trigram list previously crashed with
        # ZeroDivisionError.
        if trigram_count:
            per_trigram = comb_score * 1.0 / trigram_count
        else:
            per_trigram = 0.0
        print('%s %s %s %s' % (p.score, trigram_count, comb_score, per_trigram))

    extractor = FeatherExtractor(model)
    lowest_scored = passages[-1]
    extractor.extractLangFeather(lowest_scored)
    extractor.extractContentFeather(lowest_scored)
    extractor.extractStructureFeather(lowest_scored)
Example #3
0
    def train(self, passages):
        """Train the essay model and fit a GLS regression on passage features.

        Side effects, in order:

        * every passage whose ``preprocessed`` flag is false is passed to
          ``essayprepare.processPassage``;
        * ``passages`` is sorted **in place** by ``score``, highest first;
        * a new ``EssayModel`` is trained and stored in ``self.models['1']``;
        * ``self.extractor`` is set to a ``FeatherExtractor`` for that model
          and each passage receives ``lf``, ``cf`` and ``sf`` attributes;
        * one line per passage (``id score feature...``) is written to
          ``fs_train.txt`` in the current working directory;
        * ``self.gls_model`` is set to ``sm.GLS(endog, exog)`` and fitted.

        Args:
            passages: mutable list of passage objects.  The attributes read
                here are ``preprocessed``, ``score``, ``label`` and ``id``.

        Returns:
            None.
        """
        # Preprocess the passages that have not been processed yet.
        for p in passages:
            if not p.preprocessed:
                essayprepare.processPassage(p)

        # Train the essay model on the passages, highest score first.  The
        # key-based sort gives the same order as the former cmp-based one.
        passages.sort(key=lambda passage: passage.score, reverse=True)

        model = EssayModel()
        model.train(passages)
        self.models['1'] = model

        # Extract the language / content / structure features.
        self.extractor = FeatherExtractor(model)
        for p in passages:
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)

        # Build the regression inputs and dump the feature values.
        endog = []
        exog = []
        # Only needed by the SVM training that is disabled further down.
        labels = []
        # ``with`` closes the dump file even if a passage fails mid-loop.
        with open('fs_train.txt', 'w') as f:
            for p in passages:
                # Regression target: integer score, floored at 40.
                endog.append(max(int(p.score), 40))
                features = self.__getFeatherList(p)
                exog.append(features)

                labels.append(p.label)

                # The dump keeps the raw score, not the floored target.
                fields = [p.id, str(p.score)]
                fields.extend(str(v) for v in features)
                f.write(' '.join(fields) + '\n')

        # SVM classifier training (disabled):
        #self.svm_model = svmutil.svm_train(labels, exog, '-c 3')

        # Linear regression model training.  An earlier experiment that
        # projected ``exog`` through PCA before fitting is disabled and has
        # been left out here.
        endog = np.array(endog)
        exog = np.array(exog)

        self.gls_model = sm.GLS(endog, exog)
        # NOTE(review): the object returned by fit() is not kept; presumably
        # the statsmodels version in use caches the results on the model --
        # confirm before upgrading the library.
        self.gls_model.fit()
Example #4
0
    def train(self, passages):
        """Train the essay model and fit a GLS regression on passage features.

        Side effects, in order:

        * every passage whose ``preprocessed`` flag is false is passed to
          ``essayprepare.processPassage``;
        * ``passages`` is sorted **in place** by ``score``, highest first;
        * a new ``EssayModel`` is trained and stored in ``self.models['1']``;
        * ``self.extractor`` is set to a ``FeatherExtractor`` for that model
          and each passage receives ``lf``, ``cf`` and ``sf`` attributes;
        * one line per passage (``id score feature...``) is written to
          ``fs_train.txt`` in the current working directory;
        * ``self.gls_model`` is set to ``sm.GLS(endog, exog)`` and fitted.

        Args:
            passages: mutable list of passage objects.  The attributes read
                here are ``preprocessed``, ``score``, ``label`` and ``id``.

        Returns:
            None.
        """
        # Preprocess the passages that have not been processed yet.
        for p in passages:
            if not p.preprocessed:
                essayprepare.processPassage(p)

        # Train the model on the passages, highest score first.  The
        # key-based sort gives the same order as the former cmp-based one.
        passages.sort(key=lambda passage: passage.score, reverse=True)

        model = EssayModel()
        model.train(passages)
        self.models['1'] = model

        # Extract features.
        self.extractor = FeatherExtractor(model)
        for p in passages:
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)

        # Generate the feature vectors and output the feature values.
        endog = []
        exog = []
        # Only needed by the SVM training that is disabled further down.
        labels = []
        # ``with`` closes the dump file even if a passage fails mid-loop.
        with open('fs_train.txt', 'w') as f:
            for p in passages:
                # Regression target: the integer score, used unclamped (the
                # earlier clamping to the 35..90 range is disabled).
                endog.append(int(p.score))
                features = self.__getFeatherList(p)
                exog.append(features)

                labels.append(p.label)

                fields = [p.id, str(p.score)]
                fields.extend(str(v) for v in features)
                f.write(' '.join(fields) + '\n')

        # SVM classifier training (disabled):
        #self.svm_model = svmutil.svm_train(labels, exog, '-c 3')

        # Linear regression model training.  An earlier experiment that
        # projected ``exog`` through PCA before fitting is disabled and has
        # been left out here.
        endog = np.array(endog)
        exog = np.array(exog)

        self.gls_model = sm.GLS(endog, exog)
        # NOTE(review): the object returned by fit() is not kept; presumably
        # the statsmodels version in use caches the results on the model --
        # confirm before upgrading the library.
        self.gls_model.fit()