Example #1
0
def wordRepetitiveDemo():
    print "wordRepetitiveDemo start..."
    pkfile = open('ustcpassages_503.pkl', 'r')
    passages = pickle.load(pkfile)
    pkfile.close()

    f = FeatherExtractor()
    for p in passages[:]:
        lf = f.extractLangFeather(p)
        p.lf = lf

    f = open('wordrep.txt', 'w')
    for p in passages[:]:
        if p.lf:
            for l in p.lf.lemmaUseInfo:
                print p.id, p.score, l[0], l[1], l[2], l[3], l[4], l[5]
                s = ' '.join([
                    str(p.id),
                    str(p.score),
                    str(p.lf.overlyUseWordCount), l[0],
                    str(l[1]),
                    str(l[2]),
                    str(l[3]),
                    str(l[4]),
                    str(l[5])
                ])
                f.write(s)
                f.write('\n')
    f.close()

    print "wordRepetitiveDemo over!!!"
Example #2
0
    def rate_by_params(self, passage):
        """Score *passage* by linear prediction, then apply score filters.

        Preprocesses and feather-extracts the passage if needed, computes
        the dot product of its feather vector with the trained model
        parameters, and adds each filter's correction to the raw score.

        Returns a one-element list [adjusted rateScore].
        """
        # Linear prediction
        extractor = FeatherExtractor(None)
        if not passage.preprocessed:
            essayprepare.processPassage(passage)
        passage.lf = extractor.extractLangFeather(passage)
        passage.cf = extractor.extractContentFeather(passage)
        passage.sf = extractor.extractStructureFeather(passage)

        x = self.__getFeatherList(passage)
        score = dot(x, self.model_params)

        passage.rateScore = score
        passage.endogScore = score

        # Adjust the score: each filter contributes an additive correction.
        passage.filter_scores = []
        filters = [
            self.tokenCountFilter, self.sentenceLengthAverageFilter,
            self.wordLengthAverageFilter, self.aclWordCountFilter,
            self.noneStopWordLengthAverageFilter, self.nounRatioFilter,
            self.total_score_filter
        ]

        # 'flt' avoids shadowing the builtin filter().
        for flt in filters:
            delta = flt(passage)
            passage.rateScore += delta
            passage.filter_scores.append(delta)

        passage.rated = True
        return [passage.rateScore]
    def train(self, passages):
        # pre-process passage
        i = 1
        for p in passages:
            print "======================="
            print "Passage", i, p.id
            if not p.preprocessed: essayprepare.processPassage(p)
            i += 1

        self.extractor = FeatherExtractor(None)
        for p in passages:
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)

        # save feathers
        f = open('fs_zhang_train.txt', 'w')
        for p in passages:
            x = self.__getFeatherList(p)
            f.write(p.id + ' ')
            f.write(str(p.score))
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
        f.close()

        # generate feather vector
        endog = []
        exog = []
        for p in passages:
            score = int(p.score)
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)

        # train model
        endog = np.array(endog)
        exog = np.array(exog)

        self.gls_model = sm.GLS(endog, exog)
        results = self.gls_model.fit()
        #print results.summary()
        print results.params
Example #4
0
def demo_one_sentence():
    # 文章
    passage = EssayPassage()
    passage.passage = 'I am a students.'
    passage.title = 'title'
    passage.score = 5
    passage.id = '1'
    passage.reviewerId = 3
    passage.content = 'I am a students.'

    # 处理文章
    essayprepare.processPassage(passage)

    extractor = FeatherExtractor()
    lf = extractor.extractLangFeather(passage)
    passage.lf = lf
    cf = extractor.extractContentFeather(passage)
    sf = extractor.extractStructureFeather(passage)

    print 'OK'
Example #5
0
def generatePassageFeathers(passages, outFilename):
    f = open(outFilename, 'w')

    e = FeatherExtractor()

    i = 1

    for p in passages:
        print "Passage ", i
        # 处理文章
        essayprepare.processPassage(p)
        # 提取语言特征
        languageFeather = e.extractLangFeather(p)
        p.lf = languageFeather
        # 提取结构特征
        structureFeather = e.extractStructureFeather(p)
        p.sf = structureFeather

        f.write(p.id + ' ')
        f.write(str(p.score))
        f.write(' ' + str(languageFeather))
        f.write('\n')
        i += 1
    f.close()
Example #6
0
            print "SPELLERROR", errorTokens
            print tokens
            print tags
            print lemmas
            print stems
            print levels
            print sentNos
            print paraNos
            print nos
            print sent.tokenCount
            print sent.wordCount
            print sent.realWordCount

    print "三元词组", passage.trigrams

    e = FeatherExtractor()

    # 提取语言特征
    languageFeather = e.extractLangFeather(passage)

    print u"词次总数", languageFeather.tokenCount
    print u"单词总数", languageFeather.wordCount
    print u"词形总数", languageFeather.wordTypeCount
    print u"词元总数", languageFeather.wordLemmaCount

    print u"介词个数", languageFeather.prepositionCount
    print u"介词比例", languageFeather.prepositionRatio
    print u"介词使用", languageFeather.prepositionUse

    print u"定冠词个数", languageFeather.definiteArticleCount
    print u"定冠词比例", languageFeather.definiteArticleRatio
Example #7
0
    def processEssay(self):
        self.browser.clear()
        id = unicode(self.lineedit.text())
        essay = self.essayDict.get(id)
        if not essay:
            self.browser.append("<font color=red>%s is not found!</font>" % id)
            return

        self.browser.append(essay.content)

        # 文章
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id

        # 处理文章
        essayprepare.processPassage(passage)

        # 输出来看看是啥样子
        self.browser.append("PASSAGE=========================================")
        self.browser.append(passage.id)
        #self.browser.append(passage.title)
        self.browser.append(passage.score)
        self.browser.append(passage.passage)
        self.browser.append(str(len(passage.paragraphs)))
        self.browser.append(
            "PARAGRAPHS---------------------------------------")
        for para in passage.paragraphs:
            self.browser.append(str(para.paragraphNo))
            self.browser.append(para.paragraph)
            for sent in para.sentences:
                self.browser.append(str(sent.sentenceNo))
                self.browser.append(str(sent.paragraphSentenceNo))
                self.browser.append(sent.sentence)
                tokens = [token.token for token in sent.tokens]
                tags = [token.pos for token in sent.tokens]
                lemmas = [token.lemma for token in sent.tokens]
                stems = [token.stem for token in sent.tokens]
                levels = [token.level for token in sent.tokens]
                nos = [token.tokenNo for token in sent.tokens]
                sentNos = [token.sentenceTokenNo for token in sent.tokens]
                paraNos = [token.paragraphTokenNo for token in sent.tokens]
                errorTokens = [
                    token.token for token in sent.tokens if token.isSpellError
                ]
                if not sent.canParsed:
                    self.browser.append(
                        "<font color=red>SENTENCE ERROR</font>")
                self.browser.append("<font color=red>SPELLERROR %s</font>" %
                                    str(errorTokens))
                self.browser.append(str(tokens))
                self.browser.append(str(tags))
                self.browser.append(str(lemmas))
                self.browser.append(str(stems))
                self.browser.append(str(levels))
                self.browser.append(str(sentNos))
                self.browser.append(str(paraNos))
                self.browser.append(str(nos))
                self.browser.append(str(sent.tokenCount))
                self.browser.append(str(sent.wordCount))
                self.browser.append(str(sent.realWordCount))

        self.browser.append(u"三元词组" + ' ' + str(passage.trigrams))

        e = FeatherExtractor()

        # 提取语言特征
        languageFeather = e.extractLangFeather(passage)

        print u"词次总数", languageFeather.tokenCount
        print u"单词总数", languageFeather.wordCount
        print u"词形总数", languageFeather.wordTypeCount
        print u"词元总数", languageFeather.wordLemmaCount

        print u"介词个数", languageFeather.prepositionCount
        print u"介词比例", languageFeather.prepositionRatio
        print u"介词使用", languageFeather.prepositionUse

        print u"定冠词个数", languageFeather.definiteArticleCount
        print u"定冠词比例", languageFeather.definiteArticleRatio
        print u"定冠词使用", languageFeather.definiteArticleUse

        # 提取结构特征
        #structureFeather = e.extractStructureFeather(passage)

        #generateUSTCFeathers('USTC2011Jan.txt', 'USTCFeathers_503.txt')

        print "...OVER"
Example #8
0
    def train(self, passages):
        """Train the scoring model on *passages*.

        Preprocesses the passages, trains the EssayModel, extracts
        feathers, dumps the feather vectors to 'fs_train.txt', and fits a
        GLS regression of (clamped) score on feathers.

        (Original comments were mojibake from a broken encoding; they are
        reconstructed here from the matching Chinese-commented variant.)
        """
        # Pre-process passages (the original also kept an unused counter)
        for p in passages:
            if not p.preprocessed:
                essayprepare.processPassage(p)

        # Train the essay model on passages sorted by descending score
        passages.sort(cmp=lambda x, y: cmp(x.score, y.score), reverse=True)

        model = EssayModel()
        model.train(passages)
        self.models['1'] = model

        # Extract feathers
        self.extractor = FeatherExtractor(model)
        for p in passages:
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)

        # Generate feather vectors and dump them to disk; 'with' ensures
        # the file is closed even if feather listing fails mid-loop.
        endog = []
        exog = []
        labels = []
        with open('fs_train.txt', 'w') as f:
            for p in passages:
                score = int(p.score)
                # Clamp very low scores to reduce outlier influence.
                if score < 40:
                    score = 40
                endog.append(score)
                x = self.__getFeatherList(p)
                exog.append(x)
                labels.append(p.label)

                f.write(p.id + ' ')
                f.write(str(p.score))
                for v in x:
                    f.write(' ' + str(v))
                f.write('\n')

        # SVM classifier training (disabled)
        #self.svm_model = svmutil.svm_train(labels, exog, '-c 3')

        # Linear regression model training
        endog = np.array(endog)
        exog = np.array(exog)

        self.gls_model = sm.GLS(endog, exog)
        # NOTE(review): fit() returns a results object that is discarded
        # here — confirm the fitted params are recovered elsewhere.
        self.gls_model.fit()
Example #9
0
    def train(self, passages):
        """Train the scoring model on *passages*.

        Preprocesses the passages, trains the EssayModel, extracts
        feathers, dumps the feather vectors to 'fs_train.txt', and fits a
        GLS regression of score on feathers.
        """
        # Pre-process passages (the original also kept an unused counter)
        for p in passages:
            if not p.preprocessed:
                essayprepare.processPassage(p)

        # Train the essay model on passages sorted by descending score
        passages.sort(cmp=lambda x, y: cmp(x.score, y.score), reverse=True)

        model = EssayModel()
        model.train(passages)
        self.models['1'] = model

        # Extract feathers
        self.extractor = FeatherExtractor(model)
        for p in passages:
            p.lf = self.extractor.extractLangFeather(p)
            p.cf = self.extractor.extractContentFeather(p)
            p.sf = self.extractor.extractStructureFeather(p)

        # Generate feather vectors and dump them to disk; 'with' ensures
        # the file is closed even if feather listing fails mid-loop.
        endog = []
        exog = []
        labels = []
        with open('fs_train.txt', 'w') as f:
            for p in passages:
                # Score clamping is disabled in this variant.
                endog.append(int(p.score))
                x = self.__getFeatherList(p)
                exog.append(x)
                labels.append(p.label)

                f.write(p.id + ' ')
                f.write(str(p.score))
                for v in x:
                    f.write(' ' + str(v))
                f.write('\n')

        # SVM classifier training (disabled)
        #self.svm_model = svmutil.svm_train(labels, exog, '-c 3')

        # Linear regression model training
        endog = np.array(endog)
        exog = np.array(exog)

        self.gls_model = sm.GLS(endog, exog)
        # NOTE(review): fit() returns a results object that is discarded
        # here — confirm the fitted params are recovered elsewhere.
        self.gls_model.fit()