Code Example #1
File: rater.py  Project: yaokeepmoving/ustc-offTopic
def demo2():
    print "rater demo2" 

    # Read the training set
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

    # Train the scorer
    r = CollegeEssayRater()
    r.train(trains)
    
    pkfile = open('USTC2011Jan.pkl', 'w')
    pickle.dump(trains, pkfile)
    pkfile.close()  
    
    exit()    

    # Read the test set (never reached: the exit() above ends the demo)
    essays = USTCReader.parseUSTCFile("USTC2011Jan-tfidf.txt")
    tests = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        tests.append(passage)

    # Scoring test
#    for p in tests:
#        s = r.rate(p)
#        p.newscore = s[0]
#        print p.id, p.score, s
#        
#    for p in tests:
#        print p.id, p.score, p.newscore
        
    print "SVM......"
    r.predict(tests)
    
    pkfile = open('ustc_test.pkl', 'w')
    pickle.dump(tests, pkfile)
    pkfile.close()  
        
    print "demo2 over!!!"
Code Example #2
def demo():
    print "rater demo" 

    # Read the training set
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

    # Train the scorer
    r = GeneralEssayRater()
    r.train(trains)
    
    pkfile = open('zhang_trains.pkl', 'w')
    pickle.dump(trains, pkfile)
    pkfile.close()  


    # Read the test set
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        tests.append(passage)

    # Scoring test
    for p in tests:
        s = r.rate(p)
        p.newscore = s[0]
        print p.id, p.score, s
        
    for p in tests:
        print p.id, p.score, p.newscore

    
    pkfile = open('zhang_tests.pkl', 'w')
    pickle.dump(tests, pkfile)
    pkfile.close()  
        
    print "demo over!!!"
Code Example #3
def demo():
    print "rater demo"

    # Read the training set
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

    # Train the scorer
    r = GeneralEssayRater()
    r.train(trains)

    pkfile = open('zhang_trains.pkl', 'w')
    pickle.dump(trains, pkfile)
    pkfile.close()

    # Read the test set
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        tests.append(passage)

    # Scoring test
    for p in tests:
        s = r.rate(p)
        p.newscore = s[0]
        print p.id, p.score, s

    for p in tests:
        print p.id, p.score, p.newscore

    pkfile = open('zhang_tests.pkl', 'w')
    pickle.dump(tests, pkfile)
    pkfile.close()

    print "demo over!!!"
Code Example #4
def demo_one():
    content = """At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places. At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places. At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places. At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places."""

    # The essay passage
    passage = EssayPassage()
    passage.passage = content
    passage.title = 'title'
    passage.score = 5
    passage.id = '1'
    passage.reviewerId = 3
    passage.content = content

    r = SimpleEssayRater()
    s = r.rate_by_params(passage)
    passage.newscore = s[0]
    print passage.id, passage.score, s

    print 'OK'
Code Example #5
File: simplerater.py  Project: kangkona/ustc-offTopic
def demo_one():
    content = """At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places. At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places. At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places. At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places."""

    # The essay passage
    passage = EssayPassage()
    passage.passage = content
    passage.title = 'title'
    passage.score = 5
    passage.id = '1'
    passage.reviewerId = 3
    passage.content = content
       
    r = SimpleEssayRater()
    s = r.rate_by_params(passage)
    passage.newscore = s[0]
    print passage.id, passage.score, s
    
    print 'OK'
Code Example #6
def generateUSTCFeathers(ustcFilename, outFilename):
    essays = USTCReader.parseUSTCFile(ustcFilename)

    passages = []

    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        passages.append(passage)

    generatePassageFeathers(passages[:], outFilename)

    pkfile = open('ustcpassages_503_lt.pkl', 'w')
    pickle.dump(passages, pkfile)
    pkfile.close()
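
A usage sketch for this helper, mirroring the call that appears commented out in Example #12 below; the file names are taken from that call and are otherwise assumptions.

# Sketch: generate the feature file for the USTC2011Jan data set.
generateUSTCFeathers('USTC2011Jan.txt', 'USTCFeathers_503.txt')
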
Code Example #7
def demo_one_sentence():
    # The essay passage
    passage = EssayPassage()
    passage.passage = 'I am a students.'
    passage.title = 'title'
    passage.score = 5
    passage.id = '1'
    passage.reviewerId = 3
    passage.content = 'I am a students.'

    # Process the passage
    essayprepare.processPassage(passage)

    extractor = FeatherExtractor()
    lf = extractor.extractLangFeather(passage)
    passage.lf = lf
    cf = extractor.extractContentFeather(passage)
    sf = extractor.extractStructureFeather(passage)

    print 'OK'
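
The prepare-then-extract steps above recur across these examples. The sketch below wraps them in a hypothetical helper (extract_feathers is not a function in the repository), reusing only the calls already shown and storing the three feature objects on the passage; the cf and sf attribute names are assumptions.

def extract_feathers(passage):
    # Hypothetical wrapper (sketch): tokenize and tag the passage, then
    # extract language, content and structure features from it.
    essayprepare.processPassage(passage)
    extractor = FeatherExtractor()
    passage.lf = extractor.extractLangFeather(passage)
    passage.cf = extractor.extractContentFeather(passage)
    passage.sf = extractor.extractStructureFeather(passage)
    return passage
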
Code Example #8
File: prepareUSTC.py  Project: kangkona/ustc-offTopic
def generateUSTCFeathers(ustcFilename, outFilename):
    essays = USTCReader.parseUSTCFile(ustcFilename)
    
    passages = []
    
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        passages.append(passage)

    generatePassageFeathers(passages[:], outFilename)
    
    pkfile = open('ustcpassages_503_lt.pkl', 'w')
    pickle.dump(passages, pkfile)
    pkfile.close()  
Code Example #9
File: prepareUSTC.py  Project: kangkona/ustc-offTopic
def demo_one_sentence():
    # The essay passage
    passage = EssayPassage()
    passage.passage = 'I am a students.'
    passage.title = 'title'
    passage.score = 5
    passage.id = '1'
    passage.reviewerId = 3
    passage.content = 'I am a students.'
    
    # Process the passage
    essayprepare.processPassage(passage)
    
    extractor = FeatherExtractor()
    lf = extractor.extractLangFeather(passage)
    passage.lf = lf
    cf = extractor.extractContentFeather(passage)
    sf = extractor.extractStructureFeather(passage)   
    
    print 'OK'
Code Example #10
def demo2():
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        trains.append(passage)

#    for p in trains[:30]:
#        essayprepare.processPassage(p)

    for p in trains[:100]:
        # Split into paragraphs
        print "+++++++++++++++++++++++"
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        pcount1 = len(paras)
        scount1 = 0
        for para in paras:
            sents = essayprepare.markedSentenceTokenize(para)
            #            for sent in sents:
            #            	print "### ", sent
            scount1 += len(sents)
        print "-----------------------"
        paras = essayprepare.para_tokenizer.tokenize(p.passage)
        pcount2 = len(paras)
        scount2 = 0
        for para in paras:
            sents = essayprepare.sent_tokenizer.tokenize(para)
            #            for sent in sents:
            #            	print "### ", sent
            scount2 += len(sents)
        if pcount1 != pcount2 or scount1 != scount2:
            print p.content
            print p.passage
        print "\n"


#    for i, p in enumerate(trains[:30]):
#    	for para in p.paragraphs:
#    		for sent in para.sentences:
#    			for token in sent.tokens:
#    				if token.isSpellError:
#    					print token.token, token.candidates
#    	for m in essays[i].findMarks():
#    		if m[0] == 'fm1' or m[0] == 'fm2' or m[0] == 'sw':
#    			print m

#    egrammar_vp1 = "EVP1: {<NN><RB>?<VB>}"
#    parser_evp1 = nltk.RegexpParser(egrammar_vp1)
#
#    for p in trains[:50]:
#        for para in p.paragraphs:
#            for sent in para.sentences:
#                sentence = [(token.token, token.pos) for token in sent.tokens]
#                result = parser_evp1.parse(sentence)
#                r = str(result)
#                if r.find('EVP1') > 0: print r

    print "demo2 over"
Code Example #11
    demo_one_sentence()
    exit()  # note: nothing below this call is executed

    essays = USTCReader.parseUSTCFile('USTC2011Jan.txt')
    print len(essays)

    essay = None
    for e in essays:
        if e.id == "0092":
            essay = e
            break

    # The essay passage
    passage = EssayPassage()
    passage.passage = essay.cleanContent()
    passage.title = essay.title
    passage.score = essay.score
    passage.id = essay.id
    passage.reviewerId = essay.reviewerId
    passage.content = essay.content

    # Process the passage
    essayprepare.processPassage(passage)

    # Print it out to see what it looks like
    print "PASSAGE========================================="
    print passage
    print passage.id
    print passage.title
    print passage.score
    print passage.passage
Code Example #12
 def processEssay(self):
     self.browser.clear()
     id = unicode(self.lineedit.text())
     essay = self.essayDict.get(id)
     if not essay:
         self.browser.append("<font color=red>%s is not found!</font>" % id)
         return
     
     self.browser.append(essay.content)
     
     # The essay passage
     passage = EssayPassage()
     passage.passage = essay.cleanContent()
     passage.title = essay.title
     passage.score = essay.score
     passage.id = essay.id
     
     # Process the passage
     essayprepare.processPassage(passage)
     
     # Print it out to see what it looks like
     self.browser.append("PASSAGE=========================================")        
     self.browser.append(passage.id)
     #self.browser.append(passage.title)
     self.browser.append(passage.score)
     self.browser.append(passage.passage)
     self.browser.append(str(len(passage.paragraphs)))
     self.browser.append("PARAGRAPHS---------------------------------------")
     for para in passage.paragraphs:
         self.browser.append(str(para.paragraphNo))
         self.browser.append(para.paragraph)
         for sent in para.sentences:
             self.browser.append(str(sent.sentenceNo))
             self.browser.append(str(sent.paragraphSentenceNo))
             self.browser.append(sent.sentence)
             tokens = [token.token for token in sent.tokens]
             tags = [token.pos for token in sent.tokens]
             lemmas = [token.lemma for token in sent.tokens]
             stems = [token.stem for token in sent.tokens]
             levels = [token.level for token in sent.tokens]
             nos = [token.tokenNo for token in sent.tokens]
             sentNos = [token.sentenceTokenNo for token in sent.tokens]
             paraNos = [token.paragraphTokenNo for token in sent.tokens]
             errorTokens = [token.token for token in sent.tokens if token.isSpellError]
             if not sent.canParsed:
                 self.browser.append("<font color=red>SENTENCE ERROR</font>")
             self.browser.append("<font color=red>SPELLERROR %s</font>" % str(errorTokens))
             self.browser.append(str(tokens))
             self.browser.append(str(tags))
             self.browser.append(str(lemmas))
             self.browser.append(str(stems))
             self.browser.append(str(levels))
             self.browser.append(str(sentNos))
             self.browser.append(str(paraNos))
             self.browser.append(str(nos))
             self.browser.append(str(sent.tokenCount))
             self.browser.append(str(sent.wordCount))
             self.browser.append(str(sent.realWordCount))
     
     self.browser.append(u"三元词组" + ' ' + str(passage.trigrams))
 
 
     e = FeatherExtractor()
 
     # Extract language features
     languageFeather = e.extractLangFeather(passage)  
     
     print u"词次总数", languageFeather.tokenCount
     print u"单词总数", languageFeather.wordCount
     print u"词形总数", languageFeather.wordTypeCount
     print u"词元总数", languageFeather.wordLemmaCount
     
     print u"介词个数", languageFeather.prepositionCount
     print u"介词比例", languageFeather.prepositionRatio
     print u"介词使用", languageFeather.prepositionUse
     
     print u"定冠词个数", languageFeather.definiteArticleCount
     print u"定冠词比例", languageFeather.definiteArticleRatio
     print u"定冠词使用", languageFeather.definiteArticleUse
     
     # Extract structure features
     #structureFeather = e.extractStructureFeather(passage)
     
     #generateUSTCFeathers('USTC2011Jan.txt', 'USTCFeathers_503.txt')
         
     print "...OVER"
Code Example #13
    def processEssay(self):
        self.browser.clear()
        id = unicode(self.lineedit.text())
        essay = self.essayDict.get(id)
        if not essay:
            self.browser.append("<font color=red>%s is not found!</font>" % id)
            return

        self.browser.append(essay.content)

        # The essay passage
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id

        # Process the passage
        essayprepare.processPassage(passage)

        # Print it out to see what it looks like
        self.browser.append("PASSAGE=========================================")
        self.browser.append(passage.id)
        #self.browser.append(passage.title)
        self.browser.append(passage.score)
        self.browser.append(passage.passage)
        self.browser.append(str(len(passage.paragraphs)))
        self.browser.append(
            "PARAGRAPHS---------------------------------------")
        for para in passage.paragraphs:
            self.browser.append(str(para.paragraphNo))
            self.browser.append(para.paragraph)
            for sent in para.sentences:
                self.browser.append(str(sent.sentenceNo))
                self.browser.append(str(sent.paragraphSentenceNo))
                self.browser.append(sent.sentence)
                tokens = [token.token for token in sent.tokens]
                tags = [token.pos for token in sent.tokens]
                lemmas = [token.lemma for token in sent.tokens]
                stems = [token.stem for token in sent.tokens]
                levels = [token.level for token in sent.tokens]
                nos = [token.tokenNo for token in sent.tokens]
                sentNos = [token.sentenceTokenNo for token in sent.tokens]
                paraNos = [token.paragraphTokenNo for token in sent.tokens]
                errorTokens = [
                    token.token for token in sent.tokens if token.isSpellError
                ]
                if not sent.canParsed:
                    self.browser.append(
                        "<font color=red>SENTENCE ERROR</font>")
                self.browser.append("<font color=red>SPELLERROR %s</font>" %
                                    str(errorTokens))
                self.browser.append(str(tokens))
                self.browser.append(str(tags))
                self.browser.append(str(lemmas))
                self.browser.append(str(stems))
                self.browser.append(str(levels))
                self.browser.append(str(sentNos))
                self.browser.append(str(paraNos))
                self.browser.append(str(nos))
                self.browser.append(str(sent.tokenCount))
                self.browser.append(str(sent.wordCount))
                self.browser.append(str(sent.realWordCount))

        self.browser.append(u"三元词组" + ' ' + str(passage.trigrams))

        e = FeatherExtractor()

        # Extract language features
        languageFeather = e.extractLangFeather(passage)

        print u"词次总数", languageFeather.tokenCount
        print u"单词总数", languageFeather.wordCount
        print u"词形总数", languageFeather.wordTypeCount
        print u"词元总数", languageFeather.wordLemmaCount

        print u"介词个数", languageFeather.prepositionCount
        print u"介词比例", languageFeather.prepositionRatio
        print u"介词使用", languageFeather.prepositionUse

        print u"定冠词个数", languageFeather.definiteArticleCount
        print u"定冠词比例", languageFeather.definiteArticleRatio
        print u"定冠词使用", languageFeather.definiteArticleUse

        # Extract structure features
        #structureFeather = e.extractStructureFeather(passage)

        #generateUSTCFeathers('USTC2011Jan.txt', 'USTCFeathers_503.txt')

        print "...OVER"
Code Example #14
File: rater.py  Project: yaokeepmoving/ustc-offTopic
def demo_crossvalidate_zhang():
    print "rater demo_crossvalidate_zhang"
    
    # Read the training set
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

    # Read the test set
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        tests.append(passage)
    

    passages = []
    passages.extend(trains)
    passages.extend(tests)

    random.shuffle(passages)    
    
    scoreEssays = {}
    for p in passages:
        if p.score < 35: p.score = 35
        label = (int(p.score) + 2) / 5 - 4
        if label < 3: 
            label = 3
            #continue
        if label > 14: label = 14
        p.label = label
        if label not in scoreEssays:
            scoreEssays[label] = []
        scoreEssays[label].append(p)


    # cross validate

    ps = [[], [], [], [], []]
    left = []

    for k, v in scoreEssays.items():
        print k
        print len(v)
        if len(v) > 5:
            s = len(v) / 5
            for i in range(5):
                ps[i].extend(v[i*s: (i+1)*s])
            left.extend(v[5*s:])
        else:
            left.extend(v)
    for j in range(len(left)):
        ps[j % 5].append(left[j])
    
    print "data sets: "
    for v in ps:
        print len(v)
    

    for i in range(5):
        trains = []
        tests = []
        
        for j in range(5):
            if i == j:
                tests.extend(ps[j])
            else:
                trains.extend(ps[j])
        
        r = CollegeEssayRater()       
        r.train(trains)
        
        for p in tests:
            s = r.rate(p)
            p.newscore = s[0]
            print p.id, p.score, s
        
    s1 = []
    s2 = []    
    for p in passages:
        if p.label < 3: continue
        s1.append(int(p.score))
        s2.append(p.newscore)
        print p.id, p.score, p.endogScore, int(round(p.newscore)), p.score - int(round(p.newscore)), \
        p.lsaScore, p.lsaSimilarity, p.lsaSimilarityAll, p.lf.tokenCount, \
        p.lf.sentenceLengthAverage, p.lf.wordLengthAverage, p.lf.noneStopWordLengthAverage, \
        p.lf.nounRatio, p.lf.verbRatio, p.lf.adjRatio, p.lf.sentenceLengthSD, p.offsetRatio, \
        p.lf.aclWordCount, p.lf.aclWordRatio
        
    print scistats.pearsonr(s1, s2)      
    
    pkfile = open('zhang_all.pkl', 'w')
    pickle.dump(passages, pkfile)
    pkfile.close()    
        
    print "demo_crossvalidate over!!!"    
Code Example #15
File: prepareUSTC.py  Project: kangkona/ustc-offTopic
def demo2():
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        trains.append(passage)
        
#    for p in trains[:30]:
#        essayprepare.processPassage(p)
        
    for p in trains[:100]:
        # Split into paragraphs
        print "+++++++++++++++++++++++"
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        pcount1 = len(paras)
        scount1 = 0
        for para in paras:
            sents = essayprepare.markedSentenceTokenize(para)  
#            for sent in sents:
#            	print "### ", sent
            scount1 += len(sents)
        print "-----------------------"
        paras = essayprepare.para_tokenizer.tokenize(p.passage)
        pcount2 = len(paras)
        scount2 = 0
        for para in paras:
            sents = essayprepare.sent_tokenizer.tokenize(para)  
#            for sent in sents:
#            	print "### ", sent
            scount2 += len(sents)
        if pcount1 != pcount2 or scount1 != scount2:
            print p.content
            print p.passage
        print "\n"
            	
#    for i, p in enumerate(trains[:30]):
#    	for para in p.paragraphs:
#    		for sent in para.sentences:
#    			for token in sent.tokens:
#    				if token.isSpellError:
#    					print token.token, token.candidates
#    	for m in essays[i].findMarks():
#    		if m[0] == 'fm1' or m[0] == 'fm2' or m[0] == 'sw':
#    			print m
        
#    egrammar_vp1 = "EVP1: {<NN><RB>?<VB>}"
#    parser_evp1 = nltk.RegexpParser(egrammar_vp1)    
#
#    for p in trains[:50]:
#        for para in p.paragraphs:
#            for sent in para.sentences:
#                sentence = [(token.token, token.pos) for token in sent.tokens]
#                result = parser_evp1.parse(sentence)
#                r = str(result)
#                if r.find('EVP1') > 0: print r
        
    print "demo2 over" 
Code Example #16
File: prepareUSTC.py  Project: kangkona/ustc-offTopic
 demo_one_sentence()
 exit()  # note: nothing below this call is executed
 
 essays = USTCReader.parseUSTCFile('USTC2011Jan.txt')
 print len(essays)
 
 essay = None
 for e in essays:
     if e.id == "0092":
         essay = e
         break  
 
 # The essay passage
 passage = EssayPassage()
 passage.passage = essay.cleanContent()
 passage.title = essay.title
 passage.score = essay.score
 passage.id = essay.id
 passage.reviewerId = essay.reviewerId
 passage.content = essay.content
 
 # Process the passage
 essayprepare.processPassage(passage)
 
 # Print it out to see what it looks like
 print "PASSAGE========================================="        
 print passage
 print passage.id
 print passage.title
 print passage.score
 print passage.passage