Ejemplo n.º 1
0
def demo2():
    print "rater demo2" 

    # 读训练集
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

     # 训练打分器
    r = CollegeEssayRater()
    r.train(trains)
    
    pkfile = open('USTC2011Jan.pkl', 'w')
    pickle.dump(trains, pkfile)
    pkfile.close()  
    
    exit()    

    # 读测试集
    essays = USTCReader.parseUSTCFile("USTC2011Jan-tfidf.txt")
    tests = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        tests.append(passage)

    # 打分测试
#    for p in tests:
#        s = r.rate(p)
#        p.newscore = s[0]
#        print p.id, p.score, s
#        
#    for p in tests:
#        print p.id, p.score, p.newscore
        
    print "SVM......"
    r.predict(tests)
    
    pkfile = open('ustc_test.pkl', 'w')
    pickle.dump(tests, pkfile)
    pkfile.close()  
        
    print "demo2 over!!!"
Ejemplo n.º 2
0
def demo():
    print "rater demo" 

    # 读训练集
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

     # 训练打分器
    r = GeneralEssayRater()
    r.train(trains)
    
    pkfile = open('zhang_trains.pkl', 'w')
    pickle.dump(trains, pkfile)
    pkfile.close()  


    # 读测试集
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        tests.append(passage)

    # 打分测试
    for p in tests:
        s = r.rate(p)
        p.newscore = s[0]
        print p.id, p.score, s
        
    for p in tests:
        print p.id, p.score, p.newscore

    
    pkfile = open('zhang_tests.pkl', 'w')
    pickle.dump(tests, pkfile)
    pkfile.close()  
        
    print "demo over!!!"
Ejemplo n.º 3
0
def demo():
    print "rater demo"

    # 读训练集
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

    # 训练打分器
    r = GeneralEssayRater()
    r.train(trains)

    pkfile = open('zhang_trains.pkl', 'w')
    pickle.dump(trains, pkfile)
    pkfile.close()

    # 读测试集
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        tests.append(passage)

    # 打分测试
    for p in tests:
        s = r.rate(p)
        p.newscore = s[0]
        print p.id, p.score, s

    for p in tests:
        print p.id, p.score, p.newscore

    pkfile = open('zhang_tests.pkl', 'w')
    pickle.dump(tests, pkfile)
    pkfile.close()

    print "demo over!!!"
Ejemplo n.º 4
0
def generateUSTCFeathers(ustcFilename, outFilename,
                         pklFilename='ustcpassages_503_lt.pkl'):
    """Extract features for every essay in a USTC corpus file.

    Parameters:
        ustcFilename -- USTC-format corpus file to read.
        outFilename  -- feature file written by generatePassageFeathers().
        pklFilename  -- where to pickle the passage list (new parameter;
                        the default preserves the previously hard-coded path).
    """
    essays = USTCReader.parseUSTCFile(ustcFilename)

    passages = []

    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        passages.append(passage)

    # Pass a shallow copy so the callee cannot mutate our list.
    generatePassageFeathers(passages[:], outFilename)

    # Binary mode is required for pickle data; `with` closes the file
    # even if dump() raises.
    with open(pklFilename, 'wb') as pkfile:
        pickle.dump(passages, pkfile)
Ejemplo n.º 5
0
def generateUSTCFeathers(ustcFilename, outFilename,
                         pklFilename='ustcpassages_503_lt.pkl'):
    """Extract features for every essay in a USTC corpus file.

    Parameters:
        ustcFilename -- USTC-format corpus file to read.
        outFilename  -- feature file written by generatePassageFeathers().
        pklFilename  -- pickle output path (new parameter; the default
                        keeps the old hard-coded behavior).
    """
    essays = USTCReader.parseUSTCFile(ustcFilename)

    passages = []

    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        passages.append(passage)

    # Hand the callee a copy so our list cannot be mutated.
    generatePassageFeathers(passages[:], outFilename)

    # Binary mode for pickle data; `with` guarantees the close.
    with open(pklFilename, 'wb') as pkfile:
        pickle.dump(passages, pkfile)
Ejemplo n.º 6
0
def demo2():
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        trains.append(passage)

#    for p in trains[:30]:
#        essayprepare.processPassage(p)

    for p in trains[:100]:
        # 拆分段落
        print "+++++++++++++++++++++++"
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        pcount1 = len(paras)
        scount1 = 0
        for para in paras:
            sents = essayprepare.markedSentenceTokenize(para)
            #            for sent in sents:
            #            	print "### ", sent
            scount1 += len(sents)
        print "-----------------------"
        paras = essayprepare.para_tokenizer.tokenize(p.passage)
        pcount2 = len(paras)
        scount2 = 0
        for para in paras:
            sents = essayprepare.sent_tokenizer.tokenize(para)
            #            for sent in sents:
            #            	print "### ", sent
            scount2 += len(sents)
        if pcount1 != pcount2 or scount1 != scount2:
            print p.content
            print p.passage
        print "\n"


#    for i, p in enumerate(trains[:30]):
#    	for para in p.paragraphs:
#    		for sent in para.sentences:
#    			for token in sent.tokens:
#    				if token.isSpellError:
#    					print token.token, token.candidates
#    	for m in essays[i].findMarks():
#    		if m[0] == 'fm1' or m[0] == 'fm2' or m[0] == 'sw':
#    			print m

#    egrammar_vp1 = "EVP1: {<NN><RB>?<VB>}"
#    parser_evp1 = nltk.RegexpParser(egrammar_vp1)
#
#    for p in trains[:50]:
#        for para in p.paragraphs:
#            for sent in para.sentences:
#                sentence = [(token.token, token.pos) for token in sent.tokens]
#                result = parser_evp1.parse(sentence)
#                r = str(result)
#                if r.find('EVP1') > 0: print r

    print "demo2 over"
Ejemplo n.º 7
0
    passage.lf = lf
    cf = extractor.extractContentFeather(passage)
    sf = extractor.extractStructureFeather(passage)

    print 'OK'


if __name__ == "__main__":
    print "Start..."
    # Other demos kept around for manual runs:
    #sentenceCheckStatsDemo()
    #wordRepetitiveDemo()
    #demo2()
    demo_one_sentence()
    exit()  # NOTE(review): everything below is unreachable as written

    essays = USTCReader.parseUSTCFile('USTC2011Jan.txt')
    print len(essays)

    # Pick essay "0092" out of the corpus.
    essay = None
    for e in essays:
        if e.id == "0092":
            essay = e
            break

    # Wrap the essay in an EssayPassage.
    # NOTE(review): if "0092" is absent, essay stays None and the next
    # line raises AttributeError.
    passage = EssayPassage()
    passage.passage = essay.cleanContent()
    passage.title = essay.title
    passage.score = essay.score
    passage.id = essay.id
    passage.reviewerId = essay.reviewerId
Ejemplo n.º 8
0
 def __loadEssays(self):
     self.essays = USTCReader.parseUSTCFile('USTC2011Jan.txt')
     print len(self.essays)
     for e in self.essays:
         self.essayDict[e.id] = e
     print len(self.essayDict)
Ejemplo n.º 9
0
 def __loadEssays(self):
     self.essays = USTCReader.parseUSTCFile('USTC2011Jan.txt')
     print len(self.essays)
     for e in self.essays:
         self.essayDict[e.id] = e
     print len(self.essayDict)
Ejemplo n.º 10
0
def demo_crossvalidate_zhang():
    """5-fold cross-validation of CollegeEssayRater on the combined
    r1_265 training essays and the parallel Zhang test essays.

    Folds are built per score label so each fold gets a similar score
    distribution; after the fold loop every passage has been rated once
    (while in the held-out fold) and the Pearson correlation between
    human and predicted scores is printed.
    """
    print "rater demo_crossvalidate_zhang"

    # Read the training set.
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

    # Read the test set (no reviewerId in this file).
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        tests.append(passage)

    # Pool both sets and shuffle before building the folds.
    passages = []
    passages.extend(trains)
    passages.extend(tests)

    random.shuffle(passages)

    # Bucket passages by a coarse score label.
    # NOTE(review): int(p.score) below suggests score may be a string;
    # in Python 2 a str-vs-int comparison never matches numerically, so
    # this floor would be a no-op in that case -- verify score's type.
    scoreEssays = {}
    for p in passages:
        if p.score < 35: p.score = 35
        # Python 2 floor division: ~5 score points per label, clamped
        # to the range [3, 14].
        label = (int(p.score) + 2) / 5 - 4
        if label < 3:
            label = 3
            #continue
        if label > 14: label = 14
        p.label = label
        if label not in scoreEssays:
            scoreEssays[label] = []
        scoreEssays[label].append(p)


    # Cross-validation folds: deal each label bucket evenly across the 5
    # folds, then round-robin the leftovers so fold sizes stay balanced.

    ps = [[], [], [], [], []]
    left = []

    for k, v in scoreEssays.items():
        print k
        print len(v)
        if len(v) > 5:
            s = len(v) / 5  # Python 2 floor division
            for i in range(5):
                ps[i].extend(v[i*s: (i+1)*s])
            left.extend(v[5*s:])
        else:
            left.extend(v)
    for j in range(len(left)):
        ps[j % 5].append(left[j])

    print "data sets: "
    for v in ps:
        print len(v)

    # Train on 4 folds, rate the held-out fold; rate() returns a
    # sequence whose first element is the predicted score.
    for i in range(5):
        trains = []
        tests = []

        for j in range(5):
            if i == j:
                tests.extend(ps[j])
            else:
                trains.extend(ps[j])

        r = CollegeEssayRater()
        r.train(trains)

        for p in tests:
            s = r.rate(p)
            p.newscore = s[0]
            print p.id, p.score, s

    # Collect human vs. predicted scores for the correlation.
    # NOTE(review): labels were clamped to >= 3 above, so this
    # `continue` can never fire as written; it only matters if the
    # commented-out `#continue` branch is ever re-enabled.
    s1 = []
    s2 = []
    for p in passages:
        if p.label < 3: continue
        s1.append(int(p.score))
        s2.append(p.newscore)
        # Per-passage diagnostics; endogScore, lsa*, lf.* and
        # offsetRatio are presumably attached by the rater during
        # train()/rate() -- not visible in this file, verify there.
        print p.id, p.score, p.endogScore, int(round(p.newscore)), p.score - int(round(p.newscore)), \
        p.lsaScore, p.lsaSimilarity, p.lsaSimilarityAll, p.lf.tokenCount, \
        p.lf.sentenceLengthAverage, p.lf.wordLengthAverage, p.lf.noneStopWordLengthAverage, \
        p.lf.nounRatio, p.lf.verbRatio, p.lf.adjRatio, p.lf.sentenceLengthSD, p.offsetRatio, \
        p.lf.aclWordCount, p.lf.aclWordRatio

    print scistats.pearsonr(s1, s2)

    pkfile = open('zhang_all.pkl', 'w')
    pickle.dump(passages, pkfile)
    pkfile.close()

    print "demo_crossvalidate over!!!"
Ejemplo n.º 11
0
def demo2():
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        trains.append(passage)
        
#    for p in trains[:30]:
#        essayprepare.processPassage(p)
        
    for p in trains[:100]:
        # 拆分段落
        print "+++++++++++++++++++++++"
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        pcount1 = len(paras)
        scount1 = 0
        for para in paras:
            sents = essayprepare.markedSentenceTokenize(para)  
#            for sent in sents:
#            	print "### ", sent
            scount1 += len(sents)
        print "-----------------------"
        paras = essayprepare.para_tokenizer.tokenize(p.passage)
        pcount2 = len(paras)
        scount2 = 0
        for para in paras:
            sents = essayprepare.sent_tokenizer.tokenize(para)  
#            for sent in sents:
#            	print "### ", sent
            scount2 += len(sents)
        if pcount1 != pcount2 or scount1 != scount2:
            print p.content
            print p.passage
        print "\n"
            	
#    for i, p in enumerate(trains[:30]):
#    	for para in p.paragraphs:
#    		for sent in para.sentences:
#    			for token in sent.tokens:
#    				if token.isSpellError:
#    					print token.token, token.candidates
#    	for m in essays[i].findMarks():
#    		if m[0] == 'fm1' or m[0] == 'fm2' or m[0] == 'sw':
#    			print m
        
#    egrammar_vp1 = "EVP1: {<NN><RB>?<VB>}"
#    parser_evp1 = nltk.RegexpParser(egrammar_vp1)    
#
#    for p in trains[:50]:
#        for para in p.paragraphs:
#            for sent in para.sentences:
#                sentence = [(token.token, token.pos) for token in sent.tokens]
#                result = parser_evp1.parse(sentence)
#                r = str(result)
#                if r.find('EVP1') > 0: print r
        
    print "demo2 over" 
Ejemplo n.º 12
0
    passage.lf = lf
    cf = extractor.extractContentFeather(passage)
    sf = extractor.extractStructureFeather(passage)   
    
    print 'OK'


if __name__ == "__main__":
    print "Start..."
    # Other demos kept around for manual runs:
    #sentenceCheckStatsDemo()
    #wordRepetitiveDemo()
    #demo2()
    demo_one_sentence()
    exit()  # NOTE(review): everything below is unreachable as written

    essays = USTCReader.parseUSTCFile('USTC2011Jan.txt')
    print len(essays)

    # Pick essay "0092" out of the corpus.
    essay = None
    for e in essays:
        if e.id == "0092":
            essay = e
            break

    # Wrap the essay in an EssayPassage.
    # NOTE(review): if "0092" is absent, essay stays None and the next
    # line raises AttributeError.
    passage = EssayPassage()
    passage.passage = essay.cleanContent()
    passage.title = essay.title
    passage.score = essay.score
    passage.id = essay.id
    passage.reviewerId = essay.reviewerId