Exemple #1
0
def demo2():
    print "rater demo2" 

    # 读训练集
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

     # 训练打分器
    r = CollegeEssayRater()
    r.train(trains)
    
    pkfile = open('USTC2011Jan.pkl', 'w')
    pickle.dump(trains, pkfile)
    pkfile.close()  
    
    exit()    

    # 读测试集
    essays = USTCReader.parseUSTCFile("USTC2011Jan-tfidf.txt")
    tests = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        tests.append(passage)

    # 打分测试
#    for p in tests:
#        s = r.rate(p)
#        p.newscore = s[0]
#        print p.id, p.score, s
#        
#    for p in tests:
#        print p.id, p.score, p.newscore
        
    print "SVM......"
    r.predict(tests)
    
    pkfile = open('ustc_test.pkl', 'w')
    pickle.dump(tests, pkfile)
    pkfile.close()  
        
    print "demo2 over!!!"
def demo():
    print "rater demo" 

    # 读训练集
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

     # 训练打分器
    r = GeneralEssayRater()
    r.train(trains)
    
    pkfile = open('zhang_trains.pkl', 'w')
    pickle.dump(trains, pkfile)
    pkfile.close()  


    # 读测试集
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        tests.append(passage)

    # 打分测试
    for p in tests:
        s = r.rate(p)
        p.newscore = s[0]
        print p.id, p.score, s
        
    for p in tests:
        print p.id, p.score, p.newscore

    
    pkfile = open('zhang_tests.pkl', 'w')
    pickle.dump(tests, pkfile)
    pkfile.close()  
        
    print "demo over!!!"
def demo():
    print "rater demo"

    # 读训练集
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

    # 训练打分器
    r = GeneralEssayRater()
    r.train(trains)

    pkfile = open('zhang_trains.pkl', 'w')
    pickle.dump(trains, pkfile)
    pkfile.close()

    # 读测试集
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        tests.append(passage)

    # 打分测试
    for p in tests:
        s = r.rate(p)
        p.newscore = s[0]
        print p.id, p.score, s

    for p in tests:
        print p.id, p.score, p.newscore

    pkfile = open('zhang_tests.pkl', 'w')
    pickle.dump(tests, pkfile)
    pkfile.close()

    print "demo over!!!"
Exemple #4
0
def generateUSTCFeathers(ustcFilename, outFilename):
    """Parse a USTC essay file into EssayPassage objects, write their
    features to outFilename, and pickle the passages.

    Args:
        ustcFilename: path of the USTC-format essay file to read.
        outFilename: path the generated feature file is written to.
    """
    essays = USTCReader.parseUSTCFile(ustcFilename)

    passages = []

    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        passages.append(passage)

    generatePassageFeathers(passages[:], outFilename)

    # Pickle files must be opened in binary mode ('wb').
    pkfile = open('ustcpassages_503_lt.pkl', 'wb')
    pickle.dump(passages, pkfile)
    pkfile.close()
def generateUSTCFeathers(ustcFilename, outFilename):
    """Parse a USTC essay file into EssayPassage objects, write their
    features to outFilename, and pickle the passages.

    NOTE(review): duplicate definition — shadows the identical function
    defined earlier in the file.

    Args:
        ustcFilename: path of the USTC-format essay file to read.
        outFilename: path the generated feature file is written to.
    """
    essays = USTCReader.parseUSTCFile(ustcFilename)

    passages = []

    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        passages.append(passage)

    generatePassageFeathers(passages[:], outFilename)

    # Pickle files must be opened in binary mode ('wb').
    pkfile = open('ustcpassages_503_lt.pkl', 'wb')
    pickle.dump(passages, pkfile)
    pkfile.close()
Exemple #6
0
def sentenceCheckStatsDemo():
    print "sentenceCheckStatsDemo start..."
    pkfile = open('ustcpassages_503_lt.pkl', 'r')
    passages = pickle.load(pkfile)
    pkfile.close()

    sentCount = 0
    errorcount = 0
    lgcorrect = 0
    lgtotal = 0
    ltcorrect = 0
    lttotal = 0
    allcorrect = 0

    for p in passages:
        pltc = 0
        osents = []
        for para in p.paragraphs:
            osents.extend(para.sentences)

        msents = []
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        for para in paras:
            msents.extend(essayprepare.markedSentenceTokenize(para))

        if len(osents) != len(msents):
            print "sentence count not equal", p.id
            print osents
            print msents
            continue

        for si, os in enumerate(osents):
            ms = msents[si]
            mkerror = 1
            lgerror = 1
            lterror = 1
            ltc = 0

            marks = USTCReader.findMarks(ms)
            onlysperror = True
            for mark in marks:
                if not mark[0] in ['fm1', 'fm2', 'sw']:
                    onlysperror = False
                    break
            if onlysperror: mkerror = 0
            #if ms.find('[') < 0 and ms.find(']') < 0:
            #    mkerror = 0
            if os.canParsed:
                lgerror = 0
            if len(os.ltCheckResults) == 0:
                lterror = 0
            else:
                ltc = len(os.ltCheckResults)
                for cr in os.ltCheckResults:
                    if cr['ruleId'] == 'WHITESPACE_RULE':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'COMMA_PARENTHESIS_WHITESPACE':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'UPPERCASE_SENTENCE_START':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'CAN_NOT':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'EN_QUOTES':
                        ltc = ltc - 1
                if ltc == 0:
                    lterror = 0

            sentCount += 1
            if mkerror == 1: errorcount += 1
            if lgerror == 1:
                lgtotal += 1
                if lgerror == mkerror:
                    lgcorrect += 1
            if lterror == 1:
                lttotal += 1
                if lterror == mkerror:
                    ltcorrect += 1
                    if lterror == lgerror:
                        allcorrect += 1
            pltc += ltc
            #print p.id, p.score, len(os.tokens), mkerror, lgerror, lterror, ltc


#            print ms #, #ms, os.sentence
#            print os.sentence
#            if len(os.ltCheckResults) > 0:
#                for cr in os.ltCheckResults:
#                    print cr
        print p.id, p.score, pltc
    print sentCount, errorcount, lgtotal, lgcorrect, lttotal, ltcorrect, allcorrect

    print "sentenceCheckStatsDemo over!!!"
Exemple #7
0
def demo2():
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        trains.append(passage)

#    for p in trains[:30]:
#        essayprepare.processPassage(p)

    for p in trains[:100]:
        # 拆分段落
        print "+++++++++++++++++++++++"
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        pcount1 = len(paras)
        scount1 = 0
        for para in paras:
            sents = essayprepare.markedSentenceTokenize(para)
            #            for sent in sents:
            #            	print "### ", sent
            scount1 += len(sents)
        print "-----------------------"
        paras = essayprepare.para_tokenizer.tokenize(p.passage)
        pcount2 = len(paras)
        scount2 = 0
        for para in paras:
            sents = essayprepare.sent_tokenizer.tokenize(para)
            #            for sent in sents:
            #            	print "### ", sent
            scount2 += len(sents)
        if pcount1 != pcount2 or scount1 != scount2:
            print p.content
            print p.passage
        print "\n"


#    for i, p in enumerate(trains[:30]):
#    	for para in p.paragraphs:
#    		for sent in para.sentences:
#    			for token in sent.tokens:
#    				if token.isSpellError:
#    					print token.token, token.candidates
#    	for m in essays[i].findMarks():
#    		if m[0] == 'fm1' or m[0] == 'fm2' or m[0] == 'sw':
#    			print m

#    egrammar_vp1 = "EVP1: {<NN><RB>?<VB>}"
#    parser_evp1 = nltk.RegexpParser(egrammar_vp1)
#
#    for p in trains[:50]:
#        for para in p.paragraphs:
#            for sent in para.sentences:
#                sentence = [(token.token, token.pos) for token in sent.tokens]
#                result = parser_evp1.parse(sentence)
#                r = str(result)
#                if r.find('EVP1') > 0: print r

    print "demo2 over"
Exemple #8
0
    passage.lf = lf
    cf = extractor.extractContentFeather(passage)
    sf = extractor.extractStructureFeather(passage)

    print 'OK'


if __name__ == "__main__":
    print "Start..."
    # Alternative demo entry points, kept for manual switching:
    #sentenceCheckStatsDemo()
    #wordRepetitiveDemo()
    #demo2()
    demo_one_sentence()
    exit()

    # NOTE(review): everything below is dead code because of the exit()
    # above; remove the exit() to re-enable it.
    essays = USTCReader.parseUSTCFile('USTC2011Jan.txt')
    print len(essays)

    # Select essay "0092" for inspection.
    essay = None
    for e in essays:
        if e.id == "0092":
            essay = e
            break

    # Build the passage object from the selected essay.
    passage = EssayPassage()
    passage.passage = essay.cleanContent()
    passage.title = essay.title
    passage.score = essay.score
    passage.id = essay.id
    passage.reviewerId = essay.reviewerId
 def __loadEssays(self):
     """Load all essays from the USTC data file and index them by id."""
     self.essays = USTCReader.parseUSTCFile('USTC2011Jan.txt')
     print len(self.essays)
     # Build an id -> essay lookup for fast access.
     for e in self.essays:
         self.essayDict[e.id] = e
     print len(self.essayDict)
Exemple #10
0
 def __loadEssays(self):
     """Load all essays from the USTC data file and index them by id."""
     self.essays = USTCReader.parseUSTCFile('USTC2011Jan.txt')
     print len(self.essays)
     # Build an id -> essay lookup for fast access.
     for e in self.essays:
         self.essayDict[e.id] = e
     print len(self.essayDict)
Exemple #11
0
def demo_crossvalidate_zhang():
    print "rater demo_crossvalidate_zhang"
    
    # 读训练集
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

    # 读测试集
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        tests.append(passage)
    

    passages = []
    passages.extend(trains)
    passages.extend(tests)

    random.shuffle(passages)    
    
    scoreEssays = {}
    for p in passages:
        if p.score < 35: p.score = 35
        label = (int(p.score) + 2) / 5 - 4
        if label < 3: 
            label = 3
            #continue
        if label > 14: label = 14
        p.label = label
        if label not in scoreEssays:
            scoreEssays[label] = []
        scoreEssays[label].append(p)


    # cross validate

    ps = [[], [], [], [], []]
    left = []

    for k, v in scoreEssays.items():
        print k
        print len(v)
        if len(v) > 5:
            s = len(v) / 5
            for i in range(5):
                ps[i].extend(v[i*s: (i+1)*s])
            left.extend(v[5*s:])
        else:
            left.extend(v)
    for j in range(len(left)):
        ps[j % 5].append(left[j])
    
    print "data sets: "
    for v in ps:
        print len(v)
    

    for i in range(5):
        trains = []
        tests = []
        
        for j in range(5):
            if i == j:
                tests.extend(ps[j])
            else:
                trains.extend(ps[j])
        
        r = CollegeEssayRater()       
        r.train(trains)
        
        for p in tests:
            s = r.rate(p)
            p.newscore = s[0]
            print p.id, p.score, s
        
    s1 = []
    s2 = []    
    for p in passages:
        if p.label < 3: continue
        s1.append(int(p.score))
        s2.append(p.newscore)
        print p.id, p.score, p.endogScore, int(round(p.newscore)), p.score - int(round(p.newscore)), \
        p.lsaScore, p.lsaSimilarity, p.lsaSimilarityAll, p.lf.tokenCount, \
        p.lf.sentenceLengthAverage, p.lf.wordLengthAverage, p.lf.noneStopWordLengthAverage, \
        p.lf.nounRatio, p.lf.verbRatio, p.lf.adjRatio, p.lf.sentenceLengthSD, p.offsetRatio, \
        p.lf.aclWordCount, p.lf.aclWordRatio
        
    print scistats.pearsonr(s1, s2)      
    
    pkfile = open('zhang_all.pkl', 'w')
    pickle.dump(passages, pkfile)
    pkfile.close()    
        
    print "demo_crossvalidate over!!!"    
Exemple #12
0
def sentenceCheckStatsDemo():
    print "sentenceCheckStatsDemo start..."
    pkfile = open('ustcpassages_503_lt.pkl', 'r')
    passages = pickle.load(pkfile)
    pkfile.close()
    
    sentCount = 0
    errorcount = 0
    lgcorrect = 0
    lgtotal = 0
    ltcorrect = 0
    lttotal = 0
    allcorrect = 0
    
    for p in passages:
        pltc = 0
        osents = []
        for para in p.paragraphs:
            osents.extend(para.sentences)
    
        msents = []
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        for para in paras:
            msents.extend(essayprepare.markedSentenceTokenize(para))
        
        if len(osents) != len(msents):
            print "sentence count not equal", p.id
            print osents
            print msents
            continue
        
        for si, os in enumerate(osents):
            ms = msents[si]
            mkerror = 1
            lgerror = 1
            lterror = 1
            ltc = 0
            
            marks = USTCReader.findMarks(ms)
            onlysperror = True
            for mark in marks:
                if not mark[0] in ['fm1', 'fm2', 'sw']:
                    onlysperror = False
                    break
            if onlysperror: mkerror = 0
            #if ms.find('[') < 0 and ms.find(']') < 0:
            #    mkerror = 0
            if os.canParsed:
                lgerror = 0
            if len(os.ltCheckResults) == 0:
                lterror = 0
            else:
                ltc = len(os.ltCheckResults)
                for cr in os.ltCheckResults:
                    if cr['ruleId'] == 'WHITESPACE_RULE':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'COMMA_PARENTHESIS_WHITESPACE':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'UPPERCASE_SENTENCE_START':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'CAN_NOT':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'EN_QUOTES':
                        ltc = ltc - 1
                if ltc == 0:
                    lterror = 0
                
            sentCount += 1
            if mkerror == 1: errorcount += 1
            if lgerror == 1: 
                lgtotal += 1
                if lgerror == mkerror:
                    lgcorrect += 1
            if lterror == 1:
                lttotal += 1
                if lterror == mkerror:
                    ltcorrect += 1
                    if lterror == lgerror:
                        allcorrect += 1
            pltc += ltc      
            #print p.id, p.score, len(os.tokens), mkerror, lgerror, lterror, ltc
#            print ms #, #ms, os.sentence
#            print os.sentence
#            if len(os.ltCheckResults) > 0:
#                for cr in os.ltCheckResults:
#                    print cr
        print p.id, p.score, pltc
    print sentCount, errorcount, lgtotal, lgcorrect, lttotal, ltcorrect, allcorrect
        
    print "sentenceCheckStatsDemo over!!!"  
Exemple #13
0
def demo2():
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score    
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        trains.append(passage)
        
#    for p in trains[:30]:
#        essayprepare.processPassage(p)
        
    for p in trains[:100]:
        # 拆分段落
        print "+++++++++++++++++++++++"
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        pcount1 = len(paras)
        scount1 = 0
        for para in paras:
            sents = essayprepare.markedSentenceTokenize(para)  
#            for sent in sents:
#            	print "### ", sent
            scount1 += len(sents)
        print "-----------------------"
        paras = essayprepare.para_tokenizer.tokenize(p.passage)
        pcount2 = len(paras)
        scount2 = 0
        for para in paras:
            sents = essayprepare.sent_tokenizer.tokenize(para)  
#            for sent in sents:
#            	print "### ", sent
            scount2 += len(sents)
        if pcount1 != pcount2 or scount1 != scount2:
            print p.content
            print p.passage
        print "\n"
            	
#    for i, p in enumerate(trains[:30]):
#    	for para in p.paragraphs:
#    		for sent in para.sentences:
#    			for token in sent.tokens:
#    				if token.isSpellError:
#    					print token.token, token.candidates
#    	for m in essays[i].findMarks():
#    		if m[0] == 'fm1' or m[0] == 'fm2' or m[0] == 'sw':
#    			print m
        
#    egrammar_vp1 = "EVP1: {<NN><RB>?<VB>}"
#    parser_evp1 = nltk.RegexpParser(egrammar_vp1)    
#
#    for p in trains[:50]:
#        for para in p.paragraphs:
#            for sent in para.sentences:
#                sentence = [(token.token, token.pos) for token in sent.tokens]
#                result = parser_evp1.parse(sentence)
#                r = str(result)
#                if r.find('EVP1') > 0: print r
        
    print "demo2 over" 
Exemple #14
0
    passage.lf = lf
    cf = extractor.extractContentFeather(passage)
    sf = extractor.extractStructureFeather(passage)   
    
    print 'OK'


if __name__ == "__main__":
    print "Start..."  
    #sentenceCheckStatsDemo()
    #wordRepetitiveDemo()
    #demo2()
    demo_one_sentence()
    exit() 
    
    essays = USTCReader.parseUSTCFile('USTC2011Jan.txt')
    print len(essays)
    
    essay = None
    for e in essays:
        if e.id == "0092":
            essay = e
            break  
    
    # 文章
    passage = EssayPassage()
    passage.passage = essay.cleanContent()
    passage.title = essay.title
    passage.score = essay.score
    passage.id = essay.id
    passage.reviewerId = essay.reviewerId