import pickle
import random

from scipy import stats as scistats

import USTCReader
import essayprepare

# EssayPassage, CollegeEssayRater, GeneralEssayRater, SimpleEssayRater,
# FeatherExtractor and generatePassageFeathers are project classes/functions
# whose defining modules are not shown in this listing.


def demo2():
    print "rater demo2"

    # Read the training set
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

    # Train the rater
    r = CollegeEssayRater()
    r.train(trains)
    pkfile = open('USTC2011Jan.pkl', 'w')
    pickle.dump(trains, pkfile)
    pkfile.close()
    exit()  # early exit: the test phase below never runs as written

    # Read the test set
    essays = USTCReader.parseUSTCFile("USTC2011Jan-tfidf.txt")
    tests = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        tests.append(passage)

    # Rate the test essays
    # for p in tests:
    #     s = r.rate(p)
    #     p.newscore = s[0]
    #     print p.id, p.score, s
    #
    # for p in tests:
    #     print p.id, p.score, p.newscore

    print "SVM......"
    r.predict(tests)
    pkfile = open('ustc_test.pkl', 'w')
    pickle.dump(tests, pkfile)
    pkfile.close()
    print "demo2 over!!!"
def demo():
    print "rater demo"

    # Read the training set
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

    # Train the rater
    r = GeneralEssayRater()
    r.train(trains)
    pkfile = open('zhang_trains.pkl', 'w')
    pickle.dump(trains, pkfile)
    pkfile.close()

    # Read the test set
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        tests.append(passage)

    # Rate the test essays
    for p in tests:
        s = r.rate(p)
        p.newscore = s[0]
        print p.id, p.score, s

    for p in tests:
        print p.id, p.score, p.newscore

    pkfile = open('zhang_tests.pkl', 'w')
    pickle.dump(tests, pkfile)
    pkfile.close()
    print "demo over!!!"
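# The two demos above persist their EssayPassage lists with pickle. Below is
# a minimal sketch (not part of the original code) of reloading them for
# offline inspection, assuming files written by demo()/demo2() such as
# 'zhang_tests.pkl'; load_rated_passages is a hypothetical helper name.
def load_rated_passages(pklname='zhang_tests.pkl'):
    pkfile = open(pklname, 'rb')
    passages = pickle.load(pkfile)
    pkfile.close()
    return passages

# Usage sketch: compare stored human scores with predicted ones.
# for p in load_rated_passages():
#     print p.id, p.score, getattr(p, 'newscore', None)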
def demo_one():
    content = """At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places.
At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places.
At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places.
At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places."""

    # The essay
    passage = EssayPassage()
    passage.passage = content
    passage.title = 'title'
    passage.score = 5
    passage.id = '1'
    passage.reviewerId = 3
    passage.content = content

    r = SimpleEssayRater()
    s = r.rate_by_params(passage)
    passage.newscore = s[0]
    print passage.id, passage.score, s
    print 'OK'
def generateUSTCFeathers(ustcFilename, outFilename):
    essays = USTCReader.parseUSTCFile(ustcFilename)
    passages = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        passages.append(passage)
    generatePassageFeathers(passages[:], outFilename)
    pkfile = open('ustcpassages_503_lt.pkl', 'w')
    pickle.dump(passages, pkfile)
    pkfile.close()
def demo_one_sentence():
    # The essay
    passage = EssayPassage()
    passage.passage = 'I am a students.'
    passage.title = 'title'
    passage.score = 5
    passage.id = '1'
    passage.reviewerId = 3
    passage.content = 'I am a students.'

    # Process the passage
    essayprepare.processPassage(passage)

    extractor = FeatherExtractor()
    lf = extractor.extractLangFeather(passage)
    passage.lf = lf
    cf = extractor.extractContentFeather(passage)
    sf = extractor.extractStructureFeather(passage)
    print 'OK'
# NOTE: a second demo2() is defined here; if both definitions live in the
# same module, this one shadows the earlier demo2() above.
def demo2():
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        trains.append(passage)

    # for p in trains[:30]:
    #     essayprepare.processPassage(p)

    # Compare paragraph/sentence counts between the marked content and the
    # cleaned passage; mismatches are printed for inspection.
    for p in trains[:100]:
        # Split into paragraphs
        print "+++++++++++++++++++++++"
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        pcount1 = len(paras)
        scount1 = 0
        for para in paras:
            sents = essayprepare.markedSentenceTokenize(para)
            # for sent in sents:
            #     print "### ", sent
            scount1 += len(sents)
        print "-----------------------"
        paras = essayprepare.para_tokenizer.tokenize(p.passage)
        pcount2 = len(paras)
        scount2 = 0
        for para in paras:
            sents = essayprepare.sent_tokenizer.tokenize(para)
            # for sent in sents:
            #     print "### ", sent
            scount2 += len(sents)
        if pcount1 != pcount2 or scount1 != scount2:
            print p.content
            print p.passage
            print "\n"

    # for i, p in enumerate(trains[:30]):
    #     for para in p.paragraphs:
    #         for sent in para.sentences:
    #             for token in sent.tokens:
    #                 if token.isSpellError:
    #                     print token.token, token.candidates
    #     for m in essays[i].findMarks():
    #         if m[0] == 'fm1' or m[0] == 'fm2' or m[0] == 'sw':
    #             print m

    # egrammar_vp1 = "EVP1: {<NN><RB>?<VB>}"
    # parser_evp1 = nltk.RegexpParser(egrammar_vp1)
    #
    # for p in trains[:50]:
    #     for para in p.paragraphs:
    #         for sent in para.sentences:
    #             sentence = [(token.token, token.pos) for token in sent.tokens]
    #             result = parser_evp1.parse(sentence)
    #             r = str(result)
    #             if r.find('EVP1') > 0: print r

    print "demo2 over"
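# The consistency check in demo2() above runs the same paragraph/sentence
# counting twice with different sentence tokenizers. A minimal helper sketch
# (not in the original code) that factors it out; count_paras_and_sents is a
# hypothetical name, and the tokenizers are the essayprepare ones used above.
def count_paras_and_sents(text, tokenize_sents):
    paras = essayprepare.para_tokenizer.tokenize(text)
    scount = 0
    for para in paras:
        scount += len(tokenize_sents(para))
    return len(paras), scount

# Usage sketch, mirroring the comparison in demo2():
# pc1, sc1 = count_paras_and_sents(p.content, essayprepare.markedSentenceTokenize)
# pc2, sc2 = count_paras_and_sents(p.passage, essayprepare.sent_tokenizer.tokenize)
# if (pc1, sc1) != (pc2, sc2):
#     print p.content, p.passage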
demo_one_sentence()
exit()  # early exit: the exploration code below never runs as written

essays = USTCReader.parseUSTCFile('USTC2011Jan.txt')
print len(essays)

essay = None
for e in essays:
    if e.id == "0092":
        essay = e
        break

# The essay
passage = EssayPassage()
passage.passage = essay.cleanContent()
passage.title = essay.title
passage.score = essay.score
passage.id = essay.id
passage.reviewerId = essay.reviewerId
passage.content = essay.content

# Process the passage
essayprepare.processPassage(passage)

# Print it out to see what it looks like
print "PASSAGE========================================="
print passage
print passage.id
print passage.title
print passage.score
print passage.passage
def processEssay(self):
    self.browser.clear()
    id = unicode(self.lineedit.text())
    essay = self.essayDict.get(id)
    if not essay:
        self.browser.append("<font color=red>%s is not found!</font>" % id)
        return
    self.browser.append(essay.content)

    # The essay
    passage = EssayPassage()
    passage.passage = essay.cleanContent()
    passage.title = essay.title
    passage.score = essay.score
    passage.id = essay.id

    # Process the passage
    essayprepare.processPassage(passage)

    # Print it out to see what it looks like
    self.browser.append("PASSAGE=========================================")
    self.browser.append(passage.id)
    #self.browser.append(passage.title)
    self.browser.append(passage.score)
    self.browser.append(passage.passage)
    self.browser.append(str(len(passage.paragraphs)))
    self.browser.append("PARAGRAPHS---------------------------------------")
    for para in passage.paragraphs:
        self.browser.append(str(para.paragraphNo))
        self.browser.append(para.paragraph)
        for sent in para.sentences:
            self.browser.append(str(sent.sentenceNo))
            self.browser.append(str(sent.paragraphSentenceNo))
            self.browser.append(sent.sentence)
            tokens = [token.token for token in sent.tokens]
            tags = [token.pos for token in sent.tokens]
            lemmas = [token.lemma for token in sent.tokens]
            stems = [token.stem for token in sent.tokens]
            levels = [token.level for token in sent.tokens]
            nos = [token.tokenNo for token in sent.tokens]
            sentNos = [token.sentenceTokenNo for token in sent.tokens]
            paraNos = [token.paragraphTokenNo for token in sent.tokens]
            errorTokens = [token.token for token in sent.tokens
                           if token.isSpellError]
            if not sent.canParsed:
                self.browser.append("<font color=red>SENTENCE ERROR</font>")
            self.browser.append("<font color=red>SPELLERROR %s</font>"
                                % str(errorTokens))
            self.browser.append(str(tokens))
            self.browser.append(str(tags))
            self.browser.append(str(lemmas))
            self.browser.append(str(stems))
            self.browser.append(str(levels))
            self.browser.append(str(sentNos))
            self.browser.append(str(paraNos))
            self.browser.append(str(nos))
            self.browser.append(str(sent.tokenCount))
            self.browser.append(str(sent.wordCount))
            self.browser.append(str(sent.realWordCount))

    self.browser.append(u"trigrams" + ' ' + str(passage.trigrams))

    e = FeatherExtractor()

    # Extract language features
    languageFeather = e.extractLangFeather(passage)
    print u"total tokens", languageFeather.tokenCount
    print u"total words", languageFeather.wordCount
    print u"word types", languageFeather.wordTypeCount
    print u"word lemmas", languageFeather.wordLemmaCount
    print u"preposition count", languageFeather.prepositionCount
    print u"preposition ratio", languageFeather.prepositionRatio
    print u"preposition use", languageFeather.prepositionUse
    print u"definite article count", languageFeather.definiteArticleCount
    print u"definite article ratio", languageFeather.definiteArticleRatio
    print u"definite article use", languageFeather.definiteArticleUse

    # Extract structure features
    #structureFeather = e.extractStructureFeather(passage)

#generateUSTCFeathers('USTC2011Jan.txt', 'USTCFeathers_503.txt')
print "...OVER"
def demo_crossvalidate_zhang():
    print "rater demo_crossvalidate_zhang"

    # Read the training set
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)

    # Read the test set
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        tests.append(passage)

    passages = []
    passages.extend(trains)
    passages.extend(tests)
    random.shuffle(passages)

    # Map raw scores to coarse labels and bucket the passages by label
    scoreEssays = {}
    for p in passages:
        # scores are stored as strings, so compare numerically
        if int(p.score) < 35:
            p.score = 35
        label = (int(p.score) + 2) / 5 - 4
        if label < 3:
            label = 3
            #continue
        if label > 14:
            label = 14
        p.label = label
        if label not in scoreEssays:
            scoreEssays[label] = []
        scoreEssays[label].append(p)

    # Build five folds, stratified by label; leftovers are dealt round-robin
    ps = [[], [], [], [], []]
    left = []
    for k, v in scoreEssays.items():
        print k
        print len(v)
        if len(v) > 5:
            s = len(v) / 5
            for i in range(5):
                ps[i].extend(v[i * s: (i + 1) * s])
            left.extend(v[5 * s:])
        else:
            left.extend(v)
    for j in range(len(left)):
        ps[j % 5].append(left[j])

    print "data sets: "
    for v in ps:
        print len(v)

    # 5-fold cross-validation: train on four folds, rate the fifth
    for i in range(5):
        trains = []
        tests = []
        for j in range(5):
            if i == j:
                tests.extend(ps[j])
            else:
                trains.extend(ps[j])
        r = CollegeEssayRater()
        r.train(trains)
        for p in tests:
            s = r.rate(p)
            p.newscore = s[0]
            print p.id, p.score, s

    # Correlate human scores with predicted scores; int() guards against
    # string-typed scores
    s1 = []
    s2 = []
    for p in passages:
        if p.label < 3:
            continue
        s1.append(int(p.score))
        s2.append(p.newscore)
        print p.id, p.score, p.endogScore, int(round(p.newscore)), \
            int(p.score) - int(round(p.newscore)), \
            p.lsaScore, p.lsaSimilarity, p.lsaSimilarityAll, p.lf.tokenCount, \
            p.lf.sentenceLengthAverage, p.lf.wordLengthAverage, \
            p.lf.noneStopWordLengthAverage, \
            p.lf.nounRatio, p.lf.verbRatio, p.lf.adjRatio, \
            p.lf.sentenceLengthSD, p.offsetRatio, \
            p.lf.aclWordCount, p.lf.aclWordRatio
    print scistats.pearsonr(s1, s2)

    pkfile = open('zhang_all.pkl', 'w')
    pickle.dump(passages, pkfile)
    pkfile.close()
    print "demo_crossvalidate over!!!"
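# The label mapping in demo_crossvalidate_zhang() compresses raw scores into
# 5-point bins. A minimal standalone sketch (not in the original code) of
# that mapping; score_to_label is a hypothetical name, and '//' makes the
# Python 2 floor division explicit.
def score_to_label(score):
    score = max(int(score), 35)      # clamp low scores, as above
    label = (score + 2) // 5 - 4     # 5-point bins: 35 -> 3, 60 -> 8
    return min(max(label, 3), 14)    # clamp into [3, 14]; 100 -> 14

# Spot checks: score_to_label(35) == 3, score_to_label(60) == 8,
# score_to_label(100) == 14.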