def demo2():
    """Train a CollegeEssayRater on the USTC 2011 Jan corpus and pickle the
    training passages.  The test-set scoring half of this demo is currently
    unreachable (see the exit() below).

    NOTE(review): another function named demo2 exists later in this file;
    whichever definition comes last wins at import time.
    """
    print "rater demo2"
    # Read the training set from the raw USTC exam dump.
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        # Wrap each parsed essay into an EssayPassage with the fields the
        # rater consumes (cleaned text, title, human score, ids).
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)
    # Train the essay rater on the wrapped passages.
    r = CollegeEssayRater()
    r.train(trains)
    # Persist the training passages (train() presumably annotates them
    # in place — TODO confirm against CollegeEssayRater).
    pkfile = open('USTC2011Jan.pkl', 'w')
    pickle.dump(trains, pkfile)
    pkfile.close()
    # NOTE(review): this exit() makes everything below dead code — it looks
    # like a debugging short-circuit that was left in on purpose.
    exit()
    # Read the test set.
    essays = USTCReader.parseUSTCFile("USTC2011Jan-tfidf.txt")
    tests = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        tests.append(passage)
    # Scoring test — per-passage rate() variant kept for reference:
    # for p in tests:
    #     s = r.rate(p)
    #     p.newscore = s[0]
    #     print p.id, p.score, s
    #
    # for p in tests:
    #     print p.id, p.score, p.newscore
    print "SVM......"
    r.predict(tests)
    # Persist the predicted test passages.
    pkfile = open('ustc_test.pkl', 'w')
    pickle.dump(tests, pkfile)
    pkfile.close()
    print "demo2 over!!!"
def demo():
    """Train a GeneralEssayRater on one reviewer's essays, rate a parallel
    test set, and pickle both the training and the scored test passages."""
    print "rater demo"
    # Read the training set.
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        # Wrap each parsed essay into an EssayPassage for the rater.
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)
    # Train the essay rater.
    r = GeneralEssayRater()
    r.train(trains)
    # Persist the training passages.
    pkfile = open('zhang_trains.pkl', 'w')
    pickle.dump(trains, pkfile)
    pkfile.close()
    # Read the test set (parallel corpus scored by reviewer Zhang).
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        tests.append(passage)
    # Scoring test: rate() returns a sequence whose first element is the
    # predicted score — TODO confirm the rest of the tuple's meaning.
    for p in tests:
        s = r.rate(p)
        p.newscore = s[0]
        print p.id, p.score, s
    # Summary pass: human score vs predicted score per essay.
    for p in tests:
        print p.id, p.score, p.newscore
    # Persist the scored test passages.
    pkfile = open('zhang_tests.pkl', 'w')
    pickle.dump(tests, pkfile)
    pkfile.close()
    print "demo over!!!"
def generateUSTCFeathers(ustcFilename, outFilename):
    """Parse a USTC exam file, wrap every essay into an EssayPassage,
    run feature generation over the passages, and pickle them.

    ustcFilename -- path of the raw USTC corpus file to parse
    outFilename  -- destination passed through to generatePassageFeathers
    """
    parsed = USTCReader.parseUSTCFile(ustcFilename)
    passages = []
    for src in parsed:
        # Carry over every field the downstream feature extractor reads,
        # including the raw (marked-up) content.
        wrapped = EssayPassage()
        wrapped.passage = src.cleanContent()
        wrapped.title = src.title
        wrapped.score = src.score
        wrapped.id = src.id
        wrapped.reviewerId = src.reviewerId
        wrapped.content = src.content
        passages.append(wrapped)
    # A shallow copy is handed to the feature generator, matching the
    # original call's passages[:] semantics.
    generatePassageFeathers(passages[:], outFilename)
    # Persist the (feature-annotated) passages for later stats demos.
    with open('ustcpassages_503_lt.pkl', 'w') as pkfile:
        pickle.dump(passages, pkfile)
def sentenceCheckStatsDemo():
    """Compare three per-sentence error signals over the pickled passages:
    human error marks in the corpus mark-up, parser failure (canParsed),
    and LanguageTool check results — and print agreement statistics."""
    print "sentenceCheckStatsDemo start..."
    # Load the passages produced by generateUSTCFeathers.
    pkfile = open('ustcpassages_503_lt.pkl', 'r')
    passages = pickle.load(pkfile)
    pkfile.close()
    sentCount = 0    # total sentences examined
    errorcount = 0   # sentences the human mark-up flags as erroneous
    lgcorrect = 0    # sentences where parser verdict agrees with mark-up
    lgtotal = 0      # sentences the parser failed on
    ltcorrect = 0    # sentences where LanguageTool agrees with mark-up
    lttotal = 0      # sentences LanguageTool flags
    allcorrect = 0   # sentences where parser and LanguageTool agree
    for p in passages:
        pltc = 0  # LanguageTool issue count accumulated over this passage
        # Flatten the already-processed sentence objects of this passage.
        osents = []
        for para in p.paragraphs:
            osents.extend(para.sentences)
        # Re-tokenize the raw marked-up content into sentences so the two
        # sequences can be aligned index by index.
        msents = []
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        for para in paras:
            msents.extend(essayprepare.markedSentenceTokenize(para))
        if len(osents) != len(msents):
            # Alignment failed — skip the passage rather than mis-pair.
            print "sentence count not equal", p.id
            print osents
            print msents
            continue
        # NOTE(review): the loop variable `os` shadows the `os` module
        # within this function body.
        for si, os in enumerate(osents):
            ms = msents[si]
            mkerror = 1  # 1 = human mark-up says erroneous
            lgerror = 1  # 1 = parser could not parse the sentence
            lterror = 1  # 1 = LanguageTool reports (non-ignored) issues
            ltc = 0      # LanguageTool issue count after filtering
            marks = USTCReader.findMarks(ms)
            # Marks limited to fm1/fm2/sw (presumably format/spelling-only
            # categories — TODO confirm against USTCReader) don't count as
            # real errors.
            onlysperror = True
            for mark in marks:
                if not mark[0] in ['fm1', 'fm2', 'sw']:
                    onlysperror = False
                    break
            if onlysperror:
                mkerror = 0
            #if ms.find('[') < 0 and ms.find(']') < 0:
            #    mkerror = 0
            if os.canParsed:
                lgerror = 0
            if len(os.ltCheckResults) == 0:
                lterror = 0
            else:
                # Discount LanguageTool rules that are noise for this
                # corpus (whitespace, capitalization, quote style, ...).
                ltc = len(os.ltCheckResults)
                for cr in os.ltCheckResults:
                    if cr['ruleId'] == 'WHITESPACE_RULE':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'COMMA_PARENTHESIS_WHITESPACE':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'UPPERCASE_SENTENCE_START':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'CAN_NOT':
                        ltc = ltc - 1
                    elif cr['ruleId'] == 'EN_QUOTES':
                        ltc = ltc - 1
                if ltc == 0:
                    lterror = 0
            # Update agreement tallies for this sentence.
            sentCount += 1
            if mkerror == 1:
                errorcount += 1
            if lgerror == 1:
                lgtotal += 1
            if lgerror == mkerror:
                lgcorrect += 1
            if lterror == 1:
                lttotal += 1
            if lterror == mkerror:
                ltcorrect += 1
            if lterror == lgerror:
                allcorrect += 1
            pltc += ltc
            #print p.id, p.score, len(os.tokens), mkerror, lgerror, lterror, ltc
            # print ms  #, ms, os.sentence
            # print os.sentence
            # if len(os.ltCheckResults) > 0:
            #     for cr in os.ltCheckResults:
            #         print cr
        print p.id, p.score, pltc
    print sentCount, errorcount, lgtotal, lgcorrect, lttotal, ltcorrect, allcorrect
    print "sentenceCheckStatsDemo over!!!"
def demo2():
    """Diagnostic: verify that tokenizing the raw marked-up content and the
    cleaned passage text yields the same paragraph and sentence counts,
    printing any passage where the two disagree.

    NOTE(review): duplicates the name of an earlier demo2 in this file;
    the later definition shadows the earlier one at import time.
    """
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        # Keep both the raw content and the cleaned passage so the two
        # tokenization paths can be compared below.
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        trains.append(passage)
    # for p in trains[:30]:
    #     essayprepare.processPassage(p)
    for p in trains[:100]:
        # Split paragraphs, path 1: raw marked-up content with the
        # mark-aware sentence tokenizer.
        print "+++++++++++++++++++++++"
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        pcount1 = len(paras)
        scount1 = 0
        for para in paras:
            sents = essayprepare.markedSentenceTokenize(para)
            # for sent in sents:
            #     print "### ", sent
            scount1 += len(sents)
        # Path 2: cleaned passage text with the plain sentence tokenizer.
        print "-----------------------"
        paras = essayprepare.para_tokenizer.tokenize(p.passage)
        pcount2 = len(paras)
        scount2 = 0
        for para in paras:
            sents = essayprepare.sent_tokenizer.tokenize(para)
            # for sent in sents:
            #     print "### ", sent
            scount2 += len(sents)
        # Dump any passage where the two tokenizations disagree.
        if pcount1 != pcount2 or scount1 != scount2:
            print p.content
            print p.passage
            print "\n"
    # Earlier experiments kept for reference: spell-error inspection and a
    # shallow chunk-grammar probe for a verb-phrase error pattern.
    # for i, p in enumerate(trains[:30]):
    #     for para in p.paragraphs:
    #         for sent in para.sentences:
    #             for token in sent.tokens:
    #                 if token.isSpellError:
    #                     print token.token, token.candidates
    #     for m in essays[i].findMarks():
    #         if m[0] == 'fm1' or m[0] == 'fm2' or m[0] == 'sw':
    #             print m
    # egrammar_vp1 = "EVP1: {<NN><RB>?<VB>}"
    # parser_evp1 = nltk.RegexpParser(egrammar_vp1)
    #
    # for p in trains[:50]:
    #     for para in p.paragraphs:
    #         for sent in para.sentences:
    #             sentence = [(token.token, token.pos) for token in sent.tokens]
    #             result = parser_evp1.parse(sentence)
    #             r = str(result)
    #             if r.find('EVP1') > 0: print r
    print "demo2 over"
passage.lf = lf cf = extractor.extractContentFeather(passage) sf = extractor.extractStructureFeather(passage) print 'OK' if __name__ == "__main__": print "Start..." #sentenceCheckStatsDemo() #wordRepetitiveDemo() #demo2() demo_one_sentence() exit() essays = USTCReader.parseUSTCFile('USTC2011Jan.txt') print len(essays) essay = None for e in essays: if e.id == "0092": essay = e break # 文章 passage = EssayPassage() passage.passage = essay.cleanContent() passage.title = essay.title passage.score = essay.score passage.id = essay.id passage.reviewerId = essay.reviewerId
def __loadEssays(self): self.essays = USTCReader.parseUSTCFile('USTC2011Jan.txt') print len(self.essays) for e in self.essays: self.essayDict[e.id] = e print len(self.essayDict)
def demo_crossvalidate_zhang():
    """5-fold cross-validation of CollegeEssayRater over the combined
    r1_265 training essays and the Zhang-scored parallel test essays,
    printing per-essay feature dumps and the Pearson correlation between
    human and predicted scores, then pickling all passages."""
    print "rater demo_crossvalidate_zhang"
    # Read the training set.
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)
    # Read the test set.
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        tests.append(passage)
    # Pool both sets and shuffle before binning into score labels.
    passages = []
    passages.extend(trains)
    passages.extend(tests)
    random.shuffle(passages)
    scoreEssays = {}  # label -> list of passages with that label
    for p in passages:
        # Clamp low human scores, then bin into integer labels via
        # Python-2 integer division; labels are clamped to [3, 14].
        if p.score < 35:
            p.score = 35
        label = (int(p.score) + 2) / 5 - 4
        if label < 3:
            label = 3
            #continue
        if label > 14:
            label = 14
        p.label = label
        if label not in scoreEssays:
            scoreEssays[label] = []
        scoreEssays[label].append(p)
    # Cross-validate: distribute each label's passages evenly across the
    # 5 folds (stratified), spreading remainders round-robin.
    ps = [[], [], [], [], []]
    left = []
    for k, v in scoreEssays.items():
        print k
        print len(v)
        if len(v) > 5:
            s = len(v) / 5
            for i in range(5):
                ps[i].extend(v[i*s: (i+1)*s])
            left.extend(v[5*s:])
        else:
            left.extend(v)
    for j in range(len(left)):
        ps[j % 5].append(left[j])
    print "data sets: "
    for v in ps:
        print len(v)
    # Train on 4 folds, rate the held-out fold; every passage ends up
    # rated exactly once across the 5 iterations.
    for i in range(5):
        trains = []
        tests = []
        for j in range(5):
            if i == j:
                tests.extend(ps[j])
            else:
                trains.extend(ps[j])
        r = CollegeEssayRater()
        r.train(trains)
        for p in tests:
            s = r.rate(p)
            p.newscore = s[0]
            print p.id, p.score, s
    # Collect human vs predicted scores and dump the per-essay features
    # (presumably set on the passage by train()/rate() — TODO confirm).
    s1 = []
    s2 = []
    for p in passages:
        if p.label < 3:
            continue
        s1.append(int(p.score))
        s2.append(p.newscore)
        print p.id, p.score, p.endogScore, int(round(p.newscore)), p.score - int(round(p.newscore)), \
            p.lsaScore, p.lsaSimilarity, p.lsaSimilarityAll, p.lf.tokenCount, \
            p.lf.sentenceLengthAverage, p.lf.wordLengthAverage, p.lf.noneStopWordLengthAverage, \
            p.lf.nounRatio, p.lf.verbRatio, p.lf.adjRatio, p.lf.sentenceLengthSD, p.offsetRatio, \
            p.lf.aclWordCount, p.lf.aclWordRatio
    # Pearson correlation between human and predicted scores.
    print scistats.pearsonr(s1, s2)
    # Persist every passage with its cross-validated prediction.
    pkfile = open('zhang_all.pkl', 'w')
    pickle.dump(passages, pkfile)
    pkfile.close()
    print "demo_crossvalidate over!!!"