def demo2(): print "rater demo2" # 读训练集 essays = USTCReader.parseUSTCFile("USTC2011Jan.txt") trains = [] for essay in essays: passage = EssayPassage() passage.passage = essay.cleanContent() passage.title = essay.title passage.score = essay.score passage.id = essay.id passage.reviewerId = essay.reviewerId trains.append(passage) # 训练打分器 r = CollegeEssayRater() r.train(trains) pkfile = open('USTC2011Jan.pkl', 'w') pickle.dump(trains, pkfile) pkfile.close() exit() # 读测试集 essays = USTCReader.parseUSTCFile("USTC2011Jan-tfidf.txt") tests = [] for essay in essays: passage = EssayPassage() passage.passage = essay.cleanContent() passage.title = essay.title passage.score = essay.score passage.id = essay.id tests.append(passage) # 打分测试 # for p in tests: # s = r.rate(p) # p.newscore = s[0] # print p.id, p.score, s # # for p in tests: # print p.id, p.score, p.newscore print "SVM......" r.predict(tests) pkfile = open('ustc_test.pkl', 'w') pickle.dump(tests, pkfile) pkfile.close() print "demo2 over!!!"
def demo(): print "rater demo" # 读训练集 essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt") trains = [] for essay in essays[:]: passage = EssayPassage() passage.passage = essay.cleanContent() passage.title = essay.title passage.score = essay.score passage.id = essay.id passage.reviewerId = essay.reviewerId trains.append(passage) # 训练打分器 r = GeneralEssayRater() r.train(trains) pkfile = open('zhang_trains.pkl', 'w') pickle.dump(trains, pkfile) pkfile.close() # 读测试集 essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt") tests = [] for essay in essays[:]: passage = EssayPassage() passage.passage = essay.cleanContent() passage.title = essay.title passage.score = essay.score passage.id = essay.id tests.append(passage) # 打分测试 for p in tests: s = r.rate(p) p.newscore = s[0] print p.id, p.score, s for p in tests: print p.id, p.score, p.newscore pkfile = open('zhang_tests.pkl', 'w') pickle.dump(tests, pkfile) pkfile.close() print "demo over!!!"
def demo_one(): content = """At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places. At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places. At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places. At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places.""" # 文章 passage = EssayPassage() passage.passage = content passage.title = 'title' passage.score = 5 passage.id = '1' passage.reviewerId = 3 passage.content = content r = SimpleEssayRater() s = r.rate_by_params(passage) passage.newscore = s[0] print passage.id, passage.score, s print 'OK'
def generateUSTCFeathers(ustcFilename, outFilename):
    """Extract features for every essay in a USTC-format file and pickle
    the passage list.

    Parameters:
        ustcFilename: path of the USTC-format essay file to read.
        outFilename: path handed to generatePassageFeathers for the
            feature output.

    Side effects: writes ustcpassages_503_lt.pkl to the working directory.
    """
    essays = USTCReader.parseUSTCFile(ustcFilename)
    passages = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        passages.append(passage)
    # A shallow copy is passed so the callee cannot reorder our list.
    generatePassageFeathers(passages[:], outFilename)
    # NOTE(review): the pickle filename is hard-coded and ignores the
    # input filename -- confirm this is intended before reusing this
    # function on other data sets.
    # `with` guarantees the file handle is closed even if dump() raises.
    with open('ustcpassages_503_lt.pkl', 'w') as pkfile:
        pickle.dump(passages, pkfile)
def demo_one_sentence(): # 文章 passage = EssayPassage() passage.passage = 'I am a students.' passage.title = 'title' passage.score = 5 passage.id = '1' passage.reviewerId = 3 passage.content = 'I am a students.' # 处理文章 essayprepare.processPassage(passage) extractor = FeatherExtractor() lf = extractor.extractLangFeather(passage) passage.lf = lf cf = extractor.extractContentFeather(passage) sf = extractor.extractStructureFeather(passage) print 'OK'
def demo2():
    """Tokenisation sanity check: for the first 100 USTC essays, compare
    the paragraph and sentence counts obtained from the raw marked-up
    content against those from the cleaned passage text, and print any
    essay where the two disagree.

    NOTE(review): this redefines ``demo2`` (an earlier definition exists
    in this file); this later definition is the one that wins.
    """
    essays = USTCReader.parseUSTCFile("USTC2011Jan.txt")
    trains = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        trains.append(passage)
    # for p in trains[:30]:
    #     essayprepare.processPassage(p)
    for p in trains[:100]:
        # Split paragraphs / sentences of the raw (marked-up) content.
        print "+++++++++++++++++++++++"
        paras = essayprepare.para_tokenizer.tokenize(p.content)
        pcount1 = len(paras)
        scount1 = 0
        for para in paras:
            sents = essayprepare.markedSentenceTokenize(para)
            # for sent in sents:
            #     print "### ", sent
            scount1 += len(sents)
        print "-----------------------"
        # Split paragraphs / sentences of the cleaned passage text.
        paras = essayprepare.para_tokenizer.tokenize(p.passage)
        pcount2 = len(paras)
        scount2 = 0
        for para in paras:
            sents = essayprepare.sent_tokenizer.tokenize(para)
            # for sent in sents:
            #     print "### ", sent
            scount2 += len(sents)
        # Report essays whose counts differ between the two views.
        if pcount1 != pcount2 or scount1 != scount2:
            print p.content
            print p.passage
            print "\n"
    # Earlier exploratory passes, kept for reference:
    # for i, p in enumerate(trains[:30]):
    #     for para in p.paragraphs:
    #         for sent in para.sentences:
    #             for token in sent.tokens:
    #                 if token.isSpellError:
    #                     print token.token, token.candidates
    #     for m in essays[i].findMarks():
    #         if m[0] == 'fm1' or m[0] == 'fm2' or m[0] == 'sw':
    #             print m
    # egrammar_vp1 = "EVP1: {<NN><RB>?<VB>}"
    # parser_evp1 = nltk.RegexpParser(egrammar_vp1)
    #
    # for p in trains[:50]:
    #     for para in p.paragraphs:
    #         for sent in para.sentences:
    #             sentence = [(token.token, token.pos) for token in sent.tokens]
    #             result = parser_evp1.parse(sentence)
    #             r = str(result)
    #             if r.find('EVP1') > 0: print r
    print "demo2 over"
essays = USTCReader.parseUSTCFile('USTC2011Jan.txt') print len(essays) essay = None for e in essays: if e.id == "0092": essay = e break # 文章 passage = EssayPassage() passage.passage = essay.cleanContent() passage.title = essay.title passage.score = essay.score passage.id = essay.id passage.reviewerId = essay.reviewerId passage.content = essay.content # 处理文章 essayprepare.processPassage(passage) # 输出来看看是啥样子 print "PASSAGE=========================================" print passage print passage.id print passage.title print passage.score print passage.passage print len(passage.paragraphs) print "PARAGRAPHS---------------------------------------" for para in passage.paragraphs:
def demo_crossvalidate_zhang():
    """5-fold cross-validation of CollegeEssayRater on the combined Zhang
    training corpus and parallel test corpus.

    Passages are bucketed by a coarse score label, each bucket is dealt
    evenly into five folds (leftovers round-robin), and each fold is
    scored by a rater trained on the other four.  Finally the Pearson
    correlation between human and predicted scores is printed and all
    passages are pickled to zhang_all.pkl.
    """
    print "rater demo_crossvalidate_zhang"
    # Load the training set.
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)
    # Load the test set (no reviewerId in the parallel corpus).
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        tests.append(passage)
    # Pool both corpora and shuffle before fold assignment.
    passages = []
    passages.extend(trains)
    passages.extend(tests)
    random.shuffle(passages)
    # Bucket passages by a coarse label.  Scores are clamped to >= 35,
    # then mapped with Python 2 integer (floor) division so labels are
    # clamped into the range 3..14.
    scoreEssays = {}
    for p in passages:
        if p.score < 35:
            p.score = 35
        label = (int(p.score) + 2) / 5 - 4
        if label < 3:
            label = 3
            #continue
        if label > 14:
            label = 14
        p.label = label
        if label not in scoreEssays:
            scoreEssays[label] = []
        scoreEssays[label].append(p)
    # Cross-validation fold assignment: deal each label bucket evenly
    # into 5 folds; any remainder goes into `left` and is distributed
    # round-robin afterwards so fold sizes stay balanced.
    ps = [[], [], [], [], []]
    left = []
    for k, v in scoreEssays.items():
        print k
        print len(v)
        if len(v) > 5:
            s = len(v) / 5
            for i in range(5):
                ps[i].extend(v[i*s: (i+1)*s])
            left.extend(v[5*s:])
        else:
            left.extend(v)
    for j in range(len(left)):
        ps[j % 5].append(left[j])
    print "data sets: "
    for v in ps:
        print len(v)
    # For each fold i: train on the other four folds, rate fold i.
    for i in range(5):
        trains = []
        tests = []
        for j in range(5):
            if i == j:
                tests.extend(ps[j])
            else:
                trains.extend(ps[j])
        r = CollegeEssayRater()
        r.train(trains)
        for p in tests:
            s = r.rate(p)
            p.newscore = s[0]
            print p.id, p.score, s
    # Collect human vs. predicted scores (skipping label < 3) and dump
    # per-passage feature details for offline inspection.
    s1 = []
    s2 = []
    for p in passages:
        if p.label < 3:
            continue
        s1.append(int(p.score))
        s2.append(p.newscore)
        print p.id, p.score, p.endogScore, int(round(p.newscore)), p.score - int(round(p.newscore)), \
            p.lsaScore, p.lsaSimilarity, p.lsaSimilarityAll, p.lf.tokenCount, \
            p.lf.sentenceLengthAverage, p.lf.wordLengthAverage, p.lf.noneStopWordLengthAverage, \
            p.lf.nounRatio, p.lf.verbRatio, p.lf.adjRatio, p.lf.sentenceLengthSD, p.offsetRatio, \
            p.lf.aclWordCount, p.lf.aclWordRatio
    # Pearson correlation between human scores and predictions.
    print scistats.pearsonr(s1, s2)
    pkfile = open('zhang_all.pkl', 'w')
    pickle.dump(passages, pkfile)
    pkfile.close()
    print "demo_crossvalidate over!!!"