def demo_one(): content = """At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places. At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places. At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places. At present ,more and more students in the college are encouraged to go to the poor places for aid education . This activity is of great benefits for both our college students and the poor places.""" # 文章 passage = EssayPassage() passage.passage = content passage.title = 'title' passage.score = 5 passage.id = '1' passage.reviewerId = 3 passage.content = content r = SimpleEssayRater() s = r.rate_by_params(passage) passage.newscore = s[0] print passage.id, passage.score, s print 'OK'
def demo_one_sentence(): # 文章 passage = EssayPassage() passage.passage = 'I am a students.' passage.title = 'title' passage.score = 5 passage.id = '1' passage.reviewerId = 3 passage.content = 'I am a students.' # 处理文章 essayprepare.processPassage(passage) extractor = FeatherExtractor() lf = extractor.extractLangFeather(passage) passage.lf = lf cf = extractor.extractContentFeather(passage) sf = extractor.extractStructureFeather(passage) print 'OK'
def simlarityTest(): pkfile = open('rater.pkl', 'r') rater = pickle.load(pkfile) pkfile.close() essays = CLECReader.parseCLECFile2('clecst/ST3.txt') print len(essays) essayDict = {} for e in essays: if not essayDict.has_key(e.title): essayDict[e.title] = [] essayDict[e.title].append(e) print essayDict.keys() for k, v in essayDict.items(): print len(v), k passages = [] count = 0 for e in essayDict['Global Shortage of Fresh Water'][:120]: count += 1 newpassage = EssayPassage() newpassage.passage = e.content newpassage.id = str(count) newpassage.score = e.score newpassage.processStatus = 0 passages.append(newpassage) rater.rate(newpassage) for p in passages: print p.score, p.rateScore, p.lsaSimilarity, p.lsaSimilarityAll print "OK"
def generateUSTCFeathers(ustcFilename, outFilename):
    """Extract features for every essay of a USTC corpus file.

    ustcFilename -- path of the USTC-format corpus to read
    outFilename  -- path the feature table is written to

    Side effect: pickles the passage list to 'ustcpassages_503_lt.pkl'.
    """
    essays = USTCReader.parseUSTCFile(ustcFilename)
    passages = []
    for essay in essays:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        passage.content = essay.content
        passages.append(passage)
    generatePassageFeathers(passages[:], outFilename)
    # 'wb' + with: pickle needs binary mode, and the handle must be
    # closed even if dump() raises (the original opened 'w' and leaked
    # the file on error).
    with open('ustcpassages_503_lt.pkl', 'wb') as pkfile:
        pickle.dump(passages, pkfile)
def demo2(): essays = USTCReader.parseUSTCFile("USTC2011Jan.txt") trains = [] for essay in essays: passage = EssayPassage() passage.passage = essay.cleanContent() passage.title = essay.title passage.score = essay.score passage.id = essay.id passage.reviewerId = essay.reviewerId passage.content = essay.content trains.append(passage) # for p in trains[:30]: # essayprepare.processPassage(p) for p in trains[:100]: # 拆分段落 print "+++++++++++++++++++++++" paras = essayprepare.para_tokenizer.tokenize(p.content) pcount1 = len(paras) scount1 = 0 for para in paras: sents = essayprepare.markedSentenceTokenize(para) # for sent in sents: # print "### ", sent scount1 += len(sents) print "-----------------------" paras = essayprepare.para_tokenizer.tokenize(p.passage) pcount2 = len(paras) scount2 = 0 for para in paras: sents = essayprepare.sent_tokenizer.tokenize(para) # for sent in sents: # print "### ", sent scount2 += len(sents) if pcount1 != pcount2 or scount1 != scount2: print p.content print p.passage print "\n" # for i, p in enumerate(trains[:30]): # for para in p.paragraphs: # for sent in para.sentences: # for token in sent.tokens: # if token.isSpellError: # print token.token, token.candidates # for m in essays[i].findMarks(): # if m[0] == 'fm1' or m[0] == 'fm2' or m[0] == 'sw': # print m # egrammar_vp1 = "EVP1: {<NN><RB>?<VB>}" # parser_evp1 = nltk.RegexpParser(egrammar_vp1) # # for p in trains[:50]: # for para in p.paragraphs: # for sent in para.sentences: # sentence = [(token.token, token.pos) for token in sent.tokens] # result = parser_evp1.parse(sentence) # r = str(result) # if r.find('EVP1') > 0: print r print "demo2 over"
#wordRepetitiveDemo() #demo2() demo_one_sentence() exit() essays = USTCReader.parseUSTCFile('USTC2011Jan.txt') print len(essays) essay = None for e in essays: if e.id == "0092": essay = e break # 文章 passage = EssayPassage() passage.passage = essay.cleanContent() passage.title = essay.title passage.score = essay.score passage.id = essay.id passage.reviewerId = essay.reviewerId passage.content = essay.content # 处理文章 essayprepare.processPassage(passage) # 输出来看看是啥样子 print "PASSAGE=========================================" print passage print passage.id print passage.title
def do_task(task):
    """Process one scoring task end-to-end and commit the result.

    task -- dict with 'id' and 'input'['content']; on success its
    'output'/'simple_output'/'detail_output' are filled with JSON and
    'status' becomes 'DONE'; on any processing failure 'status' becomes
    'TUTERR' with progress -2.
    """
    newpassage = EssayPassage()
    newpassage.passage = task['input']['content']
    newpassage.orderId = task['id']
    newpassage.score = 0
    newpassage.processStatus = 0
    try:
        essayprepare.processPassage(newpassage, fn_prepare_progress)
        fe = extractor.FeatherExtractor()
        lf = fe.extractLangFeather(newpassage)
        newpassage.lf = lf
        cf = fe.extractContentFeather(newpassage)
        newpassage.cf = cf
        sf = fe.extractStructureFeather(newpassage)
        newpassage.sf = sf
        newpassage.score = rater.rate_by_params(newpassage)[0]
    except Exception:
        # Log the failure instead of silently discarding it.  The
        # original bare `except:` also caught SystemExit and
        # KeyboardInterrupt, which must not be masked here.
        import traceback
        traceback.print_exc()
        task['progress'] = -2
        task['status'] = 'TUTERR'
        task['output'] = ""
        task['simple_output'] = ""
        task['detail_output'] = ""
        commit_task(task)
        return
    # Build the final JSON-serializable result.
    output = {}
    passage = {}
    passage['score'] = newpassage.score
    passage['token_count'] = lf.tokenCount
    passage['word_count'] = lf.wordCount
    passage['word_type_count'] = lf.wordTypeCount
    passage['word_lemma_count'] = lf.wordLemmaCount
    passage['word_stem_count'] = lf.wordStemCount
    passage['average_word_length'] = lf.wordLengthAverage
    passage['average_sentence_length'] = lf.sentenceLengthAverage
    passage['overly_use_word_count'] = lf.overlyUseWordCount
    passage['paragraph_count'] = len(newpassage.paragraphs)
    passage['sentence_count'] = newpassage.sentenceCount
    passage['sentences'] = []
    for para in newpassage.paragraphs:
        for sent in para.sentences:
            sentence = {}
            sentence['no'] = sent.sentenceNo
            sentence['para_no'] = para.paragraphNo
            sentence['original'] = sent.sentence
            sentence['score'] = 0
            spell_errors = []
            fs = []  # tokens with spelling errors wrapped in <ESP> tags
            for token in sent.tokens:
                if token.isSpellError:
                    fs.append('<ESP>' + token.token + '</ESP>')
                    spell_error = {}
                    spell_error['token'] = token.token
                    spell_error['lemma'] = token.lemma
                    spell_error['suggest'] = token.candidates
                    spell_error['start_at'] = token.startAt
                    spell_error['end_at'] = token.endAt
                    spell_errors.append(spell_error)
                else:
                    fs.append(token.token)
            sentence['spell_errors'] = spell_errors
            sentence['marked'] = ' '.join(fs)
            sentence['lt_result'] = sent.ltCheckResults
            sentence['lg_result'] = sent.lgCheckResults
            sentence['links'] = []
            passage['sentences'].append(sentence)
    output['passage'] = passage
    task['progress'] = 100
    task['status'] = 'DONE'
    task['output'] = json.dumps(output)
    task['simple_output'] = json.dumps(output)
    task['detail_output'] = json.dumps(generate_detail_output(newpassage))
    commit_task(task)
def processEssay(self):
    """Look up the essay typed into the line edit, run the prepare
    pipeline on it, and dump every annotation level (passage,
    paragraphs, sentences, tokens) plus language features for manual
    inspection in the text browser."""
    self.browser.clear()
    id = unicode(self.lineedit.text())
    essay = self.essayDict.get(id)
    if not essay:
        self.browser.append("<font color=red>%s is not found!</font>" % id)
        return
    self.browser.append(essay.content)
    # The passage.
    passage = EssayPassage()
    passage.passage = essay.cleanContent()
    passage.title = essay.title
    passage.score = essay.score
    passage.id = essay.id
    # Process the passage (tokenize / tag / annotate in place).
    essayprepare.processPassage(passage)
    # Dump it to see what it looks like.
    self.browser.append("PASSAGE=========================================")
    self.browser.append(passage.id)
    #self.browser.append(passage.title)
    # NOTE(review): score is appended without str() unlike the other
    # numeric fields — presumably it is already a string; confirm.
    self.browser.append(passage.score)
    self.browser.append(passage.passage)
    self.browser.append(str(len(passage.paragraphs)))
    self.browser.append("PARAGRAPHS---------------------------------------")
    for para in passage.paragraphs:
        self.browser.append(str(para.paragraphNo))
        self.browser.append(para.paragraph)
        for sent in para.sentences:
            self.browser.append(str(sent.sentenceNo))
            self.browser.append(str(sent.paragraphSentenceNo))
            self.browser.append(sent.sentence)
            # Parallel per-token attribute lists for display.
            tokens = [token.token for token in sent.tokens]
            tags = [token.pos for token in sent.tokens]
            lemmas = [token.lemma for token in sent.tokens]
            stems = [token.stem for token in sent.tokens]
            levels = [token.level for token in sent.tokens]
            nos = [token.tokenNo for token in sent.tokens]
            sentNos = [token.sentenceTokenNo for token in sent.tokens]
            paraNos = [token.paragraphTokenNo for token in sent.tokens]
            errorTokens = [token.token for token in sent.tokens if token.isSpellError]
            if not sent.canParsed:
                self.browser.append("<font color=red>SENTENCE ERROR</font>")
            self.browser.append("<font color=red>SPELLERROR %s</font>" % str(errorTokens))
            self.browser.append(str(tokens))
            self.browser.append(str(tags))
            self.browser.append(str(lemmas))
            self.browser.append(str(stems))
            self.browser.append(str(levels))
            self.browser.append(str(sentNos))
            self.browser.append(str(paraNos))
            self.browser.append(str(nos))
            self.browser.append(str(sent.tokenCount))
            self.browser.append(str(sent.wordCount))
            self.browser.append(str(sent.realWordCount))
    # u"三元词组" = "trigrams" (runtime string, left untranslated).
    self.browser.append(u"三元词组" + ' ' + str(passage.trigrams))
    e = FeatherExtractor()
    # Extract language features and print them (labels are Chinese:
    # token count, word count, word type count, lemma count,
    # preposition count/ratio/use, definite-article count/ratio/use).
    languageFeather = e.extractLangFeather(passage)
    print u"词次总数", languageFeather.tokenCount
    print u"单词总数", languageFeather.wordCount
    print u"词形总数", languageFeather.wordTypeCount
    print u"词元总数", languageFeather.wordLemmaCount
    print u"介词个数", languageFeather.prepositionCount
    print u"介词比例", languageFeather.prepositionRatio
    print u"介词使用", languageFeather.prepositionUse
    print u"定冠词个数", languageFeather.definiteArticleCount
    print u"定冠词比例", languageFeather.definiteArticleRatio
    print u"定冠词使用", languageFeather.definiteArticleUse
    # Extract structure features (disabled).
    #structureFeather = e.extractStructureFeather(passage)
    #generateUSTCFeathers('USTC2011Jan.txt', 'USTCFeathers_503.txt')
    print "...OVER"
orderId = 1 waitingPassages = [] donePassages = {} passage = None while True: request = socket.recv() print request try: rs = json.loads(request) except: socket.send("") continue if rs['ACTION'] == 'SUBMIT': orderId += 1 newpassage = EssayPassage() newpassage.passage = rs['text'] newpassage.orderId = orderId newpassage.score = 0 newpassage.processStatus = 0 waitingPassages.append(newpassage) if ((not passage) or passage.rated) and len(waitingPassages) > 0: passage = waitingPassages.pop(0) donePassages[passage.orderId] = passage rthread = RatePassageThread(rater, passage) rthread.start() reply = json.dumps({'orderId':orderId, 'progress':0, 'rated':0}) socket.send_unicode(reply) elif rs['ACTION'] == 'QUERY': oId = int(rs['orderId']) if not oId in donePassages:
def demo(): print "rater demo" # 读训练集 essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt") trains = [] for essay in essays[:]: passage = EssayPassage() passage.passage = essay.cleanContent() passage.title = essay.title passage.score = essay.score passage.id = essay.id passage.reviewerId = essay.reviewerId trains.append(passage) # 训练打分器 r = GeneralEssayRater() r.train(trains) pkfile = open('zhang_trains.pkl', 'w') pickle.dump(trains, pkfile) pkfile.close() # 读测试集 essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt") tests = [] for essay in essays[:]: passage = EssayPassage() passage.passage = essay.cleanContent() passage.title = essay.title passage.score = essay.score passage.id = essay.id tests.append(passage) # 打分测试 for p in tests: s = r.rate(p) p.newscore = s[0] print p.id, p.score, s for p in tests: print p.id, p.score, p.newscore pkfile = open('zhang_tests.pkl', 'w') pickle.dump(tests, pkfile) pkfile.close() print "demo over!!!"
def demo2(): print "rater demo2" # 读训练集 essays = USTCReader.parseUSTCFile("USTC2011Jan.txt") trains = [] for essay in essays: passage = EssayPassage() passage.passage = essay.cleanContent() passage.title = essay.title passage.score = essay.score passage.id = essay.id passage.reviewerId = essay.reviewerId trains.append(passage) # 训练打分器 r = CollegeEssayRater() r.train(trains) pkfile = open('USTC2011Jan.pkl', 'w') pickle.dump(trains, pkfile) pkfile.close() exit() # 读测试集 essays = USTCReader.parseUSTCFile("USTC2011Jan-tfidf.txt") tests = [] for essay in essays: passage = EssayPassage() passage.passage = essay.cleanContent() passage.title = essay.title passage.score = essay.score passage.id = essay.id tests.append(passage) # 打分测试 # for p in tests: # s = r.rate(p) # p.newscore = s[0] # print p.id, p.score, s # # for p in tests: # print p.id, p.score, p.newscore print "SVM......" r.predict(tests) pkfile = open('ustc_test.pkl', 'w') pickle.dump(tests, pkfile) pkfile.close() print "demo2 over!!!"
def demo_crossvalidate_zhang():
    """5-fold cross-validation of CollegeEssayRater on the combined
    Zhang training + parallel test essays, stratified by score bucket,
    printing per-essay diagnostics and the Pearson correlation between
    true and predicted scores."""
    print "rater demo_crossvalidate_zhang"
    # Read the training set.
    essays = USTCReader.parseUSTCFile("essayreader/r1_265.txt")
    trains = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        passage.reviewerId = essay.reviewerId
        trains.append(passage)
    # Read the test set.
    essays = USTCReader.parseUSTCFile("USTC2011Jan_Parallel_Zhang.txt")
    tests = []
    for essay in essays[:]:
        passage = EssayPassage()
        passage.passage = essay.cleanContent()
        passage.title = essay.title
        passage.score = essay.score
        passage.id = essay.id
        tests.append(passage)
    # Pool everything and shuffle before bucketing.
    passages = []
    passages.extend(trains)
    passages.extend(tests)
    random.shuffle(passages)
    scoreEssays = {}
    for p in passages:
        # Floor low scores, then map score -> bucket label using
        # Python-2 integer division, clamped to [3, 14].
        if p.score < 35:
            p.score = 35
        label = (int(p.score) + 2) / 5 - 4
        if label < 3:
            label = 3
            #continue
        if label > 14:
            label = 14
        p.label = label
        if label not in scoreEssays:
            scoreEssays[label] = []
        scoreEssays[label].append(p)
    # Cross validate: deal each bucket's essays round the 5 folds so
    # every fold gets a similar score distribution.
    ps = [[], [], [], [], []]
    left = []  # remainder essays distributed round-robin afterwards
    for k, v in scoreEssays.items():
        print k
        print len(v)
        if len(v) > 5:
            s = len(v) / 5
            for i in range(5):
                ps[i].extend(v[i*s: (i+1)*s])
            left.extend(v[5*s:])
        else:
            left.extend(v)
    for j in range(len(left)):
        ps[j % 5].append(left[j])
    print "data sets: "
    for v in ps:
        print len(v)
    # Train on 4 folds, rate the held-out fold; every passage ends up
    # rated exactly once.
    for i in range(5):
        trains = []
        tests = []
        for j in range(5):
            if i == j:
                tests.extend(ps[j])
            else:
                trains.extend(ps[j])
        r = CollegeEssayRater()
        r.train(trains)
        for p in tests:
            s = r.rate(p)
            p.newscore = s[0]
            print p.id, p.score, s
    # Collect true vs. predicted scores and print diagnostics.
    s1 = []
    s2 = []
    for p in passages:
        if p.label < 3:
            continue
        s1.append(int(p.score))
        s2.append(p.newscore)
        print p.id, p.score, p.endogScore, int(round(p.newscore)), p.score - int(round(p.newscore)), \
            p.lsaScore, p.lsaSimilarity, p.lsaSimilarityAll, p.lf.tokenCount, \
            p.lf.sentenceLengthAverage, p.lf.wordLengthAverage, p.lf.noneStopWordLengthAverage, \
            p.lf.nounRatio, p.lf.verbRatio, p.lf.adjRatio, p.lf.sentenceLengthSD, p.offsetRatio, \
            p.lf.aclWordCount, p.lf.aclWordRatio
    print scistats.pearsonr(s1, s2)
    # NOTE(review): pickle file opened in text mode 'w' and not closed
    # on error — should be  with open(..., 'wb') ; left unchanged here.
    pkfile = open('zhang_all.pkl', 'w')
    pickle.dump(passages, pkfile)
    pkfile.close()
    print "demo_crossvalidate over!!!"
def processEssay(self):
    """Look up the essay typed into the line edit, run the prepare
    pipeline on it, and dump every annotation level (passage,
    paragraphs, sentences, tokens) plus language features for manual
    inspection in the text browser."""
    self.browser.clear()
    id = unicode(self.lineedit.text())
    essay = self.essayDict.get(id)
    if not essay:
        self.browser.append("<font color=red>%s is not found!</font>" % id)
        return
    self.browser.append(essay.content)
    # The passage.
    passage = EssayPassage()
    passage.passage = essay.cleanContent()
    passage.title = essay.title
    passage.score = essay.score
    passage.id = essay.id
    # Process the passage (tokenize / tag / annotate in place).
    essayprepare.processPassage(passage)
    # Dump it to see what it looks like.
    self.browser.append("PASSAGE=========================================")
    self.browser.append(passage.id)
    #self.browser.append(passage.title)
    # NOTE(review): score is appended without str() unlike the other
    # numeric fields — presumably it is already a string; confirm.
    self.browser.append(passage.score)
    self.browser.append(passage.passage)
    self.browser.append(str(len(passage.paragraphs)))
    self.browser.append(
        "PARAGRAPHS---------------------------------------")
    for para in passage.paragraphs:
        self.browser.append(str(para.paragraphNo))
        self.browser.append(para.paragraph)
        for sent in para.sentences:
            self.browser.append(str(sent.sentenceNo))
            self.browser.append(str(sent.paragraphSentenceNo))
            self.browser.append(sent.sentence)
            # Parallel per-token attribute lists for display.
            tokens = [token.token for token in sent.tokens]
            tags = [token.pos for token in sent.tokens]
            lemmas = [token.lemma for token in sent.tokens]
            stems = [token.stem for token in sent.tokens]
            levels = [token.level for token in sent.tokens]
            nos = [token.tokenNo for token in sent.tokens]
            sentNos = [token.sentenceTokenNo for token in sent.tokens]
            paraNos = [token.paragraphTokenNo for token in sent.tokens]
            errorTokens = [
                token.token for token in sent.tokens if token.isSpellError
            ]
            if not sent.canParsed:
                self.browser.append(
                    "<font color=red>SENTENCE ERROR</font>")
            self.browser.append("<font color=red>SPELLERROR %s</font>" %
                                str(errorTokens))
            self.browser.append(str(tokens))
            self.browser.append(str(tags))
            self.browser.append(str(lemmas))
            self.browser.append(str(stems))
            self.browser.append(str(levels))
            self.browser.append(str(sentNos))
            self.browser.append(str(paraNos))
            self.browser.append(str(nos))
            self.browser.append(str(sent.tokenCount))
            self.browser.append(str(sent.wordCount))
            self.browser.append(str(sent.realWordCount))
    # u"三元词组" = "trigrams" (runtime string, left untranslated).
    self.browser.append(u"三元词组" + ' ' + str(passage.trigrams))
    e = FeatherExtractor()
    # Extract language features and print them (labels are Chinese:
    # token count, word count, word type count, lemma count,
    # preposition count/ratio/use, definite-article count/ratio/use).
    languageFeather = e.extractLangFeather(passage)
    print u"词次总数", languageFeather.tokenCount
    print u"单词总数", languageFeather.wordCount
    print u"词形总数", languageFeather.wordTypeCount
    print u"词元总数", languageFeather.wordLemmaCount
    print u"介词个数", languageFeather.prepositionCount
    print u"介词比例", languageFeather.prepositionRatio
    print u"介词使用", languageFeather.prepositionUse
    print u"定冠词个数", languageFeather.definiteArticleCount
    print u"定冠词比例", languageFeather.definiteArticleRatio
    print u"定冠词使用", languageFeather.definiteArticleUse
    # Extract structure features (disabled).
    #structureFeather = e.extractStructureFeather(passage)
    #generateUSTCFeathers('USTC2011Jan.txt', 'USTCFeathers_503.txt')
    print "...OVER"