def predict(self, passages): # ᅩ£고ᅰ for p in passages: if not p.preprocessed: essayprepare.processPassage(p) p.lf = self.extractor.extractLangFeather(p) p.cf = self.extractor.extractContentFeather(p) p.sf = self.extractor.extractStructureFeather(p) # ᅧ¦뽀ᅰᅱᄉ f = open('fs_test.txt', 'w') # 뽀ᅰᅬ endog = [] exog = [] labels = [] for p in passages: score = int(p.score) if score < 35: score = 35 endog.append(score) x = self.__getFeatherList(p) exog.append(x) labels.append(p.label) f.write(p.id + ' ') f.write(p.score) for v in x: f.write(' ' + str(v)) f.write('\n') f.close() p_label, p_acc, p_val = svmutil.svm_predict(labels, exog, self.svm_model) print p_label, p_acc, p_val
def rate_by_params(self, passage):
    """Linearly score *passage* using self.model_params, then apply
    heuristic adjustment filters.

    Side effects: sets passage.lf/cf/sf, passage.rateScore,
    passage.endogScore, passage.filter_scores and passage.rated.

    Returns a one-element list [adjusted score].
    """
    extractor = FeatherExtractor(None)
    if not passage.preprocessed:
        essayprepare.processPassage(passage)
    passage.lf = extractor.extractLangFeather(passage)
    passage.cf = extractor.extractContentFeather(passage)
    passage.sf = extractor.extractStructureFeather(passage)

    # Linear prediction: feature vector dotted with the trained params.
    # (Removed an unused local `exog = []` left over from the GLS variant.)
    x = self.__getFeatherList(passage)
    score = dot(x, self.model_params)
    passage.rateScore = score
    passage.endogScore = score

    # Adjust the raw score; record each filter's delta for diagnostics.
    passage.filter_scores = []
    adjusters = [self.tokenCountFilter,
                 self.sentenceLengthAverageFilter,
                 self.wordLengthAverageFilter,
                 self.aclWordCountFilter,
                 self.noneStopWordLengthAverageFilter,
                 self.nounRatioFilter]
    # Loop variable renamed from `filter` -- it shadowed the builtin.
    for adjuster in adjusters:
        delta = adjuster(passage)
        passage.rateScore += delta
        passage.filter_scores.append(delta)
    passage.rated = True
    return [passage.rateScore]
def rate_by_params(self, passage):
    """Linearly score *passage* using self.model_params, then apply
    heuristic adjustment filters (this variant also applies
    total_score_filter).

    Side effects: sets passage.lf/cf/sf, passage.rateScore,
    passage.endogScore, passage.filter_scores and passage.rated.

    Returns a one-element list [adjusted score].
    """
    extractor = FeatherExtractor(None)
    if not passage.preprocessed:
        essayprepare.processPassage(passage)
    passage.lf = extractor.extractLangFeather(passage)
    passage.cf = extractor.extractContentFeather(passage)
    passage.sf = extractor.extractStructureFeather(passage)

    # Linear prediction: feature vector dotted with the trained params.
    # (Removed an unused local `exog = []` left over from the GLS variant.)
    x = self.__getFeatherList(passage)
    score = dot(x, self.model_params)
    passage.rateScore = score
    passage.endogScore = score

    # Adjust the raw score; record each filter's delta for diagnostics.
    passage.filter_scores = []
    adjusters = [
        self.tokenCountFilter,
        self.sentenceLengthAverageFilter,
        self.wordLengthAverageFilter,
        self.aclWordCountFilter,
        self.noneStopWordLengthAverageFilter,
        self.nounRatioFilter,
        self.total_score_filter,
    ]
    # Loop variable renamed from `filter` -- it shadowed the builtin.
    for adjuster in adjusters:
        delta = adjuster(passage)
        passage.rateScore += delta
        passage.filter_scores.append(delta)
    passage.rated = True
    return [passage.rateScore]
def rate(self, passage):
    """Predict a score for *passage* with the fitted GLS model and
    adjust it with heuristic filters.

    Six filters both adjust the score and are recorded; the verb/adj/pos
    ratio filters are recorded in passage.filters but deliberately do
    NOT change the score (their additions were commented out in the
    original and that behavior is preserved).

    Returns a one-element list [adjusted score].
    """
    if not passage.preprocessed:
        essayprepare.processPassage(passage)
    passage.lf = self.extractor.extractLangFeather(passage)
    passage.cf = self.extractor.extractContentFeather(passage)
    passage.sf = self.extractor.extractStructureFeather(passage)

    # GLS prediction over a single-row feature matrix.
    exog = np.array([self.__getFeatherList(passage)])
    endog = self.gls_model.predict(exog)
    passage.rateScore = endog[0]
    passage.endogScore = endog[0]
    passage.filters = []

    # Filters that adjust the score (collapsed from six copy-pasted stanzas).
    scoring_filters = [self.tokenCountFilter,
                       self.sentenceLengthAverageFilter,
                       self.wordLengthAverageFilter,
                       self.aclWordCountFilter,
                       self.noneStopWordLengthAverageFilter,
                       self.nounRatioFilter]
    for flt in scoring_filters:
        delta = flt(passage)
        passage.rateScore += delta
        passage.filters.append(delta)

    # Diagnostic-only filters: recorded, but score adjustment disabled.
    for flt in [self.verbRatioFilter, self.adjRatioFilter, self.posRatioFilter]:
        passage.filters.append(flt(passage))

    passage.rated = True
    endog[0] = passage.rateScore
    return [passage.rateScore]
def train(self, passages): # pre-process passage i = 1 for p in passages: print "=======================" print "Passage", i, p.id if not p.preprocessed: essayprepare.processPassage(p) i += 1 self.extractor = FeatherExtractor(None) for p in passages: p.lf = self.extractor.extractLangFeather(p) p.cf = self.extractor.extractContentFeather(p) p.sf = self.extractor.extractStructureFeather(p) # save feathers f = open('fs_zhang_train.txt', 'w') for p in passages: x = self.__getFeatherList(p) f.write(p.id + ' ') f.write(str(p.score)) for v in x: f.write(' ' + str(v)) f.write('\n') f.close() # generate feather vector endog = [] exog = [] for p in passages: score = int(p.score) endog.append(score) x = self.__getFeatherList(p) exog.append(x) # train model endog = np.array(endog) exog = np.array(exog) self.gls_model = sm.GLS(endog, exog) results = self.gls_model.fit() #print results.summary() print results.params
def demo_one_sentence(): # 文章 passage = EssayPassage() passage.passage = 'I am a students.' passage.title = 'title' passage.score = 5 passage.id = '1' passage.reviewerId = 3 passage.content = 'I am a students.' # 处理文章 essayprepare.processPassage(passage) extractor = FeatherExtractor() lf = extractor.extractLangFeather(passage) passage.lf = lf cf = extractor.extractContentFeather(passage) sf = extractor.extractStructureFeather(passage) print 'OK'
def rate(self, passage):
    """GLS-based scorer that also surfaces LSA content metrics and
    applies the full set of adjustment filters.

    Side effects: sets passage.lf/cf/sf, the lsa* mirrors,
    passage.rateScore, passage.endogScore, passage.filter_scores,
    passage.rated, and calls self.generateRateResult(passage).

    Returns a one-element list [adjusted score].
    """
    if not passage.preprocessed:
        essayprepare.processPassage(passage)
    passage.lf = self.extractor.extractLangFeather(passage)
    passage.cf = self.extractor.extractContentFeather(passage)
    passage.sf = self.extractor.extractStructureFeather(passage)

    # Mirror the LSA content metrics onto the passage itself.
    passage.lsaScore = passage.cf.lsaScore
    passage.lsaSimilarity = passage.cf.lsaSimilarity
    passage.lsaSimilarityAll = passage.cf.lsaSimilarityAll

    # Single-row feature matrix -> GLS prediction.
    feature_matrix = np.array([self.__getFeatherList(passage)])
    endog = self.gls_model.predict(feature_matrix)
    passage.rateScore = endog[0]
    passage.endogScore = endog[0]

    # Apply every adjustment filter, recording each delta.
    passage.filter_scores = []
    for adjust in (self.tokenCountFilter,
                   self.sentenceLengthAverageFilter,
                   self.wordLengthAverageFilter,
                   self.aclWordCountFilter,
                   self.noneStopWordLengthAverageFilter,
                   self.nounRatioFilter,
                   self.verbRatioFilter,
                   self.adjRatioFilter,
                   self.posRatioFilter,
                   self.lsaFilter):
        delta = adjust(passage)
        passage.rateScore += delta
        passage.filter_scores.append(delta)

    self.generateRateResult(passage)
    passage.rated = True
    endog[0] = passage.rateScore
    return [passage.rateScore]
def generatePassageFeathers(passages, outFilename): f = open(outFilename, 'w') e = FeatherExtractor() i = 1 for p in passages: print "Passage ", i # 处理文章 essayprepare.processPassage(p) # 提取语言特征 languageFeather = e.extractLangFeather(p) p.lf = languageFeather # 提取结构特征 structureFeather = e.extractStructureFeather(p) p.sf = structureFeather f.write(p.id + ' ') f.write(str(p.score)) f.write(' ' + str(languageFeather)) f.write('\n') i += 1 f.close()
# Locate the essay with id "0092" in the loaded collection.
# NOTE(review): if no essay matches, `essay` is never bound and the code
# below raises NameError -- confirm the id always exists upstream.
for e in essays:
    if e.id == "0092":
        essay = e
        break

# Build an EssayPassage from the selected essay.
passage = EssayPassage()
passage.passage = essay.cleanContent()
passage.title = essay.title
passage.score = essay.score
passage.id = essay.id
passage.reviewerId = essay.reviewerId
passage.content = essay.content

# Run the preprocessing pipeline.
essayprepare.processPassage(passage)

# Dump the processed structure to stdout for manual inspection.
print "PASSAGE========================================="
print passage
print passage.id
print passage.title
print passage.score
print passage.passage
print len(passage.paragraphs)
print "PARAGRAPHS---------------------------------------"
for para in passage.paragraphs:
    print para.paragraphNo
    print para.paragraph
    for sent in para.sentences:
        print sent.sentenceNo
def do_task(task):
    """Process one scoring task end-to-end.

    Builds an EssayPassage from task['input']['content'], runs
    preprocessing, feature extraction and rating, then commits the JSON
    result.  On any failure the task is committed with status 'TUTERR'.
    """
    newpassage = EssayPassage()
    newpassage.passage = task['input']['content']
    newpassage.orderId = task['id']
    newpassage.score = 0
    newpassage.processStatus = 0
    try:
        essayprepare.processPassage(newpassage, fn_prepare_progress)
        fe = extractor.FeatherExtractor()
        lf = fe.extractLangFeather(newpassage)
        newpassage.lf = lf
        cf = fe.extractContentFeather(newpassage)
        newpassage.cf = cf
        sf = fe.extractStructureFeather(newpassage)
        newpassage.sf = sf
        newpassage.score = rater.rate_by_params(newpassage)[0]
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # SystemExit/KeyboardInterrupt.  Report the task as failed.
        task['progress'] = -2
        task['status'] = 'TUTERR'
        task['output'] = ""
        task['simple_output'] = ""
        task['detail_output'] = ""
        commit_task(task)
        return

    # Build the final JSON payload (original comment translated).
    output = {}
    passage = {}
    passage['score'] = newpassage.score
    passage['token_count'] = lf.tokenCount
    passage['word_count'] = lf.wordCount
    passage['word_type_count'] = lf.wordTypeCount
    passage['word_lemma_count'] = lf.wordLemmaCount
    passage['word_stem_count'] = lf.wordStemCount
    passage['average_word_length'] = lf.wordLengthAverage
    passage['average_sentence_length'] = lf.sentenceLengthAverage
    passage['overly_use_word_count'] = lf.overlyUseWordCount
    passage['paragraph_count'] = len(newpassage.paragraphs)
    passage['sentence_count'] = newpassage.sentenceCount
    passage['sentences'] = []
    for para in newpassage.paragraphs:
        for sent in para.sentences:
            sentence = {}
            sentence['no'] = sent.sentenceNo
            sentence['para_no'] = para.paragraphNo
            sentence['original'] = sent.sentence
            sentence['score'] = 0
            spell_errors = []
            fs = []
            for token in sent.tokens:
                if token.isSpellError:
                    # Wrap misspellings so the client can highlight them.
                    fs.append('<ESP>' + token.token + '</ESP>')
                    spell_error = {}
                    spell_error['token'] = token.token
                    spell_error['lemma'] = token.lemma
                    spell_error['suggest'] = token.candidates
                    spell_error['start_at'] = token.startAt
                    spell_error['end_at'] = token.endAt
                    spell_errors.append(spell_error)
                else:
                    fs.append(token.token)
            sentence['spell_errors'] = spell_errors
            sentence['marked'] = ' '.join(fs)
            sentence['lt_result'] = sent.ltCheckResults
            sentence['lg_result'] = sent.lgCheckResults
            sentence['links'] = []
            passage['sentences'].append(sentence)
    output['passage'] = passage

    task['progress'] = 100
    task['status'] = 'DONE'
    task['output'] = json.dumps(output)
    task['simple_output'] = json.dumps(output)
    task['detail_output'] = json.dumps(generate_detail_output(newpassage))
    commit_task(task)
def processEssay(self):
    """Look up the essay whose id is typed in the line edit, run the
    preprocessing pipeline on it, and dump the processed structure and
    language features into the browser widget / stdout.
    """
    self.browser.clear()
    # NOTE(review): `id` shadows the builtin; left unchanged here.
    id = unicode(self.lineedit.text())
    essay = self.essayDict.get(id)
    if not essay:
        self.browser.append("<font color=red>%s is not found!</font>" % id)
        return
    self.browser.append(essay.content)

    # Build an EssayPassage from the essay.
    passage = EssayPassage()
    passage.passage = essay.cleanContent()
    passage.title = essay.title
    passage.score = essay.score
    passage.id = essay.id

    # Run the preprocessing pipeline.
    essayprepare.processPassage(passage)

    # Dump the processed structure for manual inspection.
    self.browser.append("PASSAGE=========================================")
    self.browser.append(passage.id)
    #self.browser.append(passage.title)
    self.browser.append(passage.score)
    self.browser.append(passage.passage)
    self.browser.append(str(len(passage.paragraphs)))
    self.browser.append("PARAGRAPHS---------------------------------------")
    for para in passage.paragraphs:
        self.browser.append(str(para.paragraphNo))
        self.browser.append(para.paragraph)
        for sent in para.sentences:
            self.browser.append(str(sent.sentenceNo))
            self.browser.append(str(sent.paragraphSentenceNo))
            self.browser.append(sent.sentence)
            # Per-token attribute lists for side-by-side display.
            tokens = [token.token for token in sent.tokens]
            tags = [token.pos for token in sent.tokens]
            lemmas = [token.lemma for token in sent.tokens]
            stems = [token.stem for token in sent.tokens]
            levels = [token.level for token in sent.tokens]
            nos = [token.tokenNo for token in sent.tokens]
            sentNos = [token.sentenceTokenNo for token in sent.tokens]
            paraNos = [token.paragraphTokenNo for token in sent.tokens]
            errorTokens = [token.token for token in sent.tokens
                           if token.isSpellError]
            if not sent.canParsed:
                self.browser.append("<font color=red>SENTENCE ERROR</font>")
            # NOTE(review): the spell-error line is emitted unconditionally,
            # even when errorTokens is empty.
            self.browser.append("<font color=red>SPELLERROR %s</font>"
                                % str(errorTokens))
            self.browser.append(str(tokens))
            self.browser.append(str(tags))
            self.browser.append(str(lemmas))
            self.browser.append(str(stems))
            self.browser.append(str(levels))
            self.browser.append(str(sentNos))
            self.browser.append(str(paraNos))
            self.browser.append(str(nos))
            self.browser.append(str(sent.tokenCount))
            self.browser.append(str(sent.wordCount))
            self.browser.append(str(sent.realWordCount))

    # Passage-level trigram dump (the u"..." label means "trigrams").
    self.browser.append(u"三元词组" + ' ' + str(passage.trigrams))

    # Extract and print language features to stdout (labels are Chinese:
    # token/word/word-type/word-lemma counts, preposition and definite
    # article count/ratio/use).
    e = FeatherExtractor()
    languageFeather = e.extractLangFeather(passage)
    print u"词次总数", languageFeather.tokenCount
    print u"单词总数", languageFeather.wordCount
    print u"词形总数", languageFeather.wordTypeCount
    print u"词元总数", languageFeather.wordLemmaCount
    print u"介词个数", languageFeather.prepositionCount
    print u"介词比例", languageFeather.prepositionRatio
    print u"介词使用", languageFeather.prepositionUse
    print u"定冠词个数", languageFeather.definiteArticleCount
    print u"定冠词比例", languageFeather.definiteArticleRatio
    print u"定冠词使用", languageFeather.definiteArticleUse

    # Structure-feature extraction is currently disabled.
    #structureFeather = e.extractStructureFeather(passage)
    #generateUSTCFeathers('USTC2011Jan.txt', 'USTCFeathers_503.txt')
    print "...OVER"
def processEssay(self):
    """Look up the essay whose id is typed in the line edit, run the
    preprocessing pipeline on it, and dump the processed structure and
    language features into the browser widget / stdout.

    NOTE(review): this is a near-identical duplicate of the other
    processEssay in this file -- consider consolidating.
    """
    self.browser.clear()
    # NOTE(review): `id` shadows the builtin; left unchanged here.
    id = unicode(self.lineedit.text())
    essay = self.essayDict.get(id)
    if not essay:
        self.browser.append("<font color=red>%s is not found!</font>" % id)
        return
    self.browser.append(essay.content)

    # Build an EssayPassage from the essay.
    passage = EssayPassage()
    passage.passage = essay.cleanContent()
    passage.title = essay.title
    passage.score = essay.score
    passage.id = essay.id

    # Run the preprocessing pipeline.
    essayprepare.processPassage(passage)

    # Dump the processed structure for manual inspection.
    self.browser.append("PASSAGE=========================================")
    self.browser.append(passage.id)
    #self.browser.append(passage.title)
    self.browser.append(passage.score)
    self.browser.append(passage.passage)
    self.browser.append(str(len(passage.paragraphs)))
    self.browser.append(
        "PARAGRAPHS---------------------------------------")
    for para in passage.paragraphs:
        self.browser.append(str(para.paragraphNo))
        self.browser.append(para.paragraph)
        for sent in para.sentences:
            self.browser.append(str(sent.sentenceNo))
            self.browser.append(str(sent.paragraphSentenceNo))
            self.browser.append(sent.sentence)
            # Per-token attribute lists for side-by-side display.
            tokens = [token.token for token in sent.tokens]
            tags = [token.pos for token in sent.tokens]
            lemmas = [token.lemma for token in sent.tokens]
            stems = [token.stem for token in sent.tokens]
            levels = [token.level for token in sent.tokens]
            nos = [token.tokenNo for token in sent.tokens]
            sentNos = [token.sentenceTokenNo for token in sent.tokens]
            paraNos = [token.paragraphTokenNo for token in sent.tokens]
            errorTokens = [
                token.token for token in sent.tokens if token.isSpellError
            ]
            if not sent.canParsed:
                self.browser.append(
                    "<font color=red>SENTENCE ERROR</font>")
            # NOTE(review): the spell-error line is emitted unconditionally,
            # even when errorTokens is empty.
            self.browser.append("<font color=red>SPELLERROR %s</font>"
                                % str(errorTokens))
            self.browser.append(str(tokens))
            self.browser.append(str(tags))
            self.browser.append(str(lemmas))
            self.browser.append(str(stems))
            self.browser.append(str(levels))
            self.browser.append(str(sentNos))
            self.browser.append(str(paraNos))
            self.browser.append(str(nos))
            self.browser.append(str(sent.tokenCount))
            self.browser.append(str(sent.wordCount))
            self.browser.append(str(sent.realWordCount))

    # Passage-level trigram dump (the u"..." label means "trigrams").
    self.browser.append(u"三元词组" + ' ' + str(passage.trigrams))

    # Extract and print language features to stdout (labels are Chinese:
    # token/word/word-type/word-lemma counts, preposition and definite
    # article count/ratio/use).
    e = FeatherExtractor()
    languageFeather = e.extractLangFeather(passage)
    print u"词次总数", languageFeather.tokenCount
    print u"单词总数", languageFeather.wordCount
    print u"词形总数", languageFeather.wordTypeCount
    print u"词元总数", languageFeather.wordLemmaCount
    print u"介词个数", languageFeather.prepositionCount
    print u"介词比例", languageFeather.prepositionRatio
    print u"介词使用", languageFeather.prepositionUse
    print u"定冠词个数", languageFeather.definiteArticleCount
    print u"定冠词比例", languageFeather.definiteArticleRatio
    print u"定冠词使用", languageFeather.definiteArticleUse

    # Structure-feature extraction is currently disabled.
    #structureFeather = e.extractStructureFeather(passage)
    #generateUSTCFeathers('USTC2011Jan.txt', 'USTCFeathers_503.txt')
    print "...OVER"
def train(self, passages):
    """Train the rater.

    Preprocesses passages, builds an EssayModel over the score-sorted
    passages, extracts features with a model-aware extractor, dumps the
    feature vectors to 'fs_train.txt', then fits the GLS regression.
    (SVM training remains disabled, as in the original.)
    """
    # Preprocess each passage once.
    for p in passages:
        if not p.preprocessed:
            essayprepare.processPassage(p)

    # Sort by score, highest first, before building the essay model.
    passages.sort(cmp=lambda x, y: cmp(x.score, y.score), reverse=True)
    model = EssayModel()
    model.train(passages)
    self.models['1'] = model

    # Extract the three feature groups with a model-aware extractor.
    self.extractor = FeatherExtractor(model)
    for p in passages:
        p.lf = self.extractor.extractLangFeather(p)
        p.cf = self.extractor.extractContentFeather(p)
        p.sf = self.extractor.extractStructureFeather(p)

    # Build regression inputs and dump the feature vectors.
    endog = []
    exog = []
    labels = []
    f = open('fs_train.txt', 'w')
    try:
        for p in passages:
            score = int(p.score)
            # Score clamping is disabled in this variant:
            # if score > 90: score = 90
            # if score < 35: score = 35
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)
            labels.append(p.label)
            f.write(p.id + ' ')
            f.write(str(p.score))
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
    finally:
        # Close the dump file even if feature listing fails.
        f.close()

    # SVM classifier training is currently disabled.
    #self.svm_model = svmutil.svm_train(labels, exog, '-c 3')

    # Fit the linear (GLS) regression model.  (A large block of
    # commented-out PCA experimentation was removed here.)
    self.gls_model = sm.GLS(np.array(endog), np.array(exog))
    self.gls_model.fit()
def rate(self, passage):
    """GLS prediction followed by hand-tuned heuristic penalties/bonuses.

    The original comments in this block were mojibake; the English
    comments below describe what each section's code actually does.
    Returns a one-element list [adjusted score].
    """
    # Linear prediction (mojibake comment; the code is a GLS predict).
    if not passage.preprocessed:
        essayprepare.processPassage(passage)
    passage.lf = self.extractor.extractLangFeather(passage)
    passage.cf = self.extractor.extractContentFeather(passage)
    passage.sf = self.extractor.extractStructureFeather(passage)
    exog = []
    x = self.__getFeatherList(passage)
    exog.append(x)
    # for i, xx in enumerate(x):
    #     x[i] -= self.m[i]
    exog = np.array(exog)
    # xxexog = dot(self.p, exog.transpose())
    # endog = self.gls_model.predict(xxexog.transpose())
    endog = self.gls_model.predict(exog)
    passage.rateScore = endog[0]
    passage.endogScore = endog[0]

    # Length penalty: very short essays are scaled down.
    if (passage.lf.tokenCount < 100):
        passage.rateScore *= 0.8
    elif passage.lf.tokenCount < 120:
        passage.rateScore *= 0.9

    # Average sentence length: penalize too short (capped at 6) or too
    # long (capped at 9).  `filter` shadows the builtin throughout.
    filter = 0
    slv = passage.lf.sentenceLengthAverage
    if (slv < 10):
        filter = (10 - slv) * 2
        if filter > 6:
            filter = 6
    elif slv > 23:
        filter = (slv - 23) * 3
        if filter > 9:
            filter = 9
    passage.rateScore -= filter

    # Average word length: penalize short average word length.
    filter = 0
    wlv = passage.lf.wordLengthAverage
    if wlv < 4:
        filter = (4 - wlv) * 10
    passage.rateScore -= filter

    # Average non-stopword length: same idea with a higher floor.
    filter = 0
    rwlv = passage.lf.noneStopWordLengthAverage
    if rwlv < 5.5:
        filter = (5.5 - rwlv) * 10
    passage.rateScore -= filter

    # Part-of-speech ratio penalties: noun ratio outside [0.2, 0.35].
    filter = 0
    nr = passage.lf.nounRatio
    if nr < 0.2:
        filter = (0.2 - nr) * 100
    elif nr > 0.35:
        filter = (nr - 0.35) * 100
    passage.rateScore -= filter

    # Verb ratio outside [0.1, 0.2].
    filter = 0
    vr = passage.lf.verbRatio
    if vr < 0.1:
        filter = (0.1 - vr) * 200
    elif vr > 0.2:
        filter = (vr - 0.2) * 200
    passage.rateScore -= filter

    # Adjective ratio below 0.045.
    filter = 0
    ar = passage.lf.adjRatio
    if ar < 0.045:
        filter = (0.045 - ar) * 500
    passage.rateScore -= filter

    # Combined POS-ratio health check: count out-of-band ratios and, for
    # in-band ones, accumulate how far they sit from the band center.
    # Note the noun band here is (0.2, 0.3], narrower than above.
    filter = 0
    badRatioCount = 0
    offsetRatio = 0
    if (nr < 0.2) or (nr > 0.3):
        badRatioCount += 1
    else:
        offsetRatio += abs(nr - 0.25) / 0.1
    if (vr < 0.1) or (vr > 0.2):
        badRatioCount += 1
    else:
        offsetRatio += abs(vr - 0.15) / 0.1
    if (ar < 0.06) or (ar > 0.13):
        badRatioCount += 1
    else:
        offsetRatio += abs(ar - 0.095) / 0.14
    # All ratios healthy and centered: +5% bonus.  One bad and the rest
    # far off-center: -5%.  Multiple bad: quadratic -2%*count^2 penalty.
    if badRatioCount == 0:
        if offsetRatio < 0.1:
            filter = passage.rateScore * 0.05
    elif badRatioCount == 1:
        if offsetRatio > 0.6:
            filter = - passage.rateScore * 0.05
    elif badRatioCount > 1:
        filter = - passage.rateScore * 0.02 * badRatioCount * badRatioCount
    passage.rateScore += filter
    passage.offsetRatio = offsetRatio

    # LSA-based adjustments: boost strong, on-topic high scorers; dock
    # weak ones; add a similarity-band bonus for mid-range similarity.
    if (passage.cf.lsaScore > 75) and (passage.cf.lsaSimilarity > 89) and (passage.rateScore > 75):
        passage.rateScore += 5
    if ((passage.cf.lsaScore < 70) and (passage.rateScore < 70)) and (passage.cf.lsaSimilarity > 89):
        passage.rateScore -= 5
    filter = 0
    if ((passage.cf.lsaSimilarity <= 80) and (passage.cf.lsaSimilarity > 60)) or ((passage.cf.lsaSimilarityAll <= 56) and (passage.cf.lsaSimilarityAll > 32)):
        filter = (15 - abs(passage.cf.lsaSimilarity - 70) / 3.0)
    # if passage.rateScore < passage.cf.lsaScore:
    #     passage.rateScore = passage.cf.lsaScore
    passage.rateScore += filter

    self.generateRateResult(passage)
    passage.rated = True
    endog[0] = passage.rateScore
    return [passage.rateScore]
def train(self, passages):
    """Train the rater (variant that clamps scores below 40 up to 40).

    Preprocesses passages, builds an EssayModel over the score-sorted
    passages, extracts features with a model-aware extractor, dumps the
    feature vectors to 'fs_train.txt', then fits the GLS regression.
    The original comments in this block were mojibake and have been
    replaced with English ones describing the code.
    """
    # Preprocess each passage once.
    for p in passages:
        if not p.preprocessed:
            essayprepare.processPassage(p)

    # Sort by score, highest first, before building the essay model.
    passages.sort(cmp=lambda x, y: cmp(x.score, y.score), reverse=True)
    model = EssayModel()
    model.train(passages)
    self.models['1'] = model

    # Extract the three feature groups with a model-aware extractor.
    self.extractor = FeatherExtractor(model)
    for p in passages:
        p.lf = self.extractor.extractLangFeather(p)
        p.cf = self.extractor.extractContentFeather(p)
        p.sf = self.extractor.extractStructureFeather(p)

    # Build regression inputs and dump the feature vectors.
    endog = []
    exog = []
    labels = []
    f = open('fs_train.txt', 'w')
    try:
        for p in passages:
            score = int(p.score)
            #if score > 95: score = 95
            # Clamp very low scores up to 40 in this variant.
            if score < 40:
                score = 40
            endog.append(score)
            x = self.__getFeatherList(p)
            exog.append(x)
            labels.append(p.label)
            f.write(p.id + ' ')
            f.write(str(p.score))
            for v in x:
                f.write(' ' + str(v))
            f.write('\n')
    finally:
        # Close the dump file even if feature listing fails.
        f.close()

    # SVM classifier training is currently disabled.
    #self.svm_model = svmutil.svm_train(labels, exog, '-c 3')

    # Fit the linear (GLS) regression model.  (A large block of
    # commented-out PCA experimentation was removed here.)
    self.gls_model = sm.GLS(np.array(endog), np.array(exog))
    self.gls_model.fit()