def judge(self, fList, _1List):
    """Record whether the predicted tone is right and flag special faults.

    Args:
        fList: feature tuples; element ``[1]`` of each tuple is passed
            through ``no_abc123`` to obtain the feature word.
        _1List: feature words that occurred only once in training.

    Returns:
        ``self.correctList1 + self.correctList2 + self.spFault``.
        Index 0 of the two ``correctList*`` lists is set to 1 for a correct
        prediction and index 1 for a wrong one; ``correctList2`` is only
        set when at least one feature word is absent from ``_1List``.
    """
    words = [no_abc123(item[1]) for item in fList]

    # Slot 0 = prediction matches the reference tone, slot 1 = it does not.
    slot = 0 if self.predictone == self.tone else 1
    self.correctList1[slot] = 1

    # Second tally: only counts when some feature word is not a once-only word.
    for word in words:
        if word not in _1List:
            self.correctList2[slot] = 1
            break

    # Special faults.
    # NOTE(review): these checks are placed in the misprediction branch; the
    # whitespace-collapsed source does not show their nesting - confirm.
    if slot == 1:
        gap_to_best = max(self.score) - self.score[self.toneNum]
        if gap_to_best < sum(self.score) / 5:
            self.spFault[0] = 1
        if self.rank > 3:
            self.spFault[1] = 1

    return self.correctList1 + self.correctList2 + self.spFault
def _precision_percentages(correct, wrong):
    """Return per-score precision as '.3f' percentage strings.

    Falls back to ``[0, 0, 0]`` when a score has no judged sentence
    (zero denominator).
    """
    try:
        return [
            format(round(t / (t + f), 5) * 100, '.3f')
            for t, f in zip(correct, wrong)
        ]
    except ZeroDivisionError:
        return [0, 0, 0]


def _grab_feature(content, grabMethod, readName, posi):
    """Run the feature extractor selected by ``grabMethod`` on ``content``."""
    if grabMethod == 'nb':
        return content.nb(*posi)
    if grabMethod == 'allhead':
        return content.all_head(*posi)
    if grabMethod == 'treenb':
        # The layer number is the text between the first '(' and ')' of the
        # training file name.
        layerNum = readName.split('(')[1].split(')')[0]
        return content.tree_nb(int(layerNum), *posi)
    raise ValueError(f'unknown grab method: {grabMethod!r}')


def _write_special_fault(report, label, fileNum, num, tone, predicted, feature):
    """Append one special-fault record to ``report``."""
    report.write(label + ' ' + fileNum.ljust(7) + num.ljust(7))
    report.write(tone.ljust(8) + predicted.ljust(8))
    report.write(str(feature).ljust(30) + '\n')


def testing(readPath, writePath, readName):
    """Score test sentences against a training file and write the reports.

    The training file ``readPath/readName`` is loaded into three score tables
    (ratio, raw count, log2 of count). For every selected sentence of every
    file in the module-level ``trainNum``, features are extracted, their
    scores are summed per table, and each sum is judged with
    ``scorecalcu.Score``. Results go to ``<stem>(test).txt`` and the two
    special-fault reports ``<stem>(testRp1).txt`` / ``<stem>(testRp2).txt``
    inside ``writePath``, where ``<stem>`` is ``readName`` without its last
    10 characters.

    Relies on module-level names ``trainNum``, ``start``, ``end``, ``Dic`` and
    ``dictswitch``.

    Args:
        readPath: directory containing the training file.
        writePath: directory receiving the three output files.
        readName: training file name; its stem is split on '_' into grab
            method (optionally followed by '(layer)'), cts method and
            position.

    Returns:
        Tuple ``(cnt, noFind, inappli, applicRate, prec, prec1)``: sentence
        count, sentences without features, sentences whose ratio score is all
        zero, applicability percentage string, and the precision lists with
        and without the ``correctList2`` restriction. ``prec`` and ``prec1``
        each fall back to ``[0, 0, 0]`` on a zero denominator; ``applicRate``
        is ``'0.000'`` when no sentence was processed.

    Raises:
        ValueError: if the grab method parsed from ``readName`` is not one of
            'nb', 'allhead', 'treenb' (raised on the first sentence).
    """
    # Names derived from the training file name.
    stem = readName[:-10]
    writeName = stem + '(test).txt'
    writeNameRp1 = stem + '(testRp1).txt'
    writeNameRp2 = stem + '(testRp2).txt'
    nameParts = stem.split('_')
    ctsMethod = nameParts[1]
    grabMethod = nameParts[0].split('(')[0]
    posi = [int(nameParts[2])]

    # Counters.
    trainingDict = {}
    _1List = []  # words that occur only once in training
    cnt = 0
    v = [0, 0, 0]  # correct, per score table
    x = [0, 0, 0]  # wrong, per score table
    noFind = 0
    v1 = [0, 0, 0]  # correct, counted only when judge sets correctList2
    x1 = [0, 0, 0]  # wrong, counted only when judge sets correctList2
    inappli = 0  # inapplicable sentences

    # Load training data.
    # NOTE(review): opened without an explicit encoding, unlike every other
    # file in this function - confirm this is intended.
    with open(os.path.join(readPath, readName), 'r') as fr:
        for word in fr:
            fields = word.split()
            index = fields[0]
            freq = int(fields[1])
            rawScore = word.split('[')[1].split(']')[0].split(', ')
            scoreList2 = [int(n) for n in rawScore]  # counts
            scoreList1 = [n / freq for n in scoreList2]  # ratio
            scoreList3 = [round(math.log2(1 + n), 2) for n in scoreList2]  # log of counts
            trainingDict[index] = scoreList1, scoreList2, scoreList3
            if freq == 1:
                _1List.append(index)

    syn = syndict.syn_dict(trainingDict, dic=Dic)

    # The context manager guarantees the three outputs are closed even when a
    # sentence fails to parse.
    with open(os.path.join(writePath, writeName), 'w', encoding='utf8') as fw, \
            open(os.path.join(writePath, writeNameRp1), 'w', encoding='utf8') as fw_report1, \
            open(os.path.join(writePath, writeNameRp2), 'w', encoding='utf8') as fw_report2:
        for fileNum in trainNum:
            # Reference tones: records of 4 lines, id on the 3rd line and the
            # tone as second token of the 4th.
            with open(f'../data/input/{fileNum}out.txt', 'r', encoding='utf8') as fr1:
                toneFile = fr1.readlines()
            toneDict = {
                toneFile[2 + 4 * k].split()[0]: toneFile[3 + 4 * k].split()[1]
                for k in range(len(toneFile) // 4)
            }

            # Test sentences: keep only the slice selected by start/end (tenths).
            with open(f'../data/grablist/input/{fileNum}{ctsMethod}.txt', 'r',
                      encoding='utf8') as fr2:
                lines = fr2.readlines()
            lines = lines[int(len(lines) * start / 10) + 1:int(len(lines) * end / 10)]

            for line in lines:
                cnt += 1
                t_score1 = [0, 0, 0, 0, 0, 0, 0, 0]
                t_score2 = [0, 0, 0, 0, 0, 0, 0, 0]
                t_score3 = [0, 0, 0, 0, 0, 0, 0, 0]
                fields = line.split()
                num = fields[0]  # @num
                st = fields[1]
                # Reference tone; None when num is missing from toneDict, in
                # which case the writes below raise TypeError.
                tone = toneDict.get(num)

                fw.write('\n' + fileNum.ljust(7) + num.ljust(7) + '\n' + st + '\n')
                feature = _grab_feature(grab.Grab(st), grabMethod, readName, posi)

                # The method produced no feature word.
                if feature == []:
                    noFind += 1
                    fw.write('[]\n')
                    for _ in range(3):
                        fw.write('no feature'.ljust(50) + tone + '\n')
                    continue

                fw.write(str(feature) + '\n')
                for tp in feature:  # tuple such as (1, 'DET:一')
                    ch = no_abc123(tp[1])
                    try:
                        sc1 = syn.get_score(ch, scoreNum=1, dictSwitch=dictswitch)
                        t_score1 = [
                            round(a + b, 2) for a, b in zip(sc1, t_score1)
                        ]
                        sc2 = syn.get_score(ch, scoreNum=2, dictSwitch=dictswitch)
                        t_score2 = [a + b for a, b in zip(sc2, t_score2)]
                        sc3 = syn.get_score(ch, scoreNum=3, dictSwitch=dictswitch)
                        t_score3 = [a + b for a, b in zip(sc3, t_score3)]
                    except Exception:
                        # Best effort: a word that cannot be scored is skipped
                        # (tables already updated for this word stay updated).
                        pass

                # Inapplicable: the ratio score is all zeros.
                if t_score1 == [0, 0, 0, 0, 0, 0, 0, 0]:
                    inappli += 1
                    for _ in range(3):
                        fw.write('inapplicable'.ljust(50) + tone + '\n')
                    continue

                # Judge each of the three score tables in turn.
                tables = (('s1', t_score1), ('s2', t_score2), ('s3', t_score3))
                for slot, (label, totals) in enumerate(tables):
                    result = scorecalcu.Score(totals, tone)
                    result.judge(feature, _1List)
                    v[slot] += result.correctList1[0]
                    x[slot] += result.correctList1[1]
                    v1[slot] += result.correctList2[0]
                    x1[slot] += result.correctList2[1]
                    if result.spFault[0] == 1:  # first kind of special fault
                        _write_special_fault(fw_report1, label, fileNum, num,
                                             tone, result.predictone, feature)
                    if result.spFault[1] == 1:  # second kind of special fault
                        _write_special_fault(fw_report2, label, fileNum, num,
                                             tone, result.predictone, feature)
                    fw.write(
                        str(totals).ljust(50) + tone.ljust(8) + str(result.rank)
                        + '/' + str(result.allrank) + '\n')

        # The two precision lists fall back independently, so an empty
        # restricted tally does not erase the unrestricted precision.
        prec = _precision_percentages(v, x)
        prec1 = _precision_percentages(v1, x1)
        if cnt:
            applicRate = format(
                round((cnt - noFind - inappli) / cnt, 5) * 100, '.3f')
        else:
            applicRate = format(0.0, '.3f')  # no sentence processed

        fw.write(
            f'\n\n共{cnt}句, no feature:{noFind}, inapplicable:{inappli}, applicability:{applicRate}%\n'
        )
        for i in range(3):
            fw.write(f'\nS{i + 1} 正確:{v[i]}, 錯誤:{x[i]}, precision:{prec[i]}%\n')
            fw.write(f'不算詞頻為1的特徵詞 正確:{v1[i]} 錯誤:{x1[i]} precision:{prec1[i]}%\n')

    return cnt, noFind, inappli, applicRate, prec, prec1
def combinefiles(fileDatas, CoefTupleOfData):
    """Merge several per-method result files into one weighted result.

    Each element of ``fileDatas`` is the list of lines of one result file,
    laid out in records of 7 lines (blank, '<file> @<num>' header, sentence,
    feature list, then the three score lines). For every record except the
    last group of 7 lines, the three scores of all files are averaged with
    the weights in ``CoefTupleOfData`` and judged with ``score.Score``.

    Args:
        fileDatas: list of line lists, one per input file; all of equal length.
        CoefTupleOfData: one weight per element of ``fileDatas``.

    Returns:
        Tuple ``(cnt, noFind, inappli, applicRate, prec, output_text,
        spfault1, spfault2)``: record count, records without features,
        records whose weighted count score is all zero, applicability
        percentage string ('0.000' when there are no records), precision
        list (``[0, 0, 0]`` on a zero denominator), the merged output lines,
        and the two lists of special-fault records.

    Raises:
        TypeError: if the number of weights differs from the number of files,
            or the files have different line counts.
        ValueError: if a record header's second token does not start with '@'.
    """
    output_text = []  # lines of the merged output
    spfault1 = []
    spfault2 = []
    v = [0, 0, 0]
    x = [0, 0, 0]
    noFind = 0
    inappli = 0

    def check_num_of_coef(fileDatas, CoefTupleOfData):
        if len(CoefTupleOfData) != len(fileDatas):
            raise TypeError('''Numbers of coefficients are not
                            equal to number of file datas!''')

    # All files must have the same number of lines.
    def check_lineno(fileDatas):
        if len({len(fileData) for fileData in fileDatas}) > 1:
            raise TypeError('''amount of sentence in file1 and 2
                            is inequality!''')

    # Parse one score line into (score list, reference tone).
    def str2int_scoreList(string):
        tokens = string.split()
        if tokens[0] == 'no':  # 'no feature <tone>'
            return [0, 0, 0, 0, 0, 0, 0, 0], tokens[2]
        if tokens[0] == 'inapplicable':  # 'inapplicable <tone>'
            return [0, 0, 0, 0, 0, 0, 0, 0], tokens[1]
        values = string.split('[')[1].split(']')[0].split(', ')
        values = [float(item) for item in values]
        tone = string.split('] ')[1].split()[0]  # reference answer
        return values, tone

    # Weighted average of the per-file score lists.
    def sum_of_score(scoreList):
        sumList = [0, 0, 0, 0, 0, 0, 0, 0]
        for sc, r in zip(scoreList, CoefTupleOfData):
            weighted = [n * r for n in sc]
            sumList = [a + b for a, b in zip(weighted, sumList)]
        coefTotal = sum(CoefTupleOfData)
        return [round(s / coefTotal, 2) for s in sumList]

    check_num_of_coef(fileDatas, CoefTupleOfData)
    check_lineno(fileDatas)

    amountOfLines = int(len(fileDatas[0]) / 7)
    dataSample = fileDatas[0]  # first file supplies header and sentence lines
    for i in range(amountOfLines - 1):
        base = i * 7
        header = dataSample[base + 1]  # '199x @oo'
        if header.split()[1][0] != '@':
            raise ValueError('data start from not "@"!')
        output_text.append('\n')
        output_text.append(header)
        output_text.append(dataSample[base + 2])  # sentence

        # Concatenate the feature text of every file (trailing newline of
        # each line stripped) into one line.
        feature = ''.join(fileData[base + 3][:-1] for fileData in fileDatas)
        feature += '\n'
        output_text.append(feature)

        # Collect the three score lists of every file; `tone` ends up as the
        # tone parsed from the last score line of the last file.
        s1List = []
        s2List = []
        s3List = []
        for fileData in fileDatas:
            tempScore, tone = str2int_scoreList(fileData[base + 4])
            s1List.append(tempScore)
            tempScore, tone = str2int_scoreList(fileData[base + 5])
            s2List.append(tempScore)
            tempScore, tone = str2int_scoreList(fileData[base + 6])
            s3List.append(tempScore)

        # No feature in any file.
        if no_abc123(feature) == '':
            noFind += 1
            for _ in range(3):
                output_text.append('no feature'.ljust(50) + tone.ljust(8) + '\n')
            continue

        sum1 = sum_of_score(s1List)
        sum2 = sum_of_score(s2List)
        sum3 = sum_of_score(s3List)

        # Inapplicable: the weighted count score is all zeros.
        if sum2 == [0, 0, 0, 0, 0, 0, 0, 0]:
            inappli += 1
            for _ in range(3):
                output_text.append('inapplicable'.ljust(50) + tone + '\n')
            continue

        tables = (('s1', sum1), ('s2', sum2), ('s3', sum3))
        for slot, (label, total) in enumerate(tables):
            result = score.Score(total, tone)
            # judge2 is evaluated once per score object.
            # NOTE(review): assumes judge2 is idempotent - confirm.
            verdict = result.judge2()
            v[slot] += verdict[0]
            x[slot] += verdict[1]
            record = (label + ' ' + header[:-1] + result.tone.ljust(8)
                      + result.predictone.ljust(8) + str(feature).ljust(30)
                      + '\n')
            if verdict[2] == 1:  # special fault 1
                spfault1.append(record)
            if verdict[3] == 1:  # special fault 2
                spfault2.append(record)
            output_text.append(
                str(result.score).ljust(50) + tone.ljust(8)
                + f'{result.rank}/{result.allrank}\n')

    cnt = amountOfLines - 1
    try:
        prec = [
            format(round(t / (t + f), 5) * 100, '.3f') for t, f in zip(v, x)
        ]
    except ZeroDivisionError:
        prec = [0, 0, 0]
    if cnt:
        applicRate = format(round((cnt - noFind - inappli) / cnt, 5) * 100, '.3f')
    else:
        applicRate = format(0.0, '.3f')  # no record to rate
    output_text.append(
        f'\n\n共{cnt}句, no feature:{noFind}, inapplicable:{inappli}, applicability:{applicRate}%\n'
    )
    for i in range(3):
        output_text.append(
            f'\nS{i + 1} 正確:{v[i]}, 錯誤:{x[i]}, precision:{prec[i]}%\n')
        output_text.append('')
    print('finish!!')
    return cnt, noFind, inappli, applicRate, prec, output_text, spfault1, spfault2
num = line.split()[0] # @num st = line.split()[1] content = grab.Grab(st) if grabMethod == grabMethodList[0]: feature = content.nb(*posi) elif grabMethod == grabMethodList[1]: feature = content.all_head(*posi) elif grabMethod == grabMethodList[2]: if layerNum == '': layerNum = input('Please input layerNum:') feature = content.tree_nb(int(layerNum), *posi) layerNumStr = '(' + str(layerNum) + ')' for tp in feature: # tuple:(1, 'DET:一')7 position = tp[0] ch = no_abc123(tp[1]) dataDict[ch].append((fileNum, num, position)) sortDataList = sorted(dataDict.items(), key=lambda w: len(w[1]), reverse=True) # 排序字典 with open( f'../data/grablist/output/{grabMethod}{layerNumStr}_{ctsMethod}_{posiName}.txt', 'w', encoding='utf8') as fw: fw.write(str(posi) + '\n') for word in sortDataList: w = word[0] l = len_ch(w) # 計算中文字數 w = w.ljust(21 - l)