def judge(self, fList, _1List):  # feature tuples, once-only word list
    """Grade the prediction and raise the matching result flags.

    ``fList`` holds feature tuples; the second item of each is passed
    through ``no_abc123`` to obtain the word to look up.  ``_1List`` is the
    collection of words recorded as occurring only once.  Flags are only
    ever set to 1 here, never cleared.

    Returns the concatenation ``correctList1 + correctList2 + spFault``.
    """
    words = [no_abc123(item[1]) for item in fList]

    # Slot 0 records a correct prediction, slot 1 a wrong one.
    slot = 0 if self.predictone == self.tone else 1
    self.correctList1[slot] = 1

    # correctList2 is flagged only when at least one feature word is
    # missing from the once-only list.
    for word in words:
        if word not in _1List:
            self.correctList2[slot] = 1
            break

    if slot == 1:
        # Special faults, checked for wrong predictions only.
        shortfall = max(self.score) - self.score[self.toneNum]
        if shortfall < sum(self.score) / 5:
            self.spFault[0] = 1
        if 3 < self.rank:
            self.spFault[1] = 1

    return self.correctList1 + self.correctList2 + self.spFault
def _load_training_scores(path):
    """Parse the training file at *path* into per-word score tables.

    Every line supplies a feature word (first whitespace-separated field),
    its frequency (second field) and a bracketed, ``', '``-separated list
    of integer counts.

    Returns:
        A ``(table, once_only)`` pair.  ``table`` maps each word to a tuple
        of three lists: the counts divided by the frequency, the raw
        counts, and ``log2(1 + count)`` rounded to two decimals.
        ``once_only`` lists the words whose frequency is exactly 1.
    """
    table = {}
    once_only = []
    # NOTE(review): read with the platform default encoding, unlike the
    # utf8 files handled in testing() -- confirm that this is intended.
    with open(path, 'r') as fr:
        records = fr.readlines()
    for record in records:
        fields = record.split()
        word = fields[0]
        freq = int(fields[1])
        counts = [
            int(n) for n in record.split('[')[1].split(']')[0].split(', ')
        ]
        table[word] = (
            [n / freq for n in counts],  # ratio
            counts,  # raw counts
            [round(math.log2(1 + n), 2) for n in counts],  # log of counts
        )
        if freq == 1:
            once_only.append(word)
    return table, once_only


def _format_percent(hits, misses):
    """Format ``hits / (hits + misses)`` as a percentage with 3 decimals.

    Returns the integer 0 when there is nothing to divide by, so one empty
    tally no longer discards the values of the other tallies.
    """
    total = hits + misses
    if total == 0:
        return 0
    return format(round(hits / total, 5) * 100, '.3f')


def _grab_feature(sentence, grabMethod, readName, posi):
    """Extract the feature list of *sentence* with the named grab method.

    Raises:
        ValueError: if *grabMethod* is not 'nb', 'allhead' or 'treenb'.
    """
    content = grab.Grab(sentence)
    if grabMethod == 'nb':
        return content.nb(*posi)
    if grabMethod == 'allhead':
        return content.all_head(*posi)
    if grabMethod == 'treenb':
        # The layer count is written between parentheses in the file name.
        layerNum = readName.split('(')[1].split(')')[0]
        return content.tree_nb(int(layerNum), *posi)
    raise ValueError(f'unknown grab method: {grabMethod!r}')


def _sum_feature_scores(syn, feature):
    """Add up the three score vectors of every word in *feature*.

    Returns the totals for score 1 (rounded to two decimals after every
    addition), score 2 and score 3, in that order.
    """
    t_score1 = [0, 0, 0, 0, 0, 0, 0, 0]
    t_score2 = [0, 0, 0, 0, 0, 0, 0, 0]
    t_score3 = [0, 0, 0, 0, 0, 0, 0, 0]
    for tp in feature:  # tuple, e.g. (1, 'DET:<word>')
        ch = no_abc123(tp[1])
        try:
            sc1 = syn.get_score(ch, scoreNum=1, dictSwitch=dictswitch)
            t_score1 = [round(a + b, 2) for a, b in zip(sc1, t_score1)]
            sc2 = syn.get_score(ch, scoreNum=2, dictSwitch=dictswitch)
            t_score2 = [a + b for a, b in zip(sc2, t_score2)]
            sc3 = syn.get_score(ch, scoreNum=3, dictSwitch=dictswitch)
            t_score3 = [a + b for a, b in zip(sc3, t_score3)]
        except Exception:
            # Best effort, as before: a word that cannot be scored simply
            # stops contributing.  Unlike a bare ``except`` this lets
            # KeyboardInterrupt and SystemExit propagate.
            continue
    return t_score1, t_score2, t_score3


def testing(readPath, writePath, readName):
    """Score the test sentences with a trained table and write the reports.

    The training table is read from ``readPath/readName``.  The last ten
    characters of *readName* are dropped and the remainder is split on
    ``'_'`` to obtain the grab method, the cts method and the position.
    For every file number in the module-level ``trainNum`` the matching
    tone file and sentence file are read, and each selected sentence is
    scored three ways (ratio, raw count, log count).

    Three files are written into *writePath*: the main ``(test)`` log and
    the ``(testRp1)`` / ``(testRp2)`` logs of the two special faults.

    Returns:
        ``(cnt, noFind, inappli, applicRate, prec, prec1)`` where ``prec``
        and ``prec1`` each hold one entry per score type: a percentage
        string, or 0 when that tally is empty.  ``applicRate`` follows the
        same convention.

    Raises:
        ValueError: if the grab method taken from *readName* is unknown.
    """
    # user
    stem = readName[:-10]
    writeName = stem + '(test).txt'
    writeNameRp1 = stem + '(testRp1).txt'
    writeNameRp2 = stem + '(testRp2).txt'
    nameParts = stem.split('_')
    ctsMethod = nameParts[1]
    grabMethod = nameParts[0].split('(')[0]
    posi = [int(nameParts[2])]

    # initial
    cnt = 0
    v = [0, 0, 0]  # correct
    x = [0, 0, 0]  # wrong
    noFind = 0
    v1 = [0, 0, 0]  # second tally, taken from Score.correctList2
    x1 = [0, 0, 0]
    inappli = 0  # inapplicable

    # loading training data
    trainingDict, _1List = _load_training_scores(
        os.path.join(readPath, readName))
    # syndict
    syn = syndict.syn_dict(trainingDict, dic=Dic)

    # Output files; the context manager closes all three even when a
    # sentence fails half way through.
    with open(os.path.join(writePath, writeName), 'w',
              encoding='utf8') as fw, \
            open(os.path.join(writePath, writeNameRp1), 'w',
                 encoding='utf8') as fw_report1, \
            open(os.path.join(writePath, writeNameRp2), 'w',
                 encoding='utf8') as fw_report2:
        reports = (fw_report1, fw_report2)  # indexed like Score.spFault

        for fileNum in trainNum:
            # toneDict: records are four lines long; the key is the first
            # token of a record's third line, the value the second token
            # of its fourth line.
            with open(f'../data/input/{fileNum}out.txt', 'r',
                      encoding='utf8') as fr1:
                toneFile = fr1.readlines()
            toneDict = {
                toneFile[2 + 4 * k].split()[0]: toneFile[3 + 4 * k].split()[1]
                for k in range(int(len(toneFile) / 4))
            }
            # test
            with open(f'../data/grablist/input/{fileNum}{ctsMethod}.txt',
                      'r',
                      encoding='utf8') as fr2:
                lines = fr2.readlines()
            # Keep only the slice selected by the module-level start/end
            # (tenths of the file length); the +1 skips one more line at
            # the front of that slice.
            lines = lines[int(len(lines) * start / 10) +
                          1:int(len(lines) * end / 10)]

            for line in lines:
                cnt += 1
                fields = line.split()
                num = fields[0]  # @num
                st = fields[1]
                tone = toneDict.get(num)  # correct reading
                fw.write('\n' + fileNum.ljust(7) + num.ljust(7) + '\n' + st +
                         '\n')
                feature = _grab_feature(st, grabMethod, readName, posi)

                # The grab method found no feature word.
                if feature == []:
                    noFind += 1
                    fw.write('[]\n')
                    for _ in range(3):  # one line per score type
                        fw.write('no feature'.ljust(50) + tone + '\n')
                    continue

                fw.write(str(feature) + '\n')
                t_scores = _sum_feature_scores(syn, feature)

                # Inapplicable: the first score vector is all zeros.
                if t_scores[0] == [0, 0, 0, 0, 0, 0, 0, 0]:
                    inappli += 1
                    for _ in range(3):
                        fw.write('inapplicable'.ljust(50) + tone + '\n')
                    continue

                # Score 1, 2 and 3 are judged and logged identically.
                for k, t_score in enumerate(t_scores):
                    label = f's{k + 1} '
                    result = scorecalcu.Score(t_score, tone)
                    result.judge(feature, _1List)
                    v[k] += result.correctList1[0]
                    x[k] += result.correctList1[1]
                    v1[k] += result.correctList2[0]
                    x1[k] += result.correctList2[1]
                    for fault, report in enumerate(reports):
                        if result.spFault[fault] == 1:
                            report.write(label + fileNum.ljust(7) +
                                         num.ljust(7))
                            report.write(tone.ljust(8) +
                                         result.predictone.ljust(8))
                            report.write(str(feature).ljust(30) + '\n')
                    fw.write(
                        str(t_score).ljust(50) + tone.ljust(8) +
                        str(result.rank) + '/' + str(result.allrank) + '\n')

        # Each precision is computed on its own so an empty tally only
        # zeroes its own entry.
        prec = [_format_percent(t, f) for t, f in zip(v, x)]
        prec1 = [_format_percent(t, f) for t, f in zip(v1, x1)]
        # Same formula as (cnt - noFind - inappli) / cnt, but safe when no
        # sentence was processed.
        applicRate = _format_percent(cnt - noFind - inappli, noFind + inappli)

        fw.write(
            f'\n\n共{cnt}句, no feature:{noFind}, inapplicable:{inappli}, applicability:{applicRate}%\n'
        )
        fw.write(f'\nS1 正確:{v[0]}, 錯誤:{x[0]}, precision:{prec[0]}%\n')
        fw.write(f'不算詞頻為1的特徵詞 正確:{v1[0]} 錯誤:{x1[0]}  precision:{prec1[0]}%\n')
        fw.write(f'\nS2 正確:{v[1]}, 錯誤:{x[1]}, precision:{prec[1]}%\n')
        fw.write(f'不算詞頻為1的特徵詞 正確:{v1[1]} 錯誤:{x1[1]} precision:{prec1[1]}%\n')
        fw.write(f'\nS3 正確:{v[2]}, 錯誤:{x[2]}, precision:{prec[2]}%\n')
        fw.write(f'不算詞頻為1的特徵詞 正確:{v1[2]} 錯誤:{x1[2]} precision:{prec1[2]}%\n')

    # Previously placed after the return statement and therefore never run.
    print('finish')
    return cnt, noFind, inappli, applicRate, prec, prec1
# Example #3
# 0
def combinefiles(fileDatas, CoefTupleOfData):  # fileDatas is a list
    """Merge several result listings into one weighted-average result.

    Every entry of *fileDatas* is a list of text lines laid out in groups
    of seven: at offset 1 a header whose second token starts with ``'@'``,
    at offset 2 the sentence, at offset 3 the feature line and at offsets
    4-6 the three score lines.  The final group of seven is not processed
    (presumably it holds the trailing summary -- TODO confirm).

    The score vectors of all files are averaged with the weights in
    *CoefTupleOfData* and judged again through ``score.Score``.

    Returns:
        ``(cnt, noFind, inappli, applicRate, prec, output_text, spfault1,
        spfault2)``.  ``prec`` has one entry per score type: a percentage
        string, or 0 when nothing was judged for it; ``applicRate`` follows
        the same convention.  The last three items are lists of output
        lines.

    Raises:
        TypeError: if the number of coefficients differs from the number
            of files, or the files differ in length (type kept for
            backward compatibility).
        Exception: if a header's second token does not start with '@'.
    """
    output_text = []  # texts of output
    spfault1 = []
    spfault2 = []
    v = [0, 0, 0]
    x = [0, 0, 0]
    noFind = 0
    inappli = 0

    def check_num_of_coef(fileDatas, CoefTupleOfData):
        if len(CoefTupleOfData) != len(fileDatas):
            # Message text is kept exactly as callers have always seen it.
            raise TypeError('Numbers of coefficients are not equal \n'
                            '                            '
                            'to number of file datas!')

    # check lines numbers of file datas are equal
    def check_lineno(fileDatas):
        if len({len(fileData) for fileData in fileDatas}) > 1:
            raise TypeError('amount of sentence in file1 and 2 is \n'
                            '                            '
                            'inequality!')

    # get score and tone from a score line
    def str2int_scoreList(string):
        fields = string.split()
        if fields[0] == 'no':  # 'no feature <tone>'
            return [0, 0, 0, 0, 0, 0, 0, 0], fields[2]
        if fields[0] == 'inapplicable':  # 'inapplicable <tone>'
            return [0, 0, 0, 0, 0, 0, 0, 0], fields[1]
        values = string.split('[')[1].split(']')[0].split(', ')
        values = [float(item) for item in values]
        tone = string.split('] ')[1].split()[0]  # correct ans
        return values, tone

    # weighted average of the score vectors, rounded to two decimals
    def sum_of_score(scoreList):
        coefTotal = sum(CoefTupleOfData)
        sumList = [0, 0, 0, 0, 0, 0, 0, 0]
        for sc, r in zip(scoreList, CoefTupleOfData):
            sumList = [n * r + acc for n, acc in zip(sc, sumList)]
        return [round(s / coefTotal, 2) for s in sumList]

    # percentage string, or 0 when there is nothing to divide by
    def percent(hits, misses):
        total = hits + misses
        if total == 0:
            return 0
        return format(round(hits / total, 5) * 100, '.3f')

    check_num_of_coef(fileDatas, CoefTupleOfData)
    check_lineno(fileDatas)
    amountOfLines = int(len(fileDatas[0]) / 7)
    dataSample = fileDatas[0]  # set data 1 as data sample

    for i in range(amountOfLines - 1):
        base = i * 7
        header = dataSample[base + 1]
        if header.split()[1][0] != '@':
            raise Exception('data start from not "@"!')

        output_text.append('\n')  # ' '
        output_text.append(header)  # '199x @oo'
        output_text.append(dataSample[base + 2])  # sentence
        # feature: the feature lines of all files, newline stripped, put
        # one after another.  (The former ``feature.join(...)`` spliced the
        # earlier text between every character of the next line.)
        feature = ''.join(fileData[base + 3][:-1] for fileData in fileDatas)
        feature += '\n'
        output_text.append(feature)

        # score
        s1List = []  # load s1 of datas
        s2List = []  # load s2 of datas
        s3List = []  # load s3 of datas
        for fileData in fileDatas:
            for offset, bucket in zip((4, 5, 6), (s1List, s2List, s3List)):
                # tone ends up being the one parsed from the last line read
                tempScore, tone = str2int_scoreList(fileData[base + offset])
                bucket.append(tempScore)

        # judgment of no feature
        if no_abc123(feature) == '':
            noFind += 1
            for _ in range(3):
                output_text.append('no feature'.ljust(50) + tone.ljust(8) +
                                   '\n')
            continue

        sum1 = sum_of_score(s1List)
        sum2 = sum_of_score(s2List)
        sum3 = sum_of_score(s3List)

        # judgment of inapplicability
        if sum2 == [0, 0, 0, 0, 0, 0, 0, 0]:
            inappli += 1
            for _ in range(3):
                output_text.append('inapplicable'.ljust(50) + tone + '\n')
            continue

        scored = [score.Score(total, tone) for total in (sum1, sum2, sum3)]
        for k, (label, sc) in enumerate(zip(('s1 ', 's2 ', 's3 '), scored)):
            # judge2() is now called once per score object instead of four
            # times; assumes it returns the same flags on every call --
            # TODO confirm.
            verdict = sc.judge2()
            v[k] += verdict[0]
            x[k] += verdict[1]
            if verdict[2] == 1 or verdict[3] == 1:
                # S3 entries used to be mislabelled 's2 '.
                entry = (label + header[:-1] + sc.tone.ljust(8) +
                         sc.predictone.ljust(8) + str(feature).ljust(30) +
                         '\n')
                if verdict[2] == 1:  # special fault 1
                    spfault1.append(entry)
                if verdict[3] == 1:  # special fault 2
                    spfault2.append(entry)

        # output
        for sc in scored:
            output_text.append(
                str(sc.score).ljust(50) + tone.ljust(8) +
                f'{sc.rank}/{sc.allrank}\n')

    cnt = amountOfLines - 1
    # Each precision is computed on its own so an empty tally only zeroes
    # its own entry.
    prec = [percent(t, f) for t, f in zip(v, x)]
    # Same formula as (cnt - noFind - inappli) / cnt, but safe for cnt == 0.
    applicRate = percent(cnt - noFind - inappli, noFind + inappli)

    output_text.append(
        f'\n\n共{cnt}句, no feature:{noFind}, inapplicable:{inappli}, applicability:{applicRate}%\n'
    )
    output_text.append(f'\nS1 正確:{v[0]}, 錯誤:{x[0]}, precision:{prec[0]}%\n')
    output_text.append('')
    output_text.append(f'\nS2 正確:{v[1]}, 錯誤:{x[1]}, precision:{prec[1]}%\n')
    output_text.append('')
    output_text.append(f'\nS3 正確:{v[2]}, 錯誤:{x[2]}, precision:{prec[2]}%\n')
    output_text.append('')

    print('finish!!')
    return cnt, noFind, inappli, applicRate, prec, output_text, spfault1, spfault2
        num = line.split()[0]  # @num
        st = line.split()[1]
        content = grab.Grab(st)
        if grabMethod == grabMethodList[0]:
            feature = content.nb(*posi)
        elif grabMethod == grabMethodList[1]:
            feature = content.all_head(*posi)
        elif grabMethod == grabMethodList[2]:
            if layerNum == '':
                layerNum = input('Please input layerNum:')
            feature = content.tree_nb(int(layerNum), *posi)
            layerNumStr = '(' + str(layerNum) + ')'

        for tp in feature:  # tuple:(1, 'DET:一')7
            position = tp[0]
            ch = no_abc123(tp[1])
            dataDict[ch].append((fileNum, num, position))

    sortDataList = sorted(dataDict.items(),
                          key=lambda w: len(w[1]),
                          reverse=True)  # 排序字典

    with open(
            f'../data/grablist/output/{grabMethod}{layerNumStr}_{ctsMethod}_{posiName}.txt',
            'w',
            encoding='utf8') as fw:
        fw.write(str(posi) + '\n')
        for word in sortDataList:
            w = word[0]
            l = len_ch(w)  # 計算中文字數
            w = w.ljust(21 - l)