Exemple #1
0
class DataProcess:
    """
        self.qu : 分完词的问题列表,每个词是个元组,包括词和词性
        self.wr_ans : 分完词的错误答案列表
        self.cor_ans : 分完词的正确答案列表

        self.qu_vec : 词向量
        self.cor_ans_vec : 正确答案词向量列表
        self.wr_ans_vec : 错误答案词向量列表
    """
    def __init__(self,
                 seg_type='jieba',
                 data=DEV_DATA,
                 fill='NULL',
                 needfill=True):
        """Create the segmenter and data reader and derive the work-file names.

        Args:
            seg_type: Segmentation backend name, passed to ``Segmentation``.
            data: Path of the data file, passed to ``DataReader``.  The
                ``.word``, ``.bin``, ``.seg`` and ``.vec`` companion files
                share its directory and the part of its file name that
                precedes the first dot.
            fill: Placeholder token used to pad short word sequences.
            needfill: Accepted for backward compatibility; not used.
        """
        import os.path  # local import keeps this method self-contained

        # Strip the extension from the file name only.  Splitting the whole
        # path on '.' mangles paths whose directory part contains a dot,
        # e.g. './data/dev.data' -> '' or 'data.v1/dev.data' -> 'data'.
        file_name = os.path.split(data)[1]
        directory_prefix = data[:len(data) - len(file_name)]
        base = directory_prefix + file_name.split('.')[0]

        self.base_fname = base
        self.word_file = base + '.word'
        self.bin_file = base + '.bin'
        self.seg_file = base + '.seg'
        self.vec_file = base + '.vec'

        self.seg = Segmentation(seg_type)
        self.dr = DataReader(data)
        self.dr.filt()

        # Segmented text: questions, wrong answers, correct answers.
        self.qu = []
        self.wr_ans = []
        self.cor_ans = []

        # Word-vector counterparts of the three lists above.
        self.qu_vec = []
        self.wr_ans_vec = []
        self.cor_ans_vec = []

        self.max_len = 0
        self.fill_len = 0
        self.fill = fill

        # Histogram of sequence lengths: freq[n] counts sequences of n words.
        self.freq = [0] * 1050

        self.stop_word = self._load_stop_word()

    def _load_stop_word(self):
        """Return the entries of the stop-word file as a set.

        The UTF-8 file named by ``STOP_WORD`` is read line by line; each
        line, without its trailing newline, becomes one entry.
        """
        with open(STOP_WORD, encoding='utf-8') as stop_file:
            return {entry.partition('\n')[0] for entry in stop_file}

    def seg_word(self):
        """Segment every question and its answers and dump them to disk.

        For each record of the data reader the question, the correct answers
        and the wrong answers are segmented, appended to ``self.qu``,
        ``self.cor_ans`` and ``self.wr_ans``, and written to
        ``self.seg_file`` as one ``question,correct,wrong`` line.
        """
        print('%.2f:开始分词...' % (time.time() - now))
        with open(self.seg_file, 'w', encoding='utf-8') as fout:
            for idx in range(len(self.dr)):
                if self.seg.type == 'yuyanyun':
                    # NOTE(review): presumably throttles calls to a remote
                    # segmentation service - confirm.
                    time.sleep(0.0051)

                question = self.seg.segment(self.dr.question[idx])
                self.qu.append(question)

                correct = [
                    self.seg.segment(answer)
                    for answer in self.dr.cor_ans[idx]
                ]
                self.cor_ans.append(correct)

                wrong = [
                    self.seg.segment(answer)
                    for answer in self.dr.wr_ans[idx]
                ]
                self.wr_ans.append(wrong)

                fout.write('%s,%s,%s\n' % (question, correct, wrong))
        print('%.2f:分词成功' % (time.time() - now))

    def load_seg(self):
        with open(self.seg_file, encoding='utf-8') as fin:
            while True:
                line = fin.readline()
                if not line:
                    break
                qu, cor, wr = eval(line)
                lst = [len(qu)]
                self.freq[len(qu)] += 1
                for i in cor:
                    lst.append(len(i))
                    self.freq[len(i)] += 1
                self.qu.append(qu)
                self.cor_ans.append(cor)
                self.wr_ans.append(wr)
                self.max_len = max(max(lst), self.max_len)
        self.fill_len = self.get_fill_length()

    def get_fill_length(self):
        s = sum(self.freq)
        t = 0
        for i in range(len(self.freq)):
            t += self.freq[i]
            if t / s > 0.98:
                return i

    def save_word(self):
        with open(self.word_file, 'w', encoding='utf-8') as f:
            for i in range(len(self.qu)):
                lst = [w[0] for w in self.qu[i]]
                if len(lst) > self.fill_len:
                    lst = lst[:self.fill_len]
                else:
                    lst = lst + [self.fill] * (self.fill_len - len(lst))

                for j in range(len(self.wr_ans[i])):
                    tmp = [w[0] for w in self.wr_ans[i][j]]
                    if len(tmp) > self.fill_len:
                        tmp = tmp[:self.fill_len]
                    else:
                        tmp += [self.fill] * (self.fill_len - len(tmp))
                    lst += tmp
                for j in range(len(self.cor_ans[i])):
                    tmp = [w[0] for w in self.cor_ans[i][j]]
                    if len(tmp) > self.fill_len:
                        tmp = tmp[:self.fill_len]
                    else:
                        tmp += [self.fill] * (self.fill_len - len(tmp))
                    lst += tmp
                f.write('%s\n' % ('\n'.join(lst)))

    def word_vector(self):
        """Run ``word2vec.word2vec`` on the word file.

        ``self.word_file`` is the input and ``self.bin_file`` the output,
        with ``binary=1`` and ``verbose=False``.  Elapsed time since ``now``
        is printed before and after the call.
        """
        print('%.2f:开始转化词向量' % (time.time() - now))
        source_path, target_path = self.word_file, self.bin_file
        word2vec.word2vec(source_path,
                          target_path,
                          verbose=False,
                          binary=1)
        print('%.2f:词向量转化完成' % (time.time() - now))

    def gene_wordvec(self):
        """Turn every segmented sequence into ``fill_len`` word vectors.

        The model is loaded from ``self.bin_file``.  Each question, correct
        answer and wrong answer is cut to ``self.fill_len`` words; a word
        missing from the model is replaced by the vector of ``self.fill``,
        and short sequences are padded with that same vector.  Results are
        appended to ``self.qu_vec``, ``self.cor_ans_vec`` and
        ``self.wr_ans_vec``; nothing is written to ``self.vec_file``.
        A percentage progress indicator is written to stdout.
        """
        print('%.2f:开始填充词向量...' % (time.time() - now))
        model = word2vec.load(self.bin_file)
        width = self.fill_len
        total = len(self.qu)

        def vectorize(tagged_words):
            # One lookup of the pad vector per sequence instead of one per
            # unknown word plus one for the padding.
            pad_vec = model[self.fill]
            vectors = [
                model[pair[0]] if pair[0] in model else pad_vec
                for pair in tagged_words[:width]
            ]
            vectors += [pad_vec] * (width - len(tagged_words))
            return vectors

        for i, question in enumerate(self.qu):
            self.qu_vec.append(vectorize(question))
            self.cor_ans_vec.append(
                [vectorize(answer) for answer in self.cor_ans[i]])
            self.wr_ans_vec.append(
                [vectorize(answer) for answer in self.wr_ans[i]])

            # '\r' rewrites the same console line with the current progress.
            sys.stdout.write('\r%.2f' % (i / total * 100))
        # The message below mentions writing a file, but the vectors are
        # only kept in memory.
        print('%.2f:词向量填充完成,写入文件完成' % (time.time() - now))

    def fix(self, qu, ans):
        """Score how much of the question's keywords a set of answers misses.

        The words of ``qu`` are concatenated and passed to ``textrank``.
        The k-th returned keyword (counting from 1) weighs ``1 / k``.  For
        each answer in ``ans`` the weights of the keywords that do not occur
        among the answer's words are added up.

        Args:
            qu: Segmented question, a sequence of (word, tag) pairs.
            ans: Sequence of segmented answers.

        Returns:
            The largest per-answer sum, or 0 when ``ans`` is empty.
        """
        question_text = ''.join(token[0] for token in qu)
        keywords = textrank(question_text)
        weights = [1 / rank for rank in range(1, len(keywords) + 1)]

        highest = 0
        for answer in ans:
            answer_words = [token[0] for token in answer]
            missing = 0
            for weight, keyword in zip(weights, keywords):
                if keyword not in answer_words:
                    missing += weight
            highest = max(highest, missing)
        return highest

    def test(self):
        """Print the share of questions where correct answers out-score wrong ones.

        For every question ``fix`` is evaluated on its correct answers and
        on its wrong answers; the question counts when the wrong-answer
        score is strictly lower.  The printed value is that count divided
        by the number of questions, or ``0.0`` when no question is loaded.
        """
        total = len(self.qu)
        if total == 0:
            # No data loaded: report 0.0 rather than dividing by zero.
            print(0.0)
            return

        count = 0
        for i, question in enumerate(self.qu):
            cor_max = self.fix(question, self.cor_ans[i])
            wr_max = self.fix(question, self.wr_ans[i])
            if wr_max < cor_max:
                count += 1
        print(count / total)

    def fill_data(self):
        with open(self.word_file, 'w', encoding='utf-8') as out:
            for i in range(len(self.qu)):
                if len(self.qu[i]) < self.fill_len:
                    self.qu[i] += [(self.fill, 'null')
                                   ] * (self.fill_len - len(self.qu[i]))
                for j in range(len(self.qu[i])):
                    out.write('%s\n' % self.qu[i][j][0])
            for i in range(len(self.wr_ans)):
                for j in range(len(self.wr_ans[i])):
                    if len(self.wr_ans[i][j]) < self.fill_len:
                        self.wr_ans[i][j] += [
                            (self.fill, 'null')
                        ] * (self.fill_len - len(self.wr_ans[i][j]))
                    for k in range(len(self.wr_ans[i][j])):
                        out.write('%s\n' % self.wr_ans[i][j][k][0])
            for i in range(len(self.cor_ans)):
                for j in range(len(self.cor_ans[i])):
                    if len(self.cor_ans[i][j]) < self.fill_len:
                        self.cor_ans[i][j] += [
                            (self.fill, 'null')
                        ] * (self.fill_len - len(self.cor_ans[i][j]))
                    for k in range(len(self.cor_ans[i][j])):
                        out.write('%s\n' % self.cor_ans[i][j][k][0])
Exemple #2
0
from aip import AipNlp
from DataReader import DataReader
import time
import json
import re

AIP_ID = 'c68c52f38cf1454cbea41543450fc4c2'
API_KEY = 'G9hCennFyiZReL9uoGcAmwYD'
SECRET_KEY = '9DAuD67SV3vClE9hN92b4SHIGyUzdMDG'

aip = AipNlp(AIP_ID, API_KEY, SECRET_KEY)

dr = DataReader('data/develop.data')
dr.filt()

# s = re.sub('\'', '\"',"{'a':'b'}")
# res = json.loads(s, encoding='utf-8')
# print(res['a'])
#
cor_count = wr_count = 0
for i in range(len(dr.question)):
    wr_max = 0
    for j in range(len(dr.wr_ans[i])):
        time.sleep(0.1)
        res = eval(str(aip.simnet(dr.question[i], dr.wr_ans[i][j])))
        wr_max = max(res['score'], wr_max)
    time.sleep(0.1)
    res = eval(str(aip.simnet(dr.question[i], dr.cor_ans[i][0])))
    if res['score'] < wr_max:
        wr_count += 1
    else: