Ejemplo n.º 1
0
    def match(title, shorten_content, answer, question):
        """Locate the answer span inside the tokenized title + shortened content.

        The title tokens (followed by a sentence terminator token) and the
        content tokens are concatenated; the answer tokens are then searched
        as a contiguous sub-list of that sequence. When the answer occurs more
        than once, the occurrence whose surrounding window has the highest
        ROUGE-L recall against the question is chosen.

        :param title: title text
        :param shorten_content: (already shortened) content text
        :param answer: answer text; an empty string means "no answer"
        :param question: question text; its language decides which splitter
            is used for both the question and the answer
        :return: ``(start, end)`` inclusive token indices into the merged
            token list, or ``(-1, -1)`` when the answer cannot be located
        """
        # Nothing to search for: bail out before doing any tokenization.
        if answer == '':
            return -1, -1

        # utils.is_zh_or_en is truthy for text that should go through the
        # Chinese splitter (see how it selects split_word_zh below).
        if utils.is_zh_or_en(title):
            title_list = utils.split_word_zh(title) + ['。']
        else:
            title_list = utils.split_word_en(title) + ['.']

        if utils.is_zh_or_en(shorten_content):
            content_list = utils.split_word_zh(shorten_content)
        else:
            content_list = utils.split_word_en(shorten_content)

        merge_list = title_list + content_list
        merge_len = len(merge_list)

        # The answer is tokenized with the splitter picked from the question.
        if utils.is_zh_or_en(question):
            split_qa = utils.split_word_zh
        else:
            split_qa = utils.split_word_en

        answer_list = split_qa(answer)
        answer_len = len(answer_list)
        # An answer that tokenizes to nothing would "match" at every position
        # (an empty slice equals an empty list), yielding a meaningless span.
        if answer_len == 0:
            return -1, -1

        starts = [i for i in range(merge_len - answer_len + 1)
                  if merge_list[i:i + answer_len] == answer_list]

        if not starts:
            return -1, -1
        if len(starts) == 1:
            return starts[0], starts[0] + answer_len - 1

        # Several occurrences: rank each by how well its context matches the
        # question. The question is only tokenized when it is actually needed.
        question_str = ' '.join(split_qa(question))
        scores = []
        for s in starts:
            e = s + answer_len - 1
            # Window: up to 5 tokens before the answer; the slice end is
            # exclusive, so it reaches up to 4 tokens after the answer.
            answer_can = ' '.join(merge_list[max(s - 5, 0):e + 5])
            score = rouge.get_scores(answer_can, question_str,
                                     avg=True)['rouge-l']['r']
            scores.append(score)

        best_start = starts[int(np.argmax(scores))]
        return best_start, best_start + answer_len - 1
Ejemplo n.º 2
0
def build_vocab_embedding(list_df, vocab_path, embedding_in_zh,
                          embedding_in_en, embedding_out):
    """Build the vocabulary and its embedding matrix and save both to disk.

    Every text cell of the given frames is tokenized; words that are present
    in either pre-trained embedding are kept. The resulting index layout is:
    ``0`` -> ``'<pad>'``, ``1..n`` -> kept words, ``n + 1`` -> ``'<unk>'``,
    ``n + 2`` -> ``' '``. Rows for the three special tokens stay all-zero.

    :param list_df: iterable of DataFrames with 'title', 'content',
        'question' columns and optionally an 'answer' column
    :param vocab_path: output path of the pickled ``{'w2i': ..., 'i2w': ...}``
    :param embedding_in_zh: path of the first word2vec-format embedding file
    :param embedding_in_en: path of the second word2vec-format embedding file
    :param embedding_out: output path passed to ``np.save`` for the matrix
    :return: None
    """
    data = []
    for df in list_df:
        columns = ['title', 'content', 'question']
        if 'answer' in df:
            columns.append('answer')
        data += df[columns].values.flatten().tolist()

    vocab = set()
    for d in data:
        if utils.is_zh_or_en(d):
            vocab.update(utils.split_word_zh(d))
        else:
            vocab.update(utils.split_word_en(d))
    print('data, word_num:%d' % len(vocab))

    # zh
    model_zh = gensim.models.KeyedVectors.load_word2vec_format(embedding_in_zh)
    # en
    model_en = gensim.models.KeyedVectors.load_word2vec_format(embedding_in_en)

    tmp = {word for word in vocab if word in model_zh or word in model_en}
    # Guard the ratio: an empty vocabulary used to raise ZeroDivisionError.
    ratio = len(tmp) / len(vocab) if vocab else 0.0
    print('word_nums in pre-embedding:%d/%d, radio:%.4f' %
          (len(tmp), len(vocab), ratio))

    w2i = {'<pad>': 0}
    i2w = {0: '<pad>'}
    # The matrix width is taken from the first model; rows copied from the
    # second model must therefore have the same dimensionality.
    embedding = np.zeros([len(tmp) + 3, model_zh.vector_size])
    # Sorted so that word indices do not depend on set iteration order,
    # which can differ between interpreter runs (string hash randomization).
    for c, word in enumerate(sorted(tmp), start=1):
        w2i[word] = c
        i2w[c] = word
        # The first model takes precedence when both contain the word.
        if word in model_zh:
            embedding[c] = model_zh[word]
        elif word in model_en:
            embedding[c] = model_en[word]

    unk_index = len(tmp) + 1
    space_index = len(tmp) + 2
    w2i['<unk>'] = unk_index
    i2w[unk_index] = '<unk>'
    w2i[' '] = space_index
    i2w[space_index] = ' '
    lang = {'w2i': w2i, 'i2w': i2w}
    print('vocab size:%d' % (len(tmp) + 3))
    print('embedding size:', embedding.shape)

    # save
    with open(vocab_path, 'wb') as file:
        pickle.dump(lang, file)
    np.save(embedding_out, embedding)
Ejemplo n.º 3
0
def gen_tag_index(df):
    """Build the tag -> index table and pickle it to ``config.tag_path``.

    Every cell of the 'title', 'content' and 'question' columns is run
    through the matching splitter with ``have_tag=True``; each tag it
    reports gets the next free index in order of first appearance.
    Indices 0 and 1 are reserved for ``'<pad>'`` and ``'<unk>'``.

    :param df: DataFrame with 'title', 'content' and 'question' columns
    :return: None
    """
    texts = df[['title', 'content', 'question']].values.flatten().tolist()

    tag2i = {'<pad>': 0, '<unk>': 1}
    for text in texts:
        splitter = (utils.split_word_zh if utils.is_zh_or_en(text)
                    else utils.split_word_en)
        _, tags = splitter(text, have_tag=True)
        for tag in tags:
            # The next free index always equals the current table size.
            tag2i.setdefault(tag, len(tag2i))

    with open(config.tag_path, 'wb') as file:
        pickle.dump(tag2i, file)
    print('word flag num:%d' % len(tag2i))  # original author's note: 98
Ejemplo n.º 4
0
def shorten_content_all(df, max_len):
    """
    Shorten every row's content so that title + content fits a token budget.

    Writes a ``shorten_content`` and a ``len`` column into ``df`` (plus an
    ``is_in`` column when an ``answer`` column exists) and prints summary
    statistics about how often the answer survives the shortening.

    :param df: DataFrame with 'title', 'content', 'question' columns and
        optionally an 'answer' column; it is modified in place
    :param max_len: maximum number of tokens for title + separator + content
    :return: df
    """
    # NOTE(review): presumably raised for the recursive ROUGE-L computation;
    # this is a process-wide setting - confirm it is still required.
    sys.setrecursionlimit(1000000)
    rouge = Rouge(metrics=['rouge-l'])

    def tokenize(text):
        """Tokenize ``text`` with the splitter that matches its language."""
        if utils.is_zh_or_en(text):
            return utils.split_word_zh(text)
        return utils.split_word_en(text)

    def match(title, content, question, max_len):
        """Return ``content`` shortened to fit next to ``title`` in ``max_len`` tokens.

        Content routed to the English splitter is cut at a word boundary.
        Content routed to the Chinese splitter is split into sentences on
        '。'; sentences are ranked by their relation to the question and
        the highest-ranked ones that fit are kept in document order.
        """
        title_list = tokenize(title)
        title_len = len(title_list)
        # Tokens left for the content after the title and one separator.
        # Clamped at 0 so an over-long title cannot produce a negative slice
        # bound (which would slice from the end instead of truncating).
        content_budget = max(max_len - title_len - 1, 0)

        if not utils.is_zh_or_en(content):
            words = utils.split_word_en(content)
            if len(words) + title_len + 1 <= max_len:
                return content
            # Character offset estimated from the kept words (spaces are not
            # counted), then advanced to the next space so no word is cut.
            index = sum(len(word) for word in words[:content_budget])
            # Bounded scan: there may be no further space before the end.
            while index < len(content) and content[index] != ' ':
                index += 1
            return content[:index]

        content_tokens = utils.split_word_zh(content)
        # Filter: already short enough.
        if title_len + len(content_tokens) + 1 <= max_len:
            return content

        # No sentence delimiter: plain truncation to the budget.
        if '。' not in content:
            return ''.join(content_tokens[:content_budget])

        sentences = [s for s in content.split('。')
                     if s not in ('', ' ', '  ')]
        # Index 0 is the title; real sentences start at index 1.
        content_list = [title_list] + [utils.split_word_zh(s)
                                       for s in sentences]
        content_len = len(content_list)

        question_str = ' '.join(tokenize(question))

        # Similarity score: ROUGE-L recall; -5 marks a sentence that is
        # literally contained in the question.
        scores = []
        for tokens in content_list:
            if ''.join(tokens) in question:
                scores.append(-5)
            else:
                scores.append(rouge.get_scores(' '.join(tokens), question_str,
                                               avg=True)['rouge-l']['r'])

        # Sentence categories: 0 = unmarked, -1 .. -11 = priority classes
        # (-1 is the most important). A sentence keeps its first category.
        flag = np.zeros(content_len)

        def mark(idx, value):
            """Assign ``value`` to sentence ``idx`` if it exists and is unmarked."""
            if 0 <= idx < content_len and flag[idx] == 0:
                flag[idx] = value

        # Core sentences: every sentence that reaches the best score.
        max_score = max(scores)
        core = [i for i, score in enumerate(scores) if score == max_score]
        for i in core:
            flag[i] = -1

        # Sentence right after a core sentence.
        for i in core:
            mark(i + 1, -2)
        # Last sentence.
        mark(content_len - 1, -3)
        # First sentence (bounds-checked: there may be no sentence at all).
        mark(1, -4)
        # Sentences contained in the question, with their neighbours.
        for i, score in enumerate(scores):
            if score == -5:
                mark(i - 1, -5)
                mark(i, -5)
                mark(i + 1, -5)
        # Second sentence after a core sentence.
        for i in core:
            mark(i + 2, -6)
        # Sentence right before a core sentence.
        for i in core:
            mark(i - 1, -7)
        # Third sentence after a core sentence.
        for i in core:
            mark(i + 3, -8)
        # Second sentence before a core sentence.
        for i in core:
            mark(i - 2, -9)
        if content_len >= 3:
            # Second-to-last sentence.
            mark(content_len - 2, -10)
            # Second sentence.
            mark(2, -11)

        # The title itself is never part of the shortened content.
        flag[0] = 0

        flagged = [i for i in range(content_len) if flag[i] != 0]
        # Each sentence costs its tokens plus one for the delimiter.
        number = sum(len(content_list[i]) + 1 for i in flagged)
        budget = max_len - title_len

        if number <= budget:
            result = [''.join(content_list[i]) for i in flagged]
        else:
            # Visit sentences by priority class, then by position, and keep
            # them until the first one that no longer fits.
            ordered = [j for priority in range(-1, -12, -1)
                       for j in range(content_len) if flag[j] == priority]
            kept = []
            used = 0
            for j in ordered:
                used += len(content_list[j]) + 1
                if used > budget:
                    break
                kept.append(j)
            result = [''.join(content_list[j]) for j in sorted(kept)]

            # Not even the top-priority sentence fits: truncate it. Using the
            # first sentence in priority order also covers the case where the
            # only core "sentence" was the title (reset to 0 above).
            if not result and ordered:
                result = [''.join(content_list[ordered[0]][:content_budget])]

        # Drop repeated sentences, keeping the first occurrence.
        seen = set()
        unique = []
        for sentence in result:
            if sentence not in seen:
                seen.add(sentence)
                unique.append(sentence)

        return '。'.join(unique)

    titles = df['title'].values
    contents = df['content'].values
    questions = df['question'].values

    shorten_content = [match(t, c, q, max_len)
                       for t, c, q in zip(titles, contents, questions)]
    df['shorten_content'] = shorten_content

    n_rows = len(df)

    # Evaluate how well the shortening preserved the answers.
    if 'answer' in df:

        answers = df['answer'].values

        is_in = [(a in c) or (a in t)
                 for c, t, a in zip(contents, titles, answers)]
        r1 = sum(is_in) / n_rows if n_rows else 0.0
        print('答案存在比例:%.4f' % r1)

        is_in = [(a in t) or (a in m)
                 for t, m, a in zip(titles, shorten_content, answers)]
        df['is_in'] = is_in
        r2 = sum(is_in) / n_rows if n_rows else 0.0
        print('截取比例:%.4f' % r2)

        # Guarded: r1 is 0 when no answer occurs in any title/content.
        print('截取准确率:%.4f' % (r2 / r1 if r1 else 0.0))

    # Token length of title + separator + shortened content per row.
    merge_len = [len(tokenize(t)) + len(tokenize(s)) + 1
                 for t, s in zip(titles, shorten_content)]

    df['len'] = merge_len
    # max()/min() and the '%d' formatting of a NaN mean fail on no rows.
    if merge_len:
        print('max length: %d' % max(merge_len))
        print('min length: %d' % min(merge_len))
        print('mean length:%d' % df['len'].mean())
        print('median length:%d' % df['len'].median())

    return df
Ejemplo n.º 5
0
    def match(title, content, question, max_len):
        """Return ``content`` shortened to fit next to ``title`` in ``max_len`` tokens.

        Content routed to the English splitter is cut at a word boundary.
        Content routed to the Chinese splitter is split into sentences on
        '。'; sentences are ranked by their relation to the question and
        the highest-ranked ones that fit are kept in document order.

        :param title: title text
        :param content: content text to shorten
        :param question: question text used to rank the sentences
        :param max_len: maximum number of tokens for title + separator + content
        :return: the (possibly shortened) content string
        """

        def tokenize(text):
            """Tokenize ``text`` with the splitter that matches its language."""
            if utils.is_zh_or_en(text):
                return utils.split_word_zh(text)
            return utils.split_word_en(text)

        title_list = tokenize(title)
        title_len = len(title_list)
        # Tokens left for the content after the title and one separator.
        # Clamped at 0 so an over-long title cannot produce a negative slice
        # bound (which would slice from the end instead of truncating).
        content_budget = max(max_len - title_len - 1, 0)

        if not utils.is_zh_or_en(content):
            words = utils.split_word_en(content)
            if len(words) + title_len + 1 <= max_len:
                return content
            # Character offset estimated from the kept words (spaces are not
            # counted), then advanced to the next space so no word is cut.
            index = sum(len(word) for word in words[:content_budget])
            # Bounded scan: there may be no further space before the end.
            while index < len(content) and content[index] != ' ':
                index += 1
            return content[:index]

        content_tokens = utils.split_word_zh(content)
        # Filter: already short enough.
        if title_len + len(content_tokens) + 1 <= max_len:
            return content

        # No sentence delimiter: plain truncation to the budget.
        if '。' not in content:
            return ''.join(content_tokens[:content_budget])

        sentences = [s for s in content.split('。')
                     if s not in ('', ' ', '  ')]
        # Index 0 is the title; real sentences start at index 1.
        content_list = [title_list] + [utils.split_word_zh(s)
                                       for s in sentences]
        content_len = len(content_list)

        question_str = ' '.join(tokenize(question))

        # Similarity score: ROUGE-L recall; -5 marks a sentence that is
        # literally contained in the question.
        scores = []
        for tokens in content_list:
            if ''.join(tokens) in question:
                scores.append(-5)
            else:
                scores.append(rouge.get_scores(' '.join(tokens), question_str,
                                               avg=True)['rouge-l']['r'])

        # Sentence categories: 0 = unmarked, -1 .. -11 = priority classes
        # (-1 is the most important). A sentence keeps its first category.
        flag = np.zeros(content_len)

        def mark(idx, value):
            """Assign ``value`` to sentence ``idx`` if it exists and is unmarked."""
            if 0 <= idx < content_len and flag[idx] == 0:
                flag[idx] = value

        # Core sentences: every sentence that reaches the best score.
        max_score = max(scores)
        core = [i for i, score in enumerate(scores) if score == max_score]
        for i in core:
            flag[i] = -1

        # Sentence right after a core sentence.
        for i in core:
            mark(i + 1, -2)
        # Last sentence.
        mark(content_len - 1, -3)
        # First sentence (bounds-checked: there may be no sentence at all).
        mark(1, -4)
        # Sentences contained in the question, with their neighbours.
        for i, score in enumerate(scores):
            if score == -5:
                mark(i - 1, -5)
                mark(i, -5)
                mark(i + 1, -5)
        # Second sentence after a core sentence.
        for i in core:
            mark(i + 2, -6)
        # Sentence right before a core sentence.
        for i in core:
            mark(i - 1, -7)
        # Third sentence after a core sentence.
        for i in core:
            mark(i + 3, -8)
        # Second sentence before a core sentence.
        for i in core:
            mark(i - 2, -9)
        if content_len >= 3:
            # Second-to-last sentence.
            mark(content_len - 2, -10)
            # Second sentence.
            mark(2, -11)

        # The title itself is never part of the shortened content.
        flag[0] = 0

        flagged = [i for i in range(content_len) if flag[i] != 0]
        # Each sentence costs its tokens plus one for the delimiter.
        number = sum(len(content_list[i]) + 1 for i in flagged)
        budget = max_len - title_len

        if number <= budget:
            result = [''.join(content_list[i]) for i in flagged]
        else:
            # Visit sentences by priority class, then by position, and keep
            # them until the first one that no longer fits.
            ordered = [j for priority in range(-1, -12, -1)
                       for j in range(content_len) if flag[j] == priority]
            kept = []
            used = 0
            for j in ordered:
                used += len(content_list[j]) + 1
                if used > budget:
                    break
                kept.append(j)
            result = [''.join(content_list[j]) for j in sorted(kept)]

            # Not even the top-priority sentence fits: truncate it. Using the
            # first sentence in priority order also covers the case where the
            # only core "sentence" was the title (reset to 0 above).
            if not result and ordered:
                result = [''.join(content_list[ordered[0]][:content_budget])]

        # Drop repeated sentences, keeping the first occurrence.
        seen = set()
        unique = []
        for sentence in result:
            if sentence not in seen:
                seen.add(sentence)
                unique.append(sentence)

        return '。'.join(unique)