Exemple #1
0
def calc_trans_prob(data):
    trans_dict = {}
    # init trans dict
    for tag in tag_set:
        trans_dict[tag] = {}

    for sentence in data:
        sentence = sentence.strip().split(' ')
        for index in range(len(sentence)):
            word = sentence[index]
            # 注:使用/切分 是有问题的 比如 吨/日/q 需要合并
            word = word.split('/')
            word_tag = word[-1]
            if index != len(sentence) - 1:
                next_tag = sentence[index + 1].split('/')[-1]
                try:
                    trans_dict[word_tag][next_tag] += 1
                except:
                    trans_dict[word_tag][next_tag] = 1

    print(trans_dict)
    print()
    print()

    # 计算概率
    for tag in trans_dict:
        values = trans_dict[tag].values()
        print("tag all_emit_words_seen: ", tag, sum(values))
        one_tag_trans_num = sum(values)
        for next_tag in trans_dict[tag]:
            trans_dict[tag][next_tag] /= one_tag_trans_num
    print(trans_dict)
    store_dict(trans_dict, "trans_prob.pkl")
Exemple #2
0
def calc_emit_prob(data):
    # 使用二维dict 存储emit计数
    # 这里是从隐含状态到观测状态的dict
    emit_dict = {}
    # init emit dict
    for tag in tag_set:
        emit_dict[tag] = {}
    for sentence in data:
        sentence = sentence.strip().split(' ')
        for word in sentence:
            # 注:使用/切分 是有问题的 比如 吨/日/q 需要合并
            word = word.split('/')
            combine_word = ('/').join(word[:-1])
            try:
                emit_dict[word[-1]][combine_word] += 1
            except:
                emit_dict[word[-1]][combine_word] = 1

    # 计算概率
    for tag in emit_dict:
        values = emit_dict[tag].values()
        # 每一个tag的计数
        one_tag_stats_num = sum(values)
        # 添加对tag中所有不重复单词的计数
        for key in emit_dict[tag]:
            emit_dict[tag][key] /= one_tag_stats_num
    store_dict(emit_dict, "emit_prob" + suffix + ".pkl")
Exemple #3
0
def calc_emit_prob_add_one(data):
    # 首先统计出一共有多少类别 下面是统计结果
    tag_set = {
        'na', '', 'l', 'c', 'Rg', 'm', 'o', 'r', 'nx', 'ns', 'Vg', 'an', 'vd',
        'a', 'Yg', 'nt', 'Dg', 'k', 'h', 'i', 'd', 'Ag', 'q', 'b', 'w', 'u',
        'p', 'f', 'Mg', 'ad', 'Bg', 's', 'z', 'y', 'n', 't', 'vn', 'j', 'Tg',
        'e', 'v', 'Ng', 'nz', 'nr'
    }

    # 使用二维dict 存储emit计数
    # 这里是从隐含状态到观测状态的dict
    emit_dict = {}
    # init emit_dict
    for tag in tag_set:
        emit_dict[tag] = {}
    for sentence in data:
        sentence = sentence.strip().split(' ')
        for word in sentence:
            # 注:使用/切分 是有问题的 比如 吨/日/q 需要合并
            word = word.split('/')
            # print(word)
            combine_word = ('/').join(word[:-1])
            # print(combine_word)
            # print(word)
            try:
                emit_dict[word[-1]][combine_word] += 1
            except:
                emit_dict[word[-1]][combine_word] = 1
    # print(emit_dict)

    # 计算概率
    # emit_prob deepcopy from emit_dict
    for tag in emit_dict:
        values = emit_dict[tag].values()
        # print(values)
        # print("tag all_emit_words_seen: ", tag, sum(values))
        # 每一个tag的计数
        one_tag_stats_num = sum(values)
        # 添加对tag中所有不重复单词的计数
        one_tag_word_cnt = len(emit_dict[tag]) + 1
        # 加1平滑的分母
        smooth_num = one_tag_stats_num + one_tag_word_cnt
        # 对所有集外词/未登陆词 平滑后的概率
        smooth_prob_for_oov = 1 / smooth_num
        emit_dict[tag]['oov'] = smooth_prob_for_oov

        for key in emit_dict[tag]:
            # print(key)
            emit_dict[tag][key] += 1
            emit_dict[tag][key] /= smooth_num
    print(emit_dict)
    store_dict(emit_dict, "emit_prob" + suffix + ".pkl")
Exemple #4
0
def calc_trans_prob_simple_smooth(data):

    trans_dict = {}
    # init trans_dict
    for tag in tag_set:
        trans_dict[tag] = {}
        for next_tag in tag_set:
            trans_dict[tag][next_tag] = 0

    for sentence in data:
        # strip 去掉收尾空格和换行
        sentence = sentence.strip().split(' ')
        # print(sentence)
        for index in range(len(sentence)):
            word = sentence[index]
            # 注:使用/切分 是有问题的 比如 吨/日/q 需要合并
            word = word.split('/')
            word_tag = word[-1]
            # print(word_tag)
            if index != len(sentence) - 1:
                next_tag = sentence[index + 1].split('/')[-1]
                try:
                    trans_dict[word_tag][next_tag] += 1
                except:
                    trans_dict[word_tag][next_tag] = 1

    # print(trans_dict)
    # print()
    # print()

    # 计算概率
    # emit_prob deepcopy from emit_dict
    for tag in trans_dict:
        # print("tag: ", tag) # 遇到问题 拟声词o在划分数据集时 经常划分不到训练集中 导致这里报错

        values = trans_dict[tag].values()
        # print(values)
        # print("tag all_emit_words_seen: ", tag, sum(values))
        one_tag_trans_num = sum(values)
        # for rag o
        if one_tag_trans_num == 0:
            for next_tag in trans_dict[tag]:
                trans_dict[tag][next_tag] = trans_smooth_prob
        else:
            for next_tag in trans_dict[tag]:
                # print(next_tag)
                trans_dict[tag][next_tag] /= one_tag_trans_num
                if trans_dict[tag][next_tag] == 0:
                    trans_dict[tag][next_tag] = trans_smooth_prob

    # print(trans_dict)
    store_dict(trans_dict, "trans_prob_new" + suffix + ".pkl")
Exemple #5
0
def calc_start_prob(data):
    start_tag_list = []
    for sentence in data:
        sentence = sentence.strip()
        start_word = sentence.split(' ')[0]
        start_tag = start_word.split('/')[-1]
        start_tag_list.append(start_tag)
    start_number = len(start_tag_list)

    count = Counter(start_tag_list)
    start_count_dict = dict(count)
    for key in start_count_dict:
        start_count_dict[key] /= start_number

    # 对0概率的tag进行简单平滑
    for tag in tag_set:
        try:
            start_count_dict[tag]
        except:
            start_count_dict[tag] = start_smooth_prob

    store_dict(start_count_dict, "start_prob" + suffix + ".pkl")
Exemple #6
0
def calc_trans_prob_simple_smooth(data):

    trans_dict = {}
    # init trans_dict
    for tag in tag_set:
        trans_dict[tag] = {}
        for next_tag in tag_set:
            trans_dict[tag][next_tag] = 0

    for sentence in data:
        # strip 去掉收尾空格和换行
        sentence = sentence.strip().split(' ')
        for index in range(len(sentence)):
            word = sentence[index]
            # 注:使用/切分 是有问题的 比如 吨/日/q 需要合并
            word = word.split('/')
            word_tag = word[-1]
            if index != len(sentence) - 1:
                next_tag = sentence[index + 1].split('/')[-1]
                try:
                    trans_dict[word_tag][next_tag] += 1
                except:
                    trans_dict[word_tag][next_tag] = 1

    # 计算概率
    for tag in trans_dict:
        values = trans_dict[tag].values()
        one_tag_trans_num = sum(values)
        if one_tag_trans_num == 0:
            for next_tag in trans_dict[tag]:
                trans_dict[tag][next_tag] = trans_smooth_prob
        else:
            for next_tag in trans_dict[tag]:
                trans_dict[tag][next_tag] /= one_tag_trans_num
                if trans_dict[tag][next_tag] == 0:
                    trans_dict[tag][next_tag] = trans_smooth_prob

    store_dict(trans_dict, "trans_prob_new" + suffix + ".pkl")
Exemple #7
0
def calc_emit_prob_add_one(data):

    # 使用二维dict 存储emit计数
    # 这里是从隐含状态到观测状态的dict
    emit_dict = {}
    # init emit_dict
    for tag in tag_set:
        emit_dict[tag] = {}
    for sentence in data:
        sentence = sentence.strip().split(' ')
        for word in sentence:
            # 注:使用/切分 是有问题的 比如 吨/日/q 需要合并
            word = word.split('/')
            combine_word = ('/').join(word[:-1])
            try:
                emit_dict[word[-1]][combine_word] += 1
            except:
                emit_dict[word[-1]][combine_word] = 1

    # 计算概率
    for tag in emit_dict:
        values = emit_dict[tag].values()
        # 每一个tag的计数
        one_tag_stats_num = sum(values)
        # 添加对tag中所有不重复单词的计数
        one_tag_word_cnt = len(emit_dict[tag]) + 1
        # 加1平滑的分母
        smooth_num = one_tag_stats_num + one_tag_word_cnt
        # 对所有集外词/未登陆词 平滑后的概率
        smooth_prob_for_oov = 1 / smooth_num
        emit_dict[tag]['oov'] = smooth_prob_for_oov

        for key in emit_dict[tag]:
            emit_dict[tag][key] += 1
            emit_dict[tag][key] /= smooth_num
    print(emit_dict)
    store_dict(emit_dict, "emit_prob" + suffix + ".pkl")
Exemple #8
0
def calc_emit_prob(data):
    # 使用二维dict 存储emit计数
    # 这里是从隐含状态到观测状态的dict
    emit_dict = {}
    # init emit_dict
    for tag in tag_set:
        emit_dict[tag] = {}
    for sentence in data:
        sentence = sentence.strip().split(' ')
        for word in sentence:
            # 注:使用/切分 是有问题的 比如 吨/日/q 需要合并
            word = word.split('/')
            # print(word)
            combine_word = ('/').join(word[:-1])
            # print(combine_word)
            # print(word)
            try:
                emit_dict[word[-1]][combine_word] += 1
            except:
                emit_dict[word[-1]][combine_word] = 1
    # print(emit_dict)

    # 计算概率
    # emit_prob deepcopy from emit_dict
    for tag in emit_dict:
        values = emit_dict[tag].values()
        # print(values)
        # print("tag all_emit_words_seen: ", tag, sum(values))
        # 每一个tag的计数
        one_tag_stats_num = sum(values)
        # 添加对tag中所有不重复单词的计数
        # one_tag_word_cnt = len(emit_dict[tag])
        for key in emit_dict[tag]:
            # print(key)
            emit_dict[tag][key] /= one_tag_stats_num
    # print(emit_dict)
    store_dict(emit_dict, "emit_prob" + suffix + ".pkl")
Exemple #9
0
def calc_trans_prob_gt_smoothing(data):

    tag_set_size = len(tag_set)

    trans_dict = {}
    # init trans_dict
    for tag in tag_set:
        trans_dict[tag] = {}
        for next_tag in tag_set:
            trans_dict[tag][next_tag] = 0

    for sentence in data:
        # strip 去掉收尾空格和换行
        sentence = sentence.strip().split(' ')
        for index in range(len(sentence)):
            word = sentence[index]
            # 注:使用/切分 是有问题的 比如 吨/日/q 需要合并
            word = word.split('/')
            word_tag = word[-1]
            if index != len(sentence) - 1:
                next_tag = sentence[index + 1].split('/')[-1]
                try:
                    trans_dict[word_tag][next_tag] += 1
                except:
                    trans_dict[word_tag][next_tag] = 1

    # debug
    all_nr_cnt = 0
    '''
    for tag in trans_dict:
        values = trans_dict[tag].values()
        one_tag_stats_num = sum(values)

        # 计算最大转移的最大计数
        max_cnt = max(values)
        gt_number_dict = {i:[] for i in range(max_cnt+1)}
        # 计算转移一次的list
        for key in trans_dict[tag]:
            gt_number_dict[trans_dict[tag][key]].append(key)

        # print(gt_number_dict)
        gt_nr_list = [0 for i in range(max_cnt+1)]

        nr_cnt = 0
        for index in gt_number_dict:
            gt_nr_list[index] = len(gt_number_dict[index]) # len of list
            nr_cnt += gt_nr_list[index] * index

        # nr_cnt = tag_set_size - gt_nr_list[0]

        # paint to explore
        print("example tag: ", tag)
        print(gt_nr_list)
        print(nr_cnt)
        all_nr_cnt += nr_cnt

        # 完全平滑 左0 都不平滑
        # r or nr?
        # r是出现次数 nr是对出现次数为r进行计数 所以频率应该以r/N
        # 平滑之后r变为dr 频率就变为dr/N
        # using gt smoothing
        dr = [0 for i in range(max_cnt+1)]

        dr[max_cnt] = max_cnt / nr_cnt
        for i in range(1, max_cnt):
            if gt_nr_list[i] != 0 and gt_nr_list[i+1] != 0:
                temp_dr = (i+1) * gt_nr_list[i+1] / gt_nr_list[i]
                dr[i] = temp_dr / nr_cnt
            else:
                gt_nr_list[i] = i / nr_cnt
        remain_prob = 1 - sum(dr[1:])
        n0 = gt_nr_list[0]
        dr[0]= remain_prob /n0
        if dr[0] < 0:
            print(dr)
            print("panic")
            exit()


    print(all_nr_cnt)
    print(dr)

        # 归一化
    exit()
    '''
    ###
    # 计算概率
    # emit_prob deepcopy from emit_dict
    # debug:
    sum_prob_all = []
    using_gt_tag_num = 0
    for tag in trans_dict:
        values = trans_dict[tag].values()
        # print(values)
        # print("tag all_emit_words_seen: ", tag, sum(values))
        one_tag_trans_num = sum(values)
        if one_tag_trans_num < 100000:
            using_gt_tag_num += 1
            for next_tag in trans_dict[tag]:
                # print(next_tag)
                trans_dict[tag][next_tag] /= one_tag_trans_num
                # add simple smooth for 0 value
                if trans_dict[tag][next_tag] == 0:
                    trans_dict[tag][next_tag] = trans_smooth_prob

            # print(trans_dict)
            # store_dict(trans_dict, "trans_prob_new" + suffix + ".pkl")
        else:
            # gt smooth
            # 计算最大转移的最大计数
            max_cnt = max(values)
            gt_number_dict = {i: [] for i in range(max_cnt + 1)}
            # 计算转移一次的list
            for key in trans_dict[tag]:
                gt_number_dict[trans_dict[tag][key]].append(key)

            # print("gt_number_dict: ", gt_number_dict)
            gt_nr_list = [0 for i in range(max_cnt + 1)]

            nr_cnt = 0
            for index in gt_number_dict:
                gt_nr_list[index] = len(gt_number_dict[index])  # len of list
                nr_cnt += gt_nr_list[index] * index

            # nr_cnt = tag_set_size - gt_nr_list[0]

            # paint to explore
            print("example tag: ", tag)
            print(gt_nr_list)
            print(nr_cnt)
            all_nr_cnt += nr_cnt

            # 完全平滑 左0 都不平滑
            # r or nr?
            # r是出现次数 nr是对出现次数为r进行计数 所以频率应该以r/N
            # 平滑之后r变为dr 频率就变为dr/N
            # using gt smoothing
            dr = [0 for i in range(max_cnt + 1)]

            dr[max_cnt] = max_cnt / nr_cnt
            for i in range(1, max_cnt):
                if gt_nr_list[i] != 0 and gt_nr_list[i + 1] != 0:
                    temp_dr = (i + 1) * gt_nr_list[i + 1] / gt_nr_list[i]
                    dr[i] = temp_dr / nr_cnt
                else:
                    gt_nr_list[i] = i / nr_cnt
            remain_prob = 1 - sum(dr[1:])
            n0 = gt_nr_list[0]
            dr[0] = remain_prob / n0
            if dr[0] < 0:
                print(dr)
                print("panic")
                exit()
            print(all_nr_cnt)
            print(dr)

            # to trans_dict
            # index as key
            for index in gt_number_dict:
                for next_tag in gt_number_dict[index]:
                    trans_dict[tag][next_tag] = dr[index]

            # 归一化
            sum_prob = 0
            for i in range(max_cnt + 1):
                sum_prob += gt_nr_list[i] * dr[i]
            print("sum_prob: ", sum_prob)

            for index in gt_number_dict:
                for next_tag in gt_number_dict[index]:
                    trans_dict[tag][next_tag] = dr[index] / sum_prob

            sum_prob_all.append(sum_prob)

        # end if else
    # end for
    print("using_gt_tag_num: ", using_gt_tag_num)
    # exit()
    print(trans_dict)
    print("sum_prob_all: ", sum_prob_all)
    # exit()
    store_dict(trans_dict, "trans_prob_new" + suffix + ".pkl")