Beispiel #1
0
def main():
    """Build vocabularies and pre-trained embedding matrices from config.yml.

    Reads ./config.yml, writes one vocabulary file per feature (plus the
    label vocabulary) via build_vocabulary, then pickles an embedding matrix
    for every feature that has a pre-trained vector file configured.
    """
    print('preprocessing...')

    # Load the configuration file.
    with open('./config.yml') as file_config:
        # safe_load: yaml.load without an explicit Loader is deprecated and
        # can construct arbitrary Python objects from untrusted input.
        config = yaml.safe_load(file_config)

    # Build vocabularies: one per input feature plus the label column.
    feature_names = config['model_params']['feature_names']
    columns = feature_names + ['label']
    min_counts_dict, path_vocs_dict = defaultdict(int), dict()
    for feature_name in feature_names:
        voc_params = config['data_params']['voc_params'][feature_name]
        min_counts_dict[feature_name] = voc_params['min_count']
        path_vocs_dict[feature_name] = voc_params['path']
    path_vocs_dict['label'] = \
        config['data_params']['voc_params']['label']['path']
    build_vocabulary(path_data=config['data_params']['path_train'],
                     columns=columns,
                     min_counts_dict=min_counts_dict,
                     path_vocs_dict=path_vocs_dict)

    # Build an embedding matrix for every feature that has pre-trained vectors.
    for feature_name in feature_names:
        embed_params = config['model_params']['embed_params'][feature_name]
        path_pre_train = embed_params['path_pre_train']
        if not path_pre_train:
            continue  # no pre-trained vectors configured for this feature
        path_pkl = embed_params['path']
        path_voc = config['data_params']['voc_params'][feature_name]['path']
        with open(path_voc, 'rb') as file_r:
            voc = pickle.load(file_r)
        embedding_dict, vec_dim = load_embed_from_txt(path_pre_train)
        # +1 row: presumably index 0 is reserved for padding -- confirm.
        embedding_matrix = np.zeros((len(voc) + 1, vec_dim), dtype='float32')
        for item in voc:
            if item in embedding_dict:
                embedding_matrix[voc[item], :] = embedding_dict[item]
            else:
                # Words missing from the pre-trained file get small random
                # uniform vectors.
                embedding_matrix[voc[item], :] = np.random.uniform(
                    -0.25, 0.25, size=vec_dim)
        with open(path_pkl, 'wb') as file_w:
            pickle.dump(embedding_matrix, file_w)

    print('all done!')
def main():
    """Build vocabularies/embeddings and write feature shapes back to config1.yml.

    Same pipeline as the plain variant, but additionally records each
    feature's embedding dimension and rewrites config1.yml with the
    resulting [vocab_size, dim] shapes and the number of classes.
    """
    print('preprocessing...')

    # Load the configuration file.
    with open('./config1.yml') as file_config:
        # safe_load: yaml.load without an explicit Loader is deprecated and
        # unsafe on untrusted input.
        config = yaml.safe_load(file_config)

    # Build vocabularies (also returns vocab sizes and the max sequence length).
    feature_names = config['model_params']['feature_names']
    columns = feature_names + ['label']
    min_counts_dict, path_vocs_dict = defaultdict(int), dict()
    for feature_name in feature_names:
        voc_params = config['data_params']['voc_params'][feature_name]
        min_counts_dict[feature_name] = voc_params['min_count']
        path_vocs_dict[feature_name] = voc_params['path']
    path_vocs_dict['label'] = \
        config['data_params']['voc_params']['label']['path']
    voc_sizes, sequence_length = build_vocabulary(
        path_data=config['data_params']['path_train'], columns=columns,
        min_counts_dict=min_counts_dict, path_vocs_dict=path_vocs_dict)

    # Build an embedding matrix per feature with pre-trained vectors.
    feature_dim_dict = dict()  # embedding dim per feature
    for i, feature_name in enumerate(feature_names):
        embed_params = config['model_params']['embed_params'][feature_name]
        path_pre_train = embed_params['path_pre_train']
        if not path_pre_train:
            # Defaults when no pre-trained vectors exist: 64 for the first
            # feature, 32 for the rest.
            feature_dim_dict[feature_name] = 64 if i == 0 else 32
            continue
        path_pkl = embed_params['path']
        path_voc = config['data_params']['voc_params'][feature_name]['path']
        with open(path_voc, 'rb') as file_r:
            voc = pickle.load(file_r)
        embedding_dict, vec_dim = load_embed_from_txt(path_pre_train)
        feature_dim_dict[feature_name] = vec_dim
        # +1 row: presumably index 0 is reserved for padding -- confirm.
        embedding_matrix = np.zeros((len(voc) + 1, vec_dim), dtype='float32')
        for item in voc:
            if item in embedding_dict:
                embedding_matrix[voc[item], :] = embedding_dict[item]
            else:
                # Random init for words missing from the pre-trained file.
                embedding_matrix[voc[item], :] = np.random.uniform(
                    -0.25, 0.25, size=vec_dim)
        with open(path_pkl, 'wb') as file_w:
            pickle.dump(embedding_matrix, file_w)

    # Update feature shapes in the config; the last voc size is the label set.
    label_size = voc_sizes[-1]
    voc_sizes = voc_sizes[:-1]
    config['model_params']['nb_classes'] = label_size + 1
    # The original branched on the feature index here, but both branches were
    # identical -- a single assignment is equivalent.
    for i, feature_name in enumerate(feature_names):
        config['model_params']['embed_params'][feature_name]['shape'] = \
            [voc_sizes[i] + 1, feature_dim_dict[feature_name]]
    # Persist the updated config.
    with codecs.open('./config1.yml', 'w', encoding='utf-8') as file_w:
        yaml.dump(config, file_w)

    print('all done!')
def main():
    """Preprocess data: build vocabularies and embedding matrices, update config.yml.

    Loads ./config.yml, collects per-feature vocabulary parameters (names
    f1/f2/label/char are fixed by convention), builds embedding matrices for
    every feature that has pre-trained vectors, and rewrites config.yml with
    the resulting shapes, class count, and sequence length.
    """
    logger.info('preprocessing...')
    useable = []  # features that actually have pre-trained embeddings
    # Load the configuration file.
    with open('./config.yml', encoding="utf-8") as file_config:
        # safe_load: yaml.load without an explicit Loader is deprecated and
        # unsafe on untrusted input.
        config = yaml.safe_load(file_config)

    # Vocabulary setup.  Input features are [f1] or [f1, f2]:
    # f1 = word/character tokens, f2 = part-of-speech tags; 'label' is the
    # prediction target.  These names are fixed and must not be changed.
    columns = config['model_params']['feature_names'] + ['label']
    # Minimum-frequency filters and output paths for the numbered vocabularies.
    min_counts_dict, path_vocs_dict = defaultdict(int), dict()
    feature_names = config['model_params']['feature_names']  # input features
    logger.info("feature_names: " + str(feature_names))
    for feature_name in feature_names:
        voc_params = config['data_params']['voc_params'][feature_name]
        min_counts_dict[feature_name] = voc_params['min_count']
        path_vocs_dict[feature_name] = voc_params['path']
    # Output path for the label vocabulary.
    path_vocs_dict['label'] = config['data_params']['voc_params']['label'][
        'path']
    logger.info("min_count: " + str(min_counts_dict))
    logger.info(path_vocs_dict)

    # Char feature -- the name 'char' is likewise fixed.
    min_counts_dict['char'] = config['data_params']['voc_params']['char'][
        'min_count']
    path_vocs_dict['char'] = config['data_params']['voc_params']['char'][
        'path']

    # Percentile of sentence lengths the padded length must cover.
    sequence_len_pt = config['model_params']['sequence_len_pt']
    # Whether this is English text using character-level features.
    use_char_feature = config['model_params']['use_char_feature']
    # Percentile controlling the per-word character length (English text).
    word_len_pt = config['model_params']['word_len_pt']

    # NOTE(review): the original build_vocabulary() call is bypassed here --
    # vocabularies come from get_voc_dict/get_tag_dict over hard-coded data
    # paths instead, and the sequence length is fixed at 128.  As a result
    # sequence_len_pt/word_len_pt above are currently unused.
    voc_sizes = [
        get_voc_dict(["../data/train.txt", "../data/test.txt"], 2),
        get_tag_dict("../data/lstm_crf/train.txt", 1)
    ]
    lengths = [128]
    logger.info(voc_sizes)
    if not use_char_feature:
        sequence_length = lengths[0]  # padded sentence length
    else:
        # Char mode additionally needs the per-word character length.
        sequence_length, word_length = lengths[:]

    # Build an embedding matrix for each feature with pre-trained vectors;
    # these initialise the model's embedding (word2vec) layer.
    logger.info("get feature pre_train matrix...")
    feature_dim_dict = dict()  # embedding dim per feature
    for feature_name in feature_names:
        logger.info("feature: " + feature_name)
        path_pre_train = config['model_params']['embed_params'][feature_name][
            'path_pre_train']
        if not path_pre_train:
            # No pre-trained weights: mark the feature unusable with a None
            # dim.  (The original branched on the feature index with two
            # identical None assignments -- dead code, collapsed here.)
            feature_dim_dict[feature_name] = None
            continue
        useable.append(feature_name)
        config['model_params']['embed_params'][feature_name][
            'path'] = "../data/lstm_crf/%s_embed.mat.pkl" % feature_name

        # Vocabulary file (token -> index) for this feature.
        path_voc = config['data_params']['voc_params'][feature_name]['path']
        with open(path_voc, 'rb') as file_r:
            # Index dictionary built from the training data.
            voc = pickle.load(file_r)
            logger.info("编号词典:%s " % voc)

        logger.info("将构建的voc,与训练好的embedding结合整理出word2vec的初始化矩阵: " +
                    feature_name)
        # Load the pre-trained embedding vectors (word -> vector, plus dim).
        embedding_dict, vec_dim = load_embed_from_txt(path_pre_train)

        feature_dim_dict[feature_name] = vec_dim  # embedding dim per feature
        # +2 rows: row 0 is the padding row, row 1 the unknown-word row.
        embedding_matrix = np.zeros((len(voc) + 2, vec_dim),
                                    dtype='float32')
        # NOTE(review): raises KeyError if config["unknow_word"] is absent
        # from the pre-trained vectors -- confirm that is intended.
        embedding_matrix[1, :] = embedding_dict[config["unknow_word"]]
        for item in voc:
            if item in embedding_dict:
                embedding_matrix[voc[item], :] = embedding_dict[item]
            else:
                # Word is in voc but not in the pre-trained file (OOV):
                # random init.
                logger.info("训练好的embedding中没有找到的词汇:%s" % item)
                embedding_matrix[voc[item], :] = np.random.uniform(
                    -0.25, 0.25, size=vec_dim)
        with open(config['model_params']['embed_params'][feature_name]['path'],
                  'wb') as file_w:
            pickle.dump(embedding_matrix, file_w)

    # Update feature shapes in the config.
    if use_char_feature:
        char_voc_size = voc_sizes.pop(0)
    label_size = voc_sizes[-1]   # last entry is the label vocabulary size
    voc_sizes = voc_sizes[:-1]   # remaining entries belong to input features
    # Number of output classes.
    config['model_params']['nb_classes'] = label_size
    # [vocab size, embedding dim] per feature.
    for i, feature_name in enumerate(feature_names):
        config['model_params']['embed_params'][feature_name]['shape'] = [
            voc_sizes[i], feature_dim_dict[feature_name]
        ]
    if use_char_feature:
        # Char embedding is fixed at 16 dims (adjust per task); the char
        # matrix itself (char_embed.pkl) is expected to exist already.
        config['model_params']['embed_params']['char']['shape'] = [
            char_voc_size, 16
        ]
        config['model_params']['word_length'] = word_length
    # Padded sentence length used by the model.
    config['model_params']['sequence_length'] = sequence_length
    # Keep only the features that actually had pre-trained embeddings.
    config['model_params']['feature_names'] = useable
    # Write the updated config back.
    with codecs.open('./config.yml', 'w', encoding='utf-8') as file_w:
        yaml.dump(config, file_w)
    logger.info('preprocessing successfully!')
Beispiel #4
0
def main():
    """Build vocabularies/embeddings (with optional char features), update config.yml.

    Extends the base pipeline with character-level vocabulary parameters:
    build_vocabulary returns vocab sizes plus one or two lengths depending on
    use_char_feature, and the resulting shapes, class count, and lengths are
    written back into config.yml.
    """
    print('preprocessing...')

    # Load the configuration file.
    with open('./config.yml') as file_config:
        # safe_load: yaml.load without an explicit Loader is deprecated and
        # unsafe on untrusted input.
        config = yaml.safe_load(file_config)

    # Build vocabularies (also returns vocab sizes and sequence lengths).
    feature_names = config['model_params']['feature_names']
    columns = feature_names + ['label']
    min_counts_dict, path_vocs_dict = defaultdict(int), dict()
    for feature_name in feature_names:
        voc_params = config['data_params']['voc_params'][feature_name]
        min_counts_dict[feature_name] = voc_params['min_count']
        path_vocs_dict[feature_name] = voc_params['path']
    path_vocs_dict['label'] = \
        config['data_params']['voc_params']['label']['path']

    # Char-level feature parameters.
    min_counts_dict['char'] = config['data_params']['voc_params']['char'][
        'min_count']
    path_vocs_dict['char'] = config['data_params']['voc_params']['char'][
        'path']

    sequence_len_pt = config['model_params']['sequence_len_pt']
    use_char_feature = config['model_params']['use_char_feature']
    word_len_pt = config['model_params']['word_len_pt']
    # NOTE(review): 'use_char_featrue' is misspelled, but it must match
    # build_vocabulary's actual parameter name -- fix both together.
    voc_sizes, lengths = build_vocabulary(
        path_data=config['data_params']['path_train'],
        columns=columns,
        min_counts_dict=min_counts_dict,
        path_vocs_dict=path_vocs_dict,
        sequence_len_pt=sequence_len_pt,
        use_char_featrue=use_char_feature,
        word_len_pt=word_len_pt)
    if not use_char_feature:
        sequence_length = lengths[0]
    else:
        # Char mode additionally yields the per-word character length.
        sequence_length, word_length = lengths[:]

    # Build an embedding matrix per feature with pre-trained vectors.
    feature_dim_dict = dict()  # embedding dim per feature
    for i, feature_name in enumerate(feature_names):
        embed_params = config['model_params']['embed_params'][feature_name]
        path_pre_train = embed_params['path_pre_train']
        if not path_pre_train:
            # Defaults when no pre-trained vectors exist: 64 for the first
            # feature, 32 for the rest.
            feature_dim_dict[feature_name] = 64 if i == 0 else 32
            continue
        path_pkl = embed_params['path']
        path_voc = config['data_params']['voc_params'][feature_name]['path']
        with open(path_voc, 'rb') as file_r:
            voc = pickle.load(file_r)
        embedding_dict, vec_dim = load_embed_from_txt(path_pre_train)
        feature_dim_dict[feature_name] = vec_dim
        # +2 rows: presumably padding and unknown-word slots -- confirm.
        embedding_matrix = np.zeros((len(voc) + 2, vec_dim), dtype='float32')
        for item in voc:
            if item in embedding_dict:
                embedding_matrix[voc[item], :] = embedding_dict[item]
            else:
                # Random init for words missing from the pre-trained file.
                embedding_matrix[voc[item], :] = np.random.uniform(
                    -0.25, 0.25, size=vec_dim)
        with open(path_pkl, 'wb') as file_w:
            pickle.dump(embedding_matrix, file_w)

    # Update feature shapes in the config.
    if use_char_feature:
        char_voc_size = voc_sizes.pop(0)
    label_size = voc_sizes[-1]   # last entry is the label vocabulary size
    voc_sizes = voc_sizes[:-1]
    config['model_params']['nb_classes'] = label_size + 1
    # The original branched on the feature index with two identical branches;
    # a single assignment is equivalent.
    for i, feature_name in enumerate(feature_names):
        config['model_params']['embed_params'][feature_name]['shape'] = \
            [voc_sizes[i], feature_dim_dict[feature_name]]
    if use_char_feature:
        # Char embedding is fixed at 16 dims; adjust per task.
        config['model_params']['embed_params']['char']['shape'] = \
            [char_voc_size, 16]
        config['model_params']['word_length'] = word_length
    config['model_params']['sequence_length'] = sequence_length
    # Persist the updated config.
    with codecs.open('./config.yml', 'w', encoding='utf-8') as file_w:
        yaml.dump(config, file_w)

    print('all done!')