Example 1
def prepare_dataset_ner(sentences,
                        char_to_id_ner,
                        tag_to_id_ner,
                        lower=False,
                        train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """

    none_index = tag_to_id_ner["O"]
    num = 0

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        num += 1
        string = [w[0] for w in s]
        chars = [
            char_to_id_ner[f(w) if f(w) in char_to_id_ner else '<UNK>']
            for w in string
        ]
        segs = get_seg_features("".join(string))
        if train:
            tags = [tag_to_id_ner[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        data.append([string, chars, segs, tags])
    return data
Example 2
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """

    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x
    data = []
    for s in sentences:
        # string: the characters of one sentence
        string = [w[0] for w in s]
        # chars: list of the dictionary index of each character in the sentence
        chars = [char_to_id[f(w) if f(w) in char_to_id else '<UNK>']
                 for w in string]
        # segs: segmentation features from jieba; 0 = single-character word, 1/2/3 = start/middle/end of a multi-character word
        segs = get_seg_features("".join(string))
        # tags: list of the tag-dictionary index of the target tag for each character
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        data.append([string, chars, segs, tags])
    return data
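The `get_seg_features` helper called by all of these examples is not shown here. Based on the comments above (0 for a single-character word, 1/2/3 for the start/middle/end of a jieba word), the following is a minimal sketch of what such a helper could look like; the function name and the exact encoding are assumptions, not the original implementation:

import jieba

def get_seg_features_sketch(sentence):
    """Per-character word-boundary features from jieba:
    0 = single-char word, 1 = word start, 2 = word middle, 3 = word end."""
    features = []
    for word in jieba.cut(sentence):
        if len(word) == 1:
            features.append(0)
        else:
            features.extend([1] + [2] * (len(word) - 2) + [3])
    return features

# e.g. get_seg_features_sketch("入院情况") could return [1, 3, 1, 3]
# if jieba splits the text into two 2-character words.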
Example 3
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - char list of sentences
        - char indexes
        - segment tag indexes
        - tag indexes (if the IOBES tagset is used, I-XXX is replaced by E-XXX)
    """

    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x
    data = []
    for s in sentences:
        string = [w[0] for w in s]
        chars = [char_to_id[f(w) if f(w) in char_to_id else '<UNK>']
                 for w in string]
        segs = get_seg_features("".join(string))
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        data.append([string, chars, segs, tags])

    return data
Example 4
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """

    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        string = [w[0] for w in s]
        chars = [
            char_to_id[f(w) if f(w) in char_to_id else '<UNK>'] for w in string
        ]
        segs = get_seg_features("".join(string))
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        data.append({
            "string": string,
            "chars": chars,
            "segs": segs,
            "tags": tags
        })

    return data
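Unlike the list-based variants, this version appends one dict per sentence, so downstream code can unpack fields by name instead of by position. A small hedged usage sketch with a hand-built toy sample in the same shape (illustrative only):

# toy sample shaped like the dict output of the prepare_dataset above (values are made up)
dataset = [{"string": ["入", "院"], "chars": [2, 3], "segs": [1, 3], "tags": [0, 0]}]

for sample in dataset:
    chars, segs, tags = sample["chars"], sample["segs"], sample["tags"]
    assert len(chars) == len(segs) == len(tags)  # per-character features stay aligned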
Example 5
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - [0]text: cut by char
        - [1]char indexes
        - [2]word length
        - [3]tag indexes:
    """

    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        #print(sentences)
        string = [w[0] for w in s]
        chars = [
            char_to_id[f(w) if f(w) in char_to_id else '<UNK>'] for w in string
        ]
        segs = get_seg_features("".join(string))
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        data.append([string, chars, segs, tags])

    return data
Example 6
def prepare_dataset(sentences, char_to_id, tag_to_id, train=True):

    none_index = tag_to_id['O']
    data = []

    for s in sentences:
        string, entity_types, entity_subtype = [], [], []
        tags, dep_rels, dep_word_idx = [], [], []
        for w in s:
            if w[0] != "...":
                string.append(w[0])  # token --> sentence
                entity_types.append(w[2])
                entity_subtype.append(w[3])
                tags.append(w[4])
                dep_rels.append(w[5])
                dep_word_idx.append(w[-1])
        if len(string) > 4:
            chars = [
                char_to_id[w if w in char_to_id else '<UNK>'] for w in string
            ]
            types = get_seg_features(string, entity_types)  # convert to id
            subtypes = get_sub_features(string,
                                        entity_subtype)  # convert to id
            dep_rel_features = get_dep_features(string, dep_rels)
            if train:
                tags = [tag_to_id[w] for w in tags]
            else:
                tags = [none_index for _ in chars]
            data.append([
                string, chars, types, subtypes, tags, dep_rel_features,
                dep_word_idx
            ])
    return data
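This variant reads several extra columns from each token row (entity type, subtype, tag, dependency relation, dependency head index). A purely illustrative sketch of the row layout the loop above assumes; the field values and the meaning of w[1] are guesses, not from the source:

# one hypothetical token row as indexed by the loop above (illustrative only)
w = ["Obama",       # w[0]  the token itself
     "col1",        # w[1]  not used by this loop
     "PER",         # w[2]  entity type
     "Individual",  # w[3]  entity subtype
     "B-PER",       # w[4]  gold tag
     "nsubj",       # w[5]  dependency relation
     3]             # w[-1] index of the dependency head word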
Example 7
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """

    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x
    data = []
    for s in sentences:
        string = [w[0] for w in s]
        chars = [char_to_id[f(w) if f(w) in char_to_id else '<UNK>']
                 for w in string]
        segs = get_seg_features("".join(string))
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        data.append([string, chars, segs, tags])

    return data
Example 8
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset for training.
    :return: list of lists containing word ids, char (character) ids, and tag (label) ids.
    """

    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        # print(sentences)
        string = [w[0] for w in s]
        chars = [char_to_id[f(w) if f(w) in char_to_id else '<UNK>']
                 for w in string]
        segs = get_seg_features("".join(string))
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        data.append([string, chars, segs, tags])

    return data
Example 9
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """

    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        string = [w[0] for w in s]  # Origin word list
        chars = [
            char_to_id[f(w) if f(w) in char_to_id else '<UNK>'] for w in string
        ]  # ids of each (lowercased) character in the sentence
        segs = get_seg_features(
            "".join(string))  # list contains word groups cut by jieba
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        data.append([string, chars, segs, tags])

    return data
Example 10
def prepare_dataset(sentences, char_to_id, tag_to_id, pos_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """

    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x
    data = []
    for s in sentences:
        string = [w[0] for w in s]
        chars = [char_to_id[f(w) if f(w) in char_to_id else '<UNK>']
                 for w in string]
        segs = get_seg_features("".join(string))
        # pos = get_pos_features("".join(string))
        if train:
            # print(s)
            poss = [pos_to_id[w[-2] if w[-2] in pos_to_id else '<UNK>'] for w in s]
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            poss = [pos_to_id[w[-2] if w[-2] in pos_to_id else '<UNK>'] for w in s]
            tags = [none_index for _ in chars]
        data.append([string, chars, segs, poss, tags])

    return data
Example 11
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, test=False):
    """
    Convert the text samples and labels to indexes so they can be fed to the model.
    In principle <start> and <end> would be added around every sample and label,
    but the pytorch-crf package adds the <start>/<end> transition probabilities
    automatically, so we do not need to add them by hand.
    """
    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:

        chars = [w[0] for w in s]
        tags = [w[-1] for w in s]
        """ 句子转化为index """
        chars_idx = [
            char_to_id[f(c) if f(c) in char_to_id else '<unk>'] for c in chars
        ]
        """ 对句子分词,构造词的长度特征 """
        segs_idx = get_seg_features("".join(chars))

        if not test:
            tags_idx = [tag_to_id[t] for t in tags]

        else:
            tags_idx = [tag_to_id["<pad>"] for _ in tags]

        assert len(chars_idx) == len(segs_idx) == len(tags_idx)
        data.append([chars, chars_idx, segs_idx, tags_idx])

    return data
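The docstring above leans on pytorch-crf to supply the <start>/<end> transitions automatically. A minimal, hedged sketch of how the prepared tag indexes could be consumed by that package; the emissions tensor, tag values and sizes are illustrative, not taken from this code:

import torch
from torchcrf import CRF  # the pytorch-crf package mentioned in the docstring

num_tags = 10                                   # illustrative size of tag_to_id
crf = CRF(num_tags, batch_first=True)

# emissions would normally come from an encoder (e.g. a BiLSTM) over chars_idx/segs_idx
emissions = torch.randn(1, 5, num_tags)         # (batch, seq_len, num_tags)
tags_idx = torch.tensor([[0, 1, 2, 0, 0]])      # tag indexes of one prepared sentence
mask = torch.ones(1, 5, dtype=torch.bool)

loss = -crf(emissions, tags_idx, mask=mask)     # negative log-likelihood; no manual <start>/<end>
best_paths = crf.decode(emissions, mask=mask)   # best tag-index sequence per sentence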
Example 12
def prepare_dataset(sentences, char_to_id, tag_to_id, train=True):
    none_index = tag_to_id['O']
    data = []

    import json
    doc_file = "doc_dict.utf8"
    with codecs.open(doc_file, 'r', 'utf-8') as f:
        data_doc = f.readlines()
        doc_dict = json.loads(data_doc[0])

    for s in sentences:
        string, doc_id, entity_types = [], [], []
        entity_subtype, tags = [], []
        for w in s:
            if w[0] != "...":
                string.append(w[0])
                doc_id.append(w[1])
                entity_types.append(w[-3])
                entity_subtype.append(w[-2])
                tags.append(w[-1])
        if len(string) > 4:
            chars = [
                char_to_id[w if w in char_to_id else '<UNK>'] for w in string
            ]
            doc_chars = get_doc_features(doc_id, char_to_id, doc_dict, chars)
            types = get_seg_features(string, entity_types)
            subtypes = get_sub_features(string, entity_subtype)
            if train:
                tags = [tag_to_id[w] for w in tags]
            else:
                tags = [none_index for _ in chars]
            data.append([string, doc_chars, chars, types, subtypes, tags])
    return data
Example 13
def prepare_dataset_(sentences, char_to_id, tag_to_id, train=True):
    none_index = tag_to_id['O']
    data = []

    import json
    doc_file = "doc.utf8"
    with codecs.open(doc_file, 'r', 'utf-8') as f:
        data_doc = f.readlines()
        doc_dict = json.loads(data_doc[0])

    for s in sentences:
        string = [w[0] for w in s]
        doc_id = [w[1] for w in s]
        chars = [char_to_id[w if w in char_to_id else '<UNK>'] for w in string]
        doc_chars = get_doc_features(doc_id, char_to_id, doc_dict, string)

        entity_tags = [w[-3] for w in s]
        segs = get_seg_features(string, entity_tags)

        entity_subtype = [w[-2] for w in s]
        subtypes = get_sub_features(string, entity_subtype)

        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        data.append([string, doc_chars, chars, segs, subtypes, tags])
    return data
Example 14
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """
    # convert into a trainable data format
    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        #print(sentences)
        string = [w[0] for w in s]
        chars = [
            char_to_id[f(w) if f(w) in char_to_id else '<UNK>'] for w in string
        ]
        segs = get_seg_features("".join(string))
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        data.append([string, chars, segs,
                     tags])  # string: sentence chars, chars: char ids, segs: segmentation ids, tags: tag ids

    return data
Example 15
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Format the data that is fed into the network.
    :param sentences:
    :param char_to_id:
    :param tag_to_id:
    :param lower:
    :param train:
    :return:
    """
    none_index = tag_to_id["O"]

    # lowercase conversion (for English text)
    def to_lower(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        string = [w[0] for w in s]
        # map each char to its vocab id, falling back to <UNK> when missing
        chars = [
            char_to_id[to_lower(w) if to_lower(w) in char_to_id else '<UNK>']
            for w in string
        ]
        # extract segmentation features; the input is the characters joined back into one sentence
        segs = get_seg_features("".join(string))
        # during training the tags are converted as well
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        # append these four inputs
        data.append([string, chars, segs, tags])
    return data
Example 16
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """

    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        string = [w[0] for w in s]  # list of characters in the sentence
        chars = [
            char_to_id[f(w) if f(w) in char_to_id else '<UNK>'] for w in string
        ]  # list of char indexes for the sentence
        segs = get_seg_features("".join(string))  # list of segmentation features
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        # string: raw character list of the sentence
        # chars: id of each char in the sentence
        # segs: per-char mark after segmenting the sentence
        # tags: id of each NER tag
        data.append([string, chars, segs, tags])

    return data
Example 17
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes (the id of each word in the full dictionary)
        - word char indexes
        - tag indexes
    """
    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:  # for each pre-split sentence in the training list
        string = [w[0] for w in s]  # string is the plain words of the sentence, tags removed
        # ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
        chars = [
            char_to_id[f(w) if f(w) in char_to_id else '<UNK>'] for w in string
        ]
        # segs = get_seg_features("".join(string))  # get_seg_features in data_utils.py segments with jieba and produces a length feature for each word
        segs = get_seg_features(
            " ".join(string))  # for English, keep the spaces when joining so the words can be recovered later
        # segs:[1, 3, 1, 3, 0, 0, 1, 3, 1, 3, 0, 0, 1, 3, 1, 3, 0, 1, 3, 0, 1, 2, 3, 0, 1, 2, 2, 3, 0, 0, 1, 3, 1, 3, 0, 0, 1, 2, 2, 3, 0]
        if train:
            tags = [tag_to_id[w[-1]] for w in s]  # tags: id of each gold label in the training set
        else:
            tags = [none_index for _ in chars]
        data.append([
            string, chars, segs, tags
        ])  # output: words, word ids, per-word segmentation features, tag ids
    return data
Example 18
def prepare_dataset(sentences, word_to_id, tag_to_id, train=True):
    """
    Preprocess the data. The returned list contains:
    - word_list
    - word_id_list
    - word char indexes
    - tag_id_list
    :param sentences:
    :param word_to_id:
    :param tag_to_id:
    :param train:
    :return:
    """
    none_index = tag_to_id['O']

    data = []
    for s in sentences:
        word_list = [w[0] for w in s]
        word_id_list = [
            word_to_id[w if w in word_to_id else '<UNK>'] for w in word_list
        ]
        segs = data_utils.get_seg_features("".join(word_list))
        if train:
            tag_id_list = [tag_to_id[w[-1]] for w in s]
        else:
            tag_id_list = [none_index for w in s]
        data.append([word_list, word_id_list, segs, tag_id_list])

    return data
Example 19
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Return:
        - word indexes
        - word char indexes
        - tag indexes
    """
    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        string = [w[0] for w in s]
        chars = [
            char_to_id[f(w) if f(w) in char_to_id else '<UNK>'] for w in string
        ]
        segs = get_seg_features("".join(string))
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        data.append([string, chars, segs, tags])

    return data
Example 20
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Process the sentence sequences of the training and validation sets with the mapping dictionaries, producing the feature lists to feed the model and the gold-label lists.
    :param sentences:
    :param char_to_id:
    :param tag_to_id:
    :param lower:
    :param train: whether the training set or the test set is being processed; the test set is assumed to have no labels, so all tags are set to the "O" index
    :return:
    """
    none_index = tag_to_id["O"]

    # print("none_index:", none_index)

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        string = [w[0] for w in s]
        # print(string)
        chars = [
            char_to_id[f(w) if f(w) in char_to_id else '<UNK>'] for w in string
        ]
        segs = get_seg_features("".join(string))
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
            # print("tags", tags)
        else:
            tags = [none_index for _ in chars]
        # print("chars:", chars)
        data.append([string, chars, segs, tags])
    # print(segs)
    return data
Example 21
def input_from_line(line, char_to_id):
    """
    Take sentence data and return an input for
    the training or the evaluation function.
    """
    line = data_utils.strQ2B(line)
    line = data_utils.replace_html(line)
    inputs = []
    inputs.append([line])
    line = line.replace(" ", "$")  # str.replace returns a new string, so assign the result back
    inputs.append([[
        char_to_id[char] if char in char_to_id else char_to_id[UNK]
        for char in line
    ]])
    inputs.append([data_utils.get_seg_features(line)])
    inputs.append([[]])
    return inputs
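For context, a hedged sketch of how the four-part structure returned by `input_from_line` might be unpacked at inference time; the toy vocabulary is illustrative, and it assumes the `data_utils` helpers and the `UNK` constant referenced above are importable:

char_to_id = {"<PAD>": 0, "<UNK>": 1, "入": 2, "院": 3}   # toy vocabulary (illustrative)

inputs = input_from_line("入院", char_to_id)
raw_line  = inputs[0][0]  # the normalized input string
char_ids  = inputs[1][0]  # character indexes, e.g. [2, 3]
seg_feats = inputs[2][0]  # jieba word-length features for the line
gold_tags = inputs[3][0]  # empty list: no gold tags at inference time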
Example 22
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """
    # The returned data format is a nested list; the four parts per sentence are the character string, the ids of the characters, the jieba word-length features, and the tag id of each character. For example:
    # [[['入', '院', '情', '况', ':', '女', ',', '0', '0', '岁', ',', '以', '突', '发', '言',
    # '语', '不', '清', '0', '天', ',', '加', '重', '0', '天', '入', '院', '。', '入', '院', '情',
    # '况', ':', '患', '者', '以', '腰', '痛', '伴', '双', '下', '肢', '疼', '痛', '半', '年',
    # ',', '加', '重', '0', '0', '余', '天', '为', '主', '诉', '入', '院', '。'],
    #  [3, 4, 11, 9, 14, 24, 5, 2, 2, 25, 5, 8, 30, 23, 34, 36, 16, 28, 2, 6, 5,
    #  10, 13, 2, 6, 3, 4, 7, 3, 4, 11, 9, 14, 27, 31, 8, 33, 12, 19, 22, 15, 32, 29,
    #  12, 21, 26, 5, 10, 13, 2, 2, 20, 6, 17, 18, 35, 3, 4, 7],
    #  [1, 3, 1, 3, 0, 0, 0, 1, 3, 0, 0, 0, 1, 3, 1, 3, 1, 3, 0, 0, 0, 1, 3, 0, 0, 1,
    #   3, 0, 1, 3, 1, 3, 0, 1, 3, 0, 1, 3, 0, 1, 2, 3, 1, 3, 1, 3, 0, 1, 3, 1, 3, 1, 3,
    #   1, 3, 0, 1, 3, 0],
    #  [0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 5, 5, 2, 0, 0, 0,
    #   0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 6, 8, 7, 1, 2, 0, 0, 0, 0, 0,
    #   0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]]
    # The pieces above are concatenated and returned: word indexes (position in the dictionary), word char indexes, and the tag id of each character.
    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        # print(sentences)
        string = [w[0] for w in s]
        chars = [
            char_to_id[f(w) if f(w) in char_to_id else '<UNK>'] for w in string
        ]
        # Word-length tags are built here: a word is encoded as 1 2 2 2 3, where 1 marks the first char, 2 a middle char and 3 the last char; a single-character word is marked 0.
        #
        # jieba segments the text into words and converts them to this format; e.g. seg_feature [1, 3, 1, 3, 0, 0, 0] corresponds to two words of length 2 followed by three single characters.
        segs = get_seg_features("".join(string))
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        data.append([string, chars, segs, tags])
    # each entry of data combines string, chars, segs and tags
    return data
Example 23
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """

    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        try:
            # print("s:{}".format(s))
            # "s:[['我', 'O'], ['要', 'O'], ['看', 'O'], ['乌', 'B-SLOC'], ['鲁', 'I-SLOC'], ['木', 'I-SLOC'], ['齐', 'I-SLOC'], ['市', 'I-SLOC'], ['第', 'I-SLOC'], ['四', 'I-SLOC'], ['十', 'I-SLOC'], ['九', 'I-SLOC'], ['中', 'I-SLOC'], ['学', 'I-SLOC'], ['东', 'I-SLOC'], ['门', 'E-SLOC'], ['去', 'O'], ['乌', 'B-ELOC'], ['鲁', 'I-ELOC'], ['木', 'I-ELOC'], ['齐', 'I-ELOC'], ['推', 'I-ELOC'], ['拿', 'I-ELOC'], ['职', 'I-ELOC'], ['业', 'I-ELOC'], ['学', 'I-ELOC'], ['校', 'I-ELOC'], ['南', 'I-ELOC'], ['门', 'E-ELOC'], ['沿', 'O'], ['西', 'B-ROAD'], ['虹', 'I-ROAD'], ['东', 'I-ROAD'], ['路', 'E-ROAD'], ['的', 'O'], ['监', 'B-TYPE'], ['控', 'E-TYPE']]"
            string = [w[0] for w in s]
            # print("string:{}".format(string))
            # "string:['我', '要', '看', '乌', '鲁', '木', '齐', '市', '第', '四', '十', '九', '中', '学', '东', '门', '去', '乌', '鲁', '木', '齐', '推', '拿', '职', '业', '学', '校', '南', '门', '沿', '西', '虹', '东', '路', '的', '监', '控']"
            chars = [
                char_to_id[f(w) if f(w) in char_to_id else '<UNK>']
                for w in string
            ]
            # print("chars:{}".format(chars))
            # "chars:[15, 53, 26, 52, 54, 48, 51, 58, 72, 108, 74, 173, 42, 46, 32, 5, 44, 52, 54, 48, 51, 526, 525, 197, 100, 46, 85, 31, 5, 87, 39, 782, 32, 43, 6, 62, 61]"
            segs = get_seg_features("".join(string))
            # print("segs:{}".format(segs))
            # segs: [1, 3, 0, 1, 2, 2, 2, 3, 1, 2, 2, 3, 1, 3, 1, 3, 0, 1, 2, 2, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 2, 3, 1, 3,
            # 0, 1, 3]
            if train:
                tags = [tag_to_id[w[-1]] for w in s]
            else:
                tags = [none_index for _ in chars]
            # print("string:{}\nchars:{}\nsegs:{}\ntags:{}\n".format(string, chars, segs, tags))
            # "string:['我', '要', '看', '乌', '鲁', '木', '齐', '市', '第', '四', '十', '九', '中', '学', '东', '门', '去', '乌', '鲁', '木', '齐', '推', '拿', '职', '业', '学', '校', '南', '门', '沿', '西', '虹', '东', '路', '的', '监', '控']
            # chars:[15, 53, 26, 52, 54, 48, 51, 58, 72, 108, 74, 173, 42, 46, 32, 5, 44, 52, 54, 48, 51, 526, 525, 197, 100, 46, 85, 31, 5, 87, 39, 782, 32, 43, 6, 62, 61]
            # segs:[1, 3, 0, 1, 2, 2, 2, 3, 1, 2, 2, 3, 1, 3, 1, 3, 0, 1, 2, 2, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 2, 3, 1, 3, 0, 1, 3]
            # tags:[1, 1, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 1, 27, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 28, 1, 29, 26, 26, 30, 1, 5, 6]"
            data.append([string, chars, segs, tags])
        except Exception:
            # skip sentences that fail to convert (e.g. a char or tag missing from the mappings)
            continue

    return data
Example 24
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """

    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        string = [w[0] for w in s]
        chars = [
            char_to_id[f(w) if f(w) in char_to_id else '<UNK>'] for w in string
        ]
        segs = get_seg_features("".join(string))
        '''
            string: the characters of each sentence
            chars:  the id of each character in the sentence; if a character is not in the
                    dictionary, the id of '<UNK>' (unknown) is used instead
            segs:   segmentation features based on word length after jieba word segmentation,
                    e.g. [1,3,|0,|0,|1,2,2,3]:
                   word length   feature
                        1        [0]
                        2        [1,3]
                        3        [1,2,3]
                        4        [1,2,2,3]
                        5        [1,2,2,2,3]
                       ...       ...
                   Note: "char".join(strlist) joins the elements of a string list with the given
                   separator; here the separator is empty, so the characters are simply concatenated.
        '''

        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        data.append([string, chars, segs, tags])
        # data stores one entry per sentence: [characters, char ids, seg features, tag ids (a list of the "O" id, the same length as chars, when not training)]

    return data
Example 25
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
        
    Args:
      sentences: the input sentences (characters with their corresponding tag labels)
      char_to_id: mapping from characters to their indexes
      tag_to_id: mapping from tag labels to their indexes
    """

    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        string = [w[0] for w in s]
        chars = [
            char_to_id[f(w) if f(w) in char_to_id else '<UNK>'] for w in string
        ]
        segs = get_seg_features("".join(string))
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in chars]
        #string: the first sentence of the training data set
        #['无', '长', '期', '0', '0', '0', '年', '0', '0', '月', '0', '0', '日', '出', '院', '记', '录', '患', '者', '姓', '名', ':', '闫', 'X', 'X', '性', '别', ':', '男', '年', '龄', ':', '0', '0', '岁', '入', '院', '日', '期', ':', '0', '0', '0', '0', '年', '0', '0', '月', '0', '0', '日', '0', '0', '时', '0', '0', '分', '出', '院', '日', '期', ':', '0', '0', '0', '0', '年', '0', '0', '月', '0', '0', '日', '0', '0', '时', '0', '0', '分', '共', '住', '院', '0', '0', '天', '。']
        #chars: the position of each character of the sentence in the dictionary
        #[6, 297, 109, 2, 2, 2, 34, 2, 2, 54, 2, 2, 50, 29, 11, 138, 204, 40, 37, 205, 182, 4, 1309, 78, 78, 10, 168, 4, 359, 34, 236, 4, 2, 2, 176, 48, 11, 50, 109, 4, 2, 2, 2, 2, 34, 2, 2, 54, 2, 2, 50, 2, 2, 43, 2, 2, 30, 29, 11, 50, 109, 4, 2, 2, 2, 2, 34, 2, 2, 54, 2, 2, 50, 2, 2, 43, 2, 2, 30, 338, 71, 11, 2, 2, 122, 5]
        #segs: length marks for each word after jieba segmentation; 0 = single char, 1 = word start, 2 = word middle, 3 = word end
        #[0, 1, 3, 1, 2, 3, 0, 1, 3, 0, 1, 3, 0, 1, 3, 1, 3, 1, 3, 1, 3, 0, 0, 1, 3, 1, 3, 0, 0, 1, 3, 0, 1, 3, 0, 1, 3, 1, 3, 0, 1, 2, 2, 3, 0, 1, 3, 0, 1, 3, 0, 1, 3, 0, 1, 3, 0, 1, 3, 1, 3, 0, 1, 2, 2, 3, 0, 1, 3, 0, 1, 3, 0, 1, 3, 0, 1, 3, 1, 3, 1, 3, 1, 3, 0, 0]
        #tags: the position of each tag of the sentence in the dictionary (tag_to_id)
        #[0, 43, 44, 0, 0, 0, 0, 0, 0, 0, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 16, 0, 0, 0, 0, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]
        data.append([string, chars, segs, tags])

    return data
Example 26
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """
    #sentences: the input sentences
    #char_to_id: character-to-id mapping
    #tag_to_id: tag-to-id mapping
    none_index = tag_to_id["O"]  #0

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:  # s is one sentence: a list of [char, tag] pairs
        #print(sentences)
        string = [w[0] for w in s]
        #string: the raw character list of the sentence
        # char_to_id is the fixed character-to-id vocabulary, e.g. {'<PAD>': 0, '<UNK>': 1, '0': 2, ...}
        chars = [
            char_to_id[f(w) if f(w) in char_to_id else '<UNK>'] for w in string
        ]
        # chars [6, 305, 110, 2, 2, 2, 35, 2, 2, 55, 2, 2, 51, 30, 12, 138, 205, 39, 37, 204, 188, 4, 1287, 78, 78, 10, 167, 4, 365, 35, 240, 4, 2, 2, 175, 48, 12, 51, 110, 4, 2, 2, 2, 2, 35, 2, 2, 55, 2, 2, 51, 2, 2, 43, 2, 2, 29, 30, 12, 51, 110, 4, 2, 2, 2, 2, 35, 2, 2, 55, 2, 2, 51, 2, 2, 43, 2, 2, 29, 343, 68, 12, 2, 2, 122, 5]
        segs = get_seg_features("".join(string))  # SBIE word-segmentation features for the sentence
        if train:
            # tag_to_id is fixed: {'O': 0, 'I-TES': 1, 'I-DIS': 2, 'I-SGN': 3, 'B-TES': 4, 'E-TES': 5, 'B-REG': 6, 'E-REG': 7, 'B-SYM': 8, 'E-SYM': 9, 'B-SGN': 10, 'E-SGN': 11, 'I-SYM': 12, 'B-DIS': 13, 'E-DIS': 14, 'B-DRU': 15, 'E-DRU': 16, 'B-ORG': 17, 'E-ORG': 18, 'I-DRU': 19, 'I-PT': 20, 'I-REG': 21, 'I-ORG': 22, 'B-PT': 23, 'E-PT': 24, 'S-DRU': 25, 'B-DEG': 26, 'E-DEG': 27, 'S-REG': 28, 'S-SYM': 29, 'S-DIS': 30, 'B-FW': 31, 'E-FW': 32, 'B-PRE': 33, 'E-PRE': 34, 'I-SUR': 35, 'I-FW': 36, 'I-PRE': 37, 'B-CL': 38, 'E-CL': 39, 'S-ORG': 40, 'B-SUR': 41, 'E-SUR': 42, 'B-Dur': 43, 'E-Dur': 44, 'B-PSB': 45, 'E-PSB': 46, 'I-CL': 47, 'I-DEG': 48, 'I-PSB': 49, 'S-TES': 50}
            tags = [tag_to_id[w[-1]] for w in s]  # gold tag id for each character
            # tags [0, 43, 44, 0, 0, 0, 0, 0, 0, 0, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 16, 0, 0, 0, 0, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]
        else:
            tags = [none_index for _ in chars]
        data.append([string, chars, segs, tags])
        #string: the raw characters of the sentence (the example characters are unreadable in the source encoding)
        #chars [48, 12, 76, 96, 4, 365, 3, 2, 2, 175, 3, 152, 127, 46, 24, 9, 32, 22, 6, 17, 2, 2, 115, 43, 171, 104, 270, 25, 292, 291, 62, 16, 57, 11, 15, 2, 2, 330, 35, 3, 16, 57, 564, 62, 2, 2, 2, 38, 5]
        #segs[1, 3, 1, 3, 0, 0, 0, 1, 3, 0, 0, 0, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 0, 0, 1, 3, 1, 2, 3, 1, 3, 1, 3, 1, 3, 0, 1, 3, 1, 3, 1, 2, 3, 0, 0]
        #tags [0, 0, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 6, 21, 21, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0, 23, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0]

    return data
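The snippets above all rely on get_seg_features but never define it. From the inline comments and the example seg output (0 for a single-character word; 1, 2, 3 for the begin, middle and end of a multi-character word), it can be reconstructed as below. This is a minimal sketch that assumes jieba is the segmenter, not the project's actual implementation:

import jieba


def get_seg_features(text):
    # Word-length (SBIE-style) features per character:
    # 0 = single-character word, 1 = word begin, 2 = word middle, 3 = word end.
    seg_feature = []
    for word in jieba.cut(text):
        if len(word) == 1:
            seg_feature.append(0)
        else:
            tmp = [2] * len(word)
            tmp[0] = 1
            tmp[-1] = 3
            seg_feature.extend(tmp)
    return seg_feature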
Ejemplo n.º 27
0
def prepare_dataset(sentences,
                    char_to_id,
                    tag_to_id,
                    lower=False,
                    train=True,
                    isenglish=False):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """

    none_index = tag_to_id["O"]
    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        string = [w[0] for w in s]
        #tag = [w[-1] for w in s]
        chars = [
            char_to_id[f(w) if f(w) in char_to_id else '<UNK>'] for w in string
        ]
        if isenglish:
            segs = [0] * len(chars)
        else:
            segs = get_seg_features("".join(string))
        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            # no gold labels at inference time, so pad with "O"
            tags = [none_index for _ in chars]
        data.append([string, chars, segs, tags])

    return data
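A quick usage sketch for the isenglish switch above; the vocabularies are toy placeholders invented for illustration, not taken from the original project:

# hypothetical toy vocabularies
char_to_id = {'<UNK>': 0, 'h': 1, 'i': 2}
tag_to_id = {'O': 0, 'B-PER': 1}
sentences = [[['h', 'O'], ['i', 'O']]]

# with isenglish=True the segment features are simply zeros
print(prepare_dataset(sentences, char_to_id, tag_to_id, isenglish=True))
# expected: [[['h', 'i'], [1, 2], [0, 0], [0, 0]]]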
Ejemplo n.º 28
0
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, test=False):
    
    """
    把文本型的样本和标签,转化为index,便于输入模型
    需要在每个样本和标签前后加<start>和<end>
    """

    def f(x): return x.lower() if lower else x
    
    data = []
    for s in sentences:
        
        chars = [w[0] for w in s]
        tags = [w[-1] for w in s]
        
        """ 句子转化为index """
        chars_idx = [char_to_id[f(c) if f(c) in char_to_id else '<unk>'] for c in chars]
        
        """ 对句子分词,构造词的长度特征 """
        segs_idx = get_seg_features("".join(chars))
        
        """ 每个样本前后加<start>和<end> """
        chars_idx = [char_to_id["<start>"]] + chars_idx + [char_to_id["<end>"]]
        segs_idx = [0] + segs_idx + [0]        
        
        """ 把标签转化为index, 标签前后加<start>和<end> """
        tags = ["<start>"] + tags + ["<end>"]
        if not test:
            tags_idx =  [tag_to_id[t] for t in tags]
            
        else:
            tags_idx = [tag_to_id["<pad>"] for _ in tags]
            
        assert len(chars_idx) == len(segs_idx) == len(tags_idx)
        data.append([chars_idx, segs_idx, tags_idx])

    return data
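A hedged usage sketch for this <start>/<end> variant. The vocabularies below are invented for illustration and must contain the <start>, <end>, <pad> and <unk> entries the function expects; the seg output assumes the segmenter treats 头痛 as a single word:

char_to_id = {'<unk>': 0, '<start>': 1, '<end>': 2, '头': 3, '痛': 4}
tag_to_id = {'<pad>': 0, '<start>': 1, '<end>': 2, 'O': 3, 'B-SYM': 4, 'E-SYM': 5}
sentences = [[['头', 'B-SYM'], ['痛', 'E-SYM']]]

chars_idx, segs_idx, tags_idx = prepare_dataset(sentences, char_to_id, tag_to_id)[0]
# chars_idx: [1, 3, 4, 2]   segs_idx: [0, 1, 3, 0]   tags_idx: [1, 4, 5, 2]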
Ejemplo n.º 29
0
def prepare_dataset(sentences,
                    char_to_id,
                    tag_to_id,
                    mark_to_id,
                    entropy_dict,
                    lower=False,
                    train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """

    none_index = tag_to_id["O"]  # 目前train始终未true,none_index并未作用

    def f(x):
        return x.lower() if lower else x

    data = []
    dis_dict_map = {"O": 0, "B": 1, "I": 2, "E": 3, "S": 4}
    loc_dict_map = {"O": 0, "B": 1, "I": 2, "E": 3, "S": 4}
    res_dict_map = {"O": 0, "B": 1, "I": 2, "E": 3, "S": 4}
    for s in sentences:
        # keep the raw character list separate so lookups are not run over the inserted spaces,
        # and cast the integer ids to str before space-joining
        chars_list = [w[0] for w in s]
        string = ' '.join(chars_list)
        chars = ' '.join([
            str(char_to_id[f(w) if f(w) in char_to_id else '<UNK>'])
            for w in chars_list
        ])
        segs = ' '.join(str(x) for x in get_seg_features("".join(chars_list)))
        marks = ' '.join([
            str(mark_to_id[w[1] if w[1] in mark_to_id else '<UNK>']) for w in s
        ])
        complete_dis_dicts = ' '.join([str(dis_dict_map[w[1]]) for w in s])
        partial_dis_dicts = ' '.join([str(dis_dict_map[w[2]]) for w in s])
        pinyin_dis_dicts = ' '.join([str(dis_dict_map[w[3]]) for w in s])
        complete_loc_dicts = ' '.join([str(loc_dict_map[w[4]]) for w in s])
        partial_loc_dicts = ' '.join([str(loc_dict_map[w[5]]) for w in s])
        pinyin_loc_dicts = ' '.join([str(loc_dict_map[w[6]]) for w in s])
        complete_res_dicts = ' '.join([str(res_dict_map[w[7]]) for w in s])
        partial_res_dicts = ' '.join([str(res_dict_map[w[8]]) for w in s])
        pinyin_res_dicts = ' '.join([str(res_dict_map[w[9]]) for w in s])
        # left/right entropy features (what to store for missing characters is still open; the mean is used for now)
        left_entropy = [
            entropy_dict[w][0] if w in entropy_dict else 0 for w in chars_list
        ]
        # taking the mean over only the non-zero values can break (e.g. a query where at most one
        # character is non-zero leads to a division by zero), so the overall mean is used here
        # left_entropy_average = np.mean(np.array(left_entropy)[np.array(left_entropy) != 0])
        left_entropy_average = np.mean(np.array(left_entropy))
        left_entropy = [
            elem if elem != 0 else left_entropy_average
            for elem in left_entropy
        ]
        if max(left_entropy) != min(left_entropy):
            left_entropy = [[(i - min(left_entropy)) /
                             (max(left_entropy) - min(left_entropy))]
                            for i in left_entropy]
        else:
            left_entropy = [[i] for i in left_entropy]

        left_entropy = ' '.join(str(e[0]) for e in left_entropy)

        right_entropy = [
            entropy_dict[w][1] if w in entropy_dict else 0 for w in chars_list
        ]
        # right_entropy_average = np.mean(np.array(right_entropy)[np.array(right_entropy) != 0])
        right_entropy_average = np.mean(np.array(right_entropy))
        right_entropy = [
            elem if elem != 0 else right_entropy_average
            for elem in right_entropy
        ]
        if max(right_entropy) != min(right_entropy):
            right_entropy = [[(i - min(right_entropy)) /
                              (max(right_entropy) - min(right_entropy))]
                             for i in right_entropy]
        else:
            right_entropy = [[i] for i in right_entropy]
        right_entropy = ' '.join(str(e[0]) for e in right_entropy)

        if train:
            tags = [tag_to_id[w[-1]] for w in s]
        else:
            tags = [none_index for _ in s]
        tags = ' '.join(str(t) for t in tags)

        data.append([
            string, chars, segs, marks, complete_dis_dicts, partial_dis_dicts,
            pinyin_dis_dicts, complete_loc_dicts, partial_loc_dicts,
            pinyin_loc_dicts, complete_res_dicts, partial_res_dicts,
            pinyin_res_dicts, left_entropy, right_entropy, tags
        ])

    return data
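The left- and right-entropy blocks above duplicate the same fill-and-normalize logic. A small helper could express it once; this is an illustrative refactor (the name normalize_entropy is not in the original code):

import numpy as np


def normalize_entropy(values):
    # Replace zeros with the overall mean, then min-max scale to [0, 1];
    # returns one single-element list per character, matching the inline code above.
    mean = float(np.mean(values))
    filled = [v if v != 0 else mean for v in values]
    lo, hi = min(filled), max(filled)
    if hi != lo:
        return [[(v - lo) / (hi - lo)] for v in filled]
    return [[v] for v in filled]

# e.g. left_entropy = ' '.join(str(e[0]) for e in normalize_entropy(left_entropy))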