def feature_mapping(sentences, features):
    """
    Build, for every requested feature column, a vocabulary and the
    feature <-> id lookup tables.

    :param sentences: list of list of tuple, one tuple per token:
        [[(word11, feature11, ..., tag11), ...], [(word21, feature21, ..., tag21), ...], ...]
    :param features: string of comma-separated column indices into the
        token tuples, e.g. "1,2". The entry "0" is skipped, and blank
        entries (empty string, doubled or trailing comma) are ignored.
    :return: tuple ``(dico, feature_to_id, id_to_feature)``. Each element is
        an OrderedDict keyed by the column index string exactly as it
        appears in ``features``; the values are, respectively, what
        ``create_dico`` returned (plus an ``"<UNK>"`` entry) and the two
        objects returned by ``create_mapping`` for that column.
    :raises ValueError: if a non-blank entry of ``features`` is not an integer.
    """
    dico = OrderedDict()
    feature_to_id = OrderedDict()
    id_to_feature = OrderedDict()

    for feature_i in features.split(","):
        # A blank entry is not a column index; without this guard
        # int("") would raise ValueError on the first token.
        if not feature_i.strip():
            continue
        if feature_i == "0":
            continue
        # Convert once per column instead of once per token.
        column = int(feature_i)
        cur_feature = [[t[column] for t in s] for s in sentences]
        cur_dico = create_dico(cur_feature)
        print("%sth feature found %i unique features" %
              (feature_i, len(cur_dico)))
        # NOTE(review): the large value presumably acts as a pseudo-count so
        # that "<UNK>" is ranked first by create_mapping -- confirm there.
        cur_dico["<UNK>"] = 10000000
        cur_feature_to_id, cur_id_to_feature = create_mapping(cur_dico)

        dico[feature_i] = cur_dico
        feature_to_id[feature_i] = cur_feature_to_id
        id_to_feature[feature_i] = cur_id_to_feature

    return dico, feature_to_id, id_to_feature
# Example no. 2
# 0
def tag_mapping(sentences):
    """
    Build the tag dictionary and the tag <-> id lookup tables.

    The tag of a token is taken to be its last element (``token[-1]``).
    The dictionary comes from ``create_dico`` and the two lookup tables
    from ``create_mapping``; both helpers are defined elsewhere, and the
    id ordering (described upstream as "sorted by frequency") is decided
    there.

    :param sentences: iterable of sentences, each an iterable of indexable tokens
    :return: tuple ``(dico, tag_to_id, id_to_tag)``
    """
    per_sentence_tags = []
    for sentence in sentences:
        per_sentence_tags.append([token[-1] for token in sentence])

    tag_dico = create_dico(per_sentence_tags)
    lookup_tables = create_mapping(tag_dico)
    tag_to_id, id_to_tag = lookup_tables

    n_tags = len(tag_dico)
    print("Found %i unique named entity tags" % n_tags)
    return tag_dico, tag_to_id, id_to_tag
# Example no. 3
# 0
def tag_mapping(sentences):
    """
    Create tag and intent lookup tables and dump them under ``configs/``.

    For every token the tag is read from ``token[1]`` and the intent from
    ``token[-1]``. Dictionaries are built with ``create_dico`` and turned
    into lookup tables with ``create_mapping`` (both defined elsewhere).

    Four text files are (over)written in the ``configs`` directory, one
    ``key:value`` pair per line, UTF-8 encoded:
    ``tag_to_id.txt``, ``id_to_tag.txt``, ``indent_to_id.txt`` and
    ``id_to_indent.txt``. The "indent" spelling of the last two file names
    is kept as-is because other code may read those exact paths.

    The mappings are computed before any file is opened, so a failure in
    the helpers no longer leaves truncated files behind, and every file is
    closed (and therefore flushed) before the function returns.

    :param sentences: iterable of sentences, each an iterable of indexable tokens
    :return: tuple ``(tag_to_id, id_to_tag, intent_to_id, id_to_intent)``
    """
    tags = []
    intents = []
    for s in sentences:
        ts = []
        ints = []
        for char in s:
            ts.append(char[1])
            ints.append(char[-1])
        tags.append(ts)
        intents.append(ints)

    dico_tags = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico_tags)

    dico_intents = create_dico(intents)
    intent_to_id, id_to_intent = create_mapping(dico_intents)

    outputs = (
        ('tag_to_id.txt', tag_to_id),
        ('id_to_tag.txt', id_to_tag),
        ('indent_to_id.txt', intent_to_id),
        ('id_to_indent.txt', id_to_intent),
    )
    for filename, mapping in outputs:
        path = os.path.join('configs', filename)
        # The context manager guarantees the handle is flushed and closed,
        # even if writing raises part-way through.
        with open(path, 'w', encoding='utf8') as out:
            out.writelines(
                "{}:{}\n".format(k, v) for k, v in mapping.items())

    return tag_to_id, id_to_tag, intent_to_id, id_to_intent
# Example no. 4
# 0
def char_mapping(sentences, lower):
    """
    Build the character/word dictionary and its lookup tables.

    The first element of every token (``token[0]``) is collected,
    lower-cased when ``lower`` is truthy, and passed to ``create_dico``.
    ``"<PAD>"`` and ``"<UNK>"`` entries are then added with fixed large
    values before ``create_mapping`` produces the two lookup tables.
    Both helpers are defined elsewhere.

    :param sentences: iterable of sentences, each an iterable of indexable tokens
    :param lower: when truthy, call ``.lower()`` on every collected item
    :return: tuple ``(dico, char_to_id, id_to_char)``
    """
    chars = []
    total_items = 0
    for sentence in sentences:
        if lower:
            row = [token[0].lower() for token in sentence]
        else:
            row = [token[0] for token in sentence]
        chars.append(row)
        total_items += len(row)

    dico = create_dico(chars)
    # Special symbols are inserted in the same order as before: PAD, then UNK.
    for symbol, pseudo_count in (("<PAD>", 10000001), ('<UNK>', 10000000)):
        dico[symbol] = pseudo_count

    lookup_tables = create_mapping(dico)
    char_to_id, id_to_char = lookup_tables

    # The reported unique count includes the two special symbols added above.
    print("Found %i unique words (%i in total)" %
          (len(dico), total_items))
    return dico, char_to_id, id_to_char