Code example #1
File: data_utils.py  Project: wslc1314/atec_nlp_sim
# This excerpt from data_utils.py assumes `import os` at module level plus the
# module's own loadDict/saveDict/read_cut_file helpers.
def createLocalWCDict(trainFile,
                      min_count_w=2,
                      min_count_c=2,
                      global_dict_path="data/atec/training-2-2.json"):
    """
    根据训练数据的不同生成不同的动态序号-词/字、词/字-序号字典
    """
    global_dict = loadDict(global_dict_path)
    global_w_v2i = global_dict["word"]["v2i"]
    global_c_v2i = global_dict["char"]["v2i"]
    data = read_cut_file(trainFile, with_label=True)
    sentences = data["sent1w"] + data["sent2w"]
    sentences_c = data["sent1c"] + data["sent2c"]
    savePath = os.path.join(os.path.dirname(trainFile),
                            os.path.basename(trainFile).split(".")[0])
    words, chars = {}, {}
    for sentence in sentences:
        for word in sentence:
            words[word] = words.get(word, 0) + 1
    for sentence in sentences_c:
        for char in sentence:
            chars[char] = chars.get(char, 0) + 1
    print("Size for text words: ", len(words.keys()))
    print("Size for global words: ", len(global_w_v2i.keys()))
    print("Size for text chars: ", len(chars.keys()))
    print("Size for global chars: ", len(global_c_v2i.keys()))
    vocab = [w for w in words.keys() if words[w] >= min_count_w]
    vocab_c = [c for c in chars.keys() if chars[c] >= min_count_c]
    vocab = ['<pad>'] + ['<unk>'] + vocab
    vocab_c = ['<pad>'] + ['<unk>'] + vocab_c
    v2i, i2v = {}, {}
    for word in vocab:
        # Reuse the id assigned by the global word dictionary.
        idx = global_w_v2i[word]
        v2i[word] = idx
        i2v[idx] = word
    print("id for <pad>: ", v2i['<pad>'])
    print("id for <unk>: ", v2i['<unk>'])
    print("total vocab size: ", len(v2i.keys()))
    w_dict = {"v2i": v2i, "i2v": i2v}
    v2i, i2v = {}, {}
    for word in vocab_c:
        idx = global_c_v2i[word]
        v2i[word] = idx
        i2v[idx] = word
    print("id for <pad>: ", v2i['<pad>'])
    print("id for <unk>: ", v2i['<unk>'])
    print("total vocab size: ", len(v2i.keys()))
    c_dict = {"v2i": v2i, "i2v": i2v}
    d = {"word": w_dict, "char": c_dict}
    saveDict(
        d,
        savePath + "-" + str(min_count_w) + "-" + str(min_count_c) + ".json")
    return d
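
A minimal usage sketch for the function above (paths mirror the commented-out call in the module's __main__ block shown in code example #2; everything here is illustrative rather than project code):

# Hypothetical call; the split path and the 2/2 thresholds follow the project defaults.
local_dict = createLocalWCDict(
    "data/atec/10/train0.csv",       # one training split, already segmented
    min_count_w=2,                   # drop words seen fewer than 2 times
    min_count_c=2,                   # drop chars seen fewer than 2 times
    global_dict_path="data/atec/training-2-2.json",
)
# The result is also saved to data/atec/10/train0-2-2.json and keeps the ids
# of the global dictionary, so global embedding matrices remain valid.
print(local_dict["word"]["v2i"]["<pad>"], local_dict["word"]["v2i"]["<unk>"])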
Code example #2


if __name__=="__main__":
    # createGlobalWCDict("../data/atec/training.csv",2,2)
    # createLocalWCDict("../data/atec/10/train0.csv",2,2,"../data/atec/training-2-2.json")

    global_dict=loadDict("../data/atec/training-2-2.json")
    c_len={}
    for w in global_dict["word"]["v2i"].keys():
        c_len[w]=len(w)
    print(sorted(c_len.items(),key=lambda x:x[1],reverse=True)) # 9、7、6、6、6、6、5
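
loadDict and saveDict themselves are not shown in these excerpts. A plausible JSON-backed sketch is below; it is an assumption, not the project's actual code, but it matches how the helpers are called and explains why code example #3 indexes i2v with str(i): JSON serialises the integer ids as string keys.

import json

def saveDict(d, path):
    # Dump the nested {"word"/"char": {"v2i": ..., "i2v": ...}} structure.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(d, f, ensure_ascii=False)

def loadDict(path):
    # Integer keys of i2v come back as strings after the JSON round trip.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)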
Code example #3
# Module-level imports assumed by this excerpt: os, numpy as np, gensim's
# `models`, and the project's loadDict helper.
def load_global_embedding_matrix(wv_path_w,
                                 wv_path_c,
                                 global_dict_path="data/atec/training-2-2.json"
                                 ):
    wv_path_list = [wv_path_w, wv_path_c]
    try:
        wv_type_w = wv_path_w.split("/")[-2]
    except AttributeError:
        wv_type_w = None
    assert wv_type_w in ["glove", "word2vec", "fasttext", None]
    try:
        wv_type_c = wv_path_c.split("/")[-2]
    except AttributeError:
        wv_type_c = None
    assert wv_type_c in ["glove", "word2vec", "fasttext", None]
    wv_type_list = [wv_type_w, wv_type_c]
    try:
        wv_name_w = wv_path_w.split("/")[-1].split(".")[0]
    except AttributeError:
        wv_name_w = None
    try:
        wv_name_c = wv_path_c.split("/")[-1].split(".")[0]
    except AttributeError:
        wv_name_c = None
    wv_name_list = [wv_name_w, wv_name_c]

    embeddings_list = []
    for i, wv_level in enumerate(["word", "char"]):
        wv_path = wv_path_list[i]
        if isinstance(wv_path, str):
            wv_type = wv_type_list[i]
            assert wv_type in ["glove", "word2vec", "fasttext"]
            wv_name = wv_name_list[i].replace("wc", wv_level)
            embed_path = global_dict_path.replace(
                ".json", "_" + wv_type + '_' + wv_name + ".npy")
            embed_path_oov = global_dict_path.replace(
                ".json", "_" + wv_type + '_' + wv_name + ".oov.npy")
            if os.path.exists(embed_path):
                embeddings = np.load(embed_path)
                oov_mask = np.load(embed_path_oov)
                assert embeddings.shape[0] == oov_mask.shape[0]
            else:
                oov_mask = []
                i2v = loadDict(global_dict_path)[wv_level]["i2v"]
                vocab_size = len(i2v)
                embedding_size = int(wv_name.split("-")[-1])
                embeddings = np.random.uniform(low=-0.1,
                                               high=0.1,
                                               size=(vocab_size,
                                                     embedding_size))
                if wv_type == "word2vec":
                    model = models.Word2Vec.load(wv_path)
                else:
                    model = models.KeyedVectors.load_word2vec_format(
                        wv_path, binary=False)
                n_oov = 0
                for idx in range(vocab_size):
                    # i2v comes from JSON, so its keys are strings.
                    word = i2v[str(idx)]
                    try:
                        embeddings[idx] = model[word]
                        oov_mask.append(0)
                    except KeyError:
                        n_oov += 1
                        print("Not in wv: id: %d, vocab: %s" % (idx, word))
                        oov_mask.append(1)
                print("Size for oov: %d!" % n_oov)
                np.save(embed_path, embeddings)
                oov_mask = np.asarray(oov_mask, dtype=int).reshape(
                    (vocab_size, 1))
                np.save(embed_path_oov, oov_mask)
            embeddings_list.append((embeddings, oov_mask))
        elif isinstance(wv_path, int):
            i2v = loadDict(global_dict_path)[wv_level]["i2v"]
            vocab_size = len(i2v)
            embedding_size = int(wv_path)
            embeddings = np.random.uniform(low=-0.1,
                                           high=0.1,
                                           size=(vocab_size, embedding_size))
            oov_mask = [0] * vocab_size
            oov_mask = np.asarray(oov_mask, dtype=int).reshape(
                (vocab_size, 1))
            embeddings_list.append((embeddings, oov_mask))
        else:
            raise TypeError("Unsupported type for wv_path: %r" % type(wv_path))
    return embeddings_list[0], embeddings_list[1]
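
A hedged usage sketch for load_global_embedding_matrix: a string argument must point at a vector file whose parent directory names the format (glove, word2vec or fasttext) and whose base name ends in the embedding size (a "wc" substring in the name is replaced by "word" or "char"); an integer argument skips loading and returns a randomly initialised matrix of that width. The .vec path below is hypothetical.

# Word vectors from a fasttext-format file, char vectors randomly initialised
# with 200 dimensions.
(word_emb, word_oov), (char_emb, char_oov) = load_global_embedding_matrix(
    "wv/fasttext/atec-wc-300.vec",   # parent dir "fasttext" selects the loader branch
    200,                             # int => uniform(-0.1, 0.1) random matrix
    global_dict_path="data/atec/training-2-2.json",
)
print(word_emb.shape, word_oov.shape)   # (word vocab size, 300), (word vocab size, 1)
print(char_emb.shape, char_oov.shape)   # (char vocab size, 200), (char vocab size, 1)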