Example #1
import pickle
from keras.preprocessing.text import Tokenizer  # or tensorflow.keras.preprocessing.text, depending on the setup

def build_vocab(toka_path_):
    # Fit a single tokenizer on the union of training texts and KB entity texts.
    # train_text_dic and kb_all_text_dic are module-level dicts of id -> raw text.
    tk = Tokenizer(lower=True, filters='')
    train_text = list(set(train_text_dic.values()))
    kb_data_text = list(set(kb_all_text_dic.values()))
    full_texts = train_text + kb_data_text
    tk.fit_on_texts(full_texts)
    # Persist the fitted tokenizer for reuse later.
    with open(toka_path_, 'wb') as f:
        pickle.dump(tk, f)
    print("build_vocab")
    return tk
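
A minimal usage sketch (not part of the original project) showing how the tokenizer pickled in Example #1 might be loaded and applied later; the path, sentences and maxlen are illustrative placeholders:

import pickle
from keras.preprocessing.sequence import pad_sequences

# Load the tokenizer fitted by build_vocab (path is hypothetical).
with open("toka.pkl", "rb") as f:
    tk = pickle.load(f)

# Convert raw texts to padded index sequences for model input.
sentences = ["barack obama", "44th president of the united states"]
seqs = tk.texts_to_sequences(sentences)
padded = pad_sequences(seqs, maxlen=20, padding="post")
print(padded.shape)  # (2, 20)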
Example #2
import pickle
from keras.preprocessing.text import Tokenizer  # or tensorflow.keras.preprocessing.text, depending on the setup

def build_vocab(toka_path_, toka_type_path_):
    # Fit one tokenizer on raw texts and a second on entity type strings.
    # train_text_dic, kb_all_text_dic and id2entity are module-level dicts.
    tk = Tokenizer(lower=True, filters='')
    tk_type = Tokenizer(lower=True, filters='')
    train_text = list(set(train_text_dic.values()))
    kb_data_text = list(set(kb_all_text_dic.values()))
    full_texts = train_text + kb_data_text
    full_types = [kb_data_s["type"] for kb_data_s in id2entity.values()]
    tk.fit_on_texts(full_texts)
    tk_type.fit_on_texts(full_types)

    # Persist both fitted tokenizers.
    with open(toka_path_, 'wb') as f:
        pickle.dump(tk, f)
    with open(toka_type_path_, 'wb') as f:
        pickle.dump(tk_type, f)
    print("build_vocab")
    return tk, tk_type
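
A short hedged follow-up for Example #2: assuming the surrounding module defines the globals the function relies on, the two returned tokenizers can encode mention texts and entity type labels separately; all paths and values below are placeholders:

# Hypothetical call; requires train_text_dic, kb_all_text_dic and id2entity
# to be defined at module level, as the function above assumes.
tk, tk_type = build_vocab("toka.pkl", "toka_type.pkl")

text_ids = tk.texts_to_sequences(["some candidate entity description"])
type_ids = tk_type.texts_to_sequences(["Person"])
print(text_ids, type_ids)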
Example #3
import pickle

def build_vocab():
    # tk, tk_type and tk_predicate are module-level Tokenizer instances;
    # toka_path, toka_type_path and toka_predicate_path are module-level output paths.
    full_types = []
    full_predicates = []
    for kb_data_s in id2entity.values():
        types = kb_data_s["type"]
        full_types.extend(types)
        full_predicates.extend([data["predicate"] for data in kb_data_s["data"]])
    tk_type.fit_on_texts(full_types)
    # fit_on_texts (not fit_on_sequences) is needed here: full_predicates holds
    # raw predicate strings, not integer index sequences.
    tk_predicate.fit_on_texts(full_predicates)

    train_text = list(set(train_text_dic.values()))
    kb_data_text = list(set(kb_text_dic.values()))
    full_texts = train_text + kb_data_text
    tk.fit_on_texts(full_texts)

    # Persist all three fitted tokenizers.
    with open(toka_path, 'wb') as f:
        pickle.dump(tk, f)
    with open(toka_type_path, 'wb') as f:
        pickle.dump(tk_type, f)
    with open(toka_predicate_path, 'wb') as f:
        pickle.dump(tk_predicate, f)
    print("build_vocab")