Example #1
File: query.py Project: xjrelc/-
def build_query(data_path, w2v_path, vocab_path, k):
    '''
    Build the queries.

    Args:
        data_path str path to the query file
        w2v_path str path to the word-vector model
        vocab_path str path to the vocabulary file
        k int number of nearest neighbours to return per term
    Returns:
        query_list list the list of expanded queries
    '''
    # Load the word-vector model and the vocabulary
    w2v_model = load_model(w2v_path)
    vocab = pickle_load(vocab_path)
    query_list = []
    # Parse the XML document
    query_dict = {'disease': [], 'gene': [], 'demographic': [], 'other': []}
    query_dict = xml_parse(data_path, query_dict, 1)
    disease_field_list = query_dict['disease']
    gene_field_list = query_dict['gene']
    demographic_field_list = query_dict['demographic']
    other_field_list = query_dict['other']
    del query_dict
    # Iterate over the queries
    for i in range(len(disease_field_list)):
        query_tmp_list = []
        # Get the query terms for this query
        disease_field_list[i] = preprocess(disease_field_list[i])
        disease_list = disease_field_list[i].split(' ')
        gene_field_list[i] = preprocess(gene_field_list[i])
        gene_list = gene_field_list[i].split(' ')
        other_field_list[i] = preprocess(other_field_list[i])
        other_list = other_field_list[i].split(' ')
        demographic_list = demographic_split(demographic_field_list[i])
        # Lemmatize the raw query terms and remove stop words
        disease_clean_list = clean_data(disease_list)
        gene_clean_list = clean_data(gene_list)
        demographic_clean_list = clean_data(demographic_list)
        other_clean_list = clean_data(other_list)
        # Query expansion (includes stemming and stop-word removal)
        query_tmp_list.append(
            query_extension(disease_clean_list, w2v_model, vocab, k))
        query_tmp_list.append(
            query_extension(gene_clean_list, w2v_model, vocab, k))
        query_tmp_list.append(
            query_extension(other_clean_list, w2v_model, vocab, k))
        tmp_dict = {}
        for tmp in demographic_clean_list:
            tmp_dict[tmp] = []
        query_tmp_list.append(tmp_dict)
        query_list.append(query_tmp_list)
    return query_list
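The helpers used above (load_model, pickle_load, xml_parse, query_extension) live in the project's own modules and are not shown here. As a rough illustration only, a query_extension-style helper built on gensim's KeyedVectors could look like the sketch below; the gensim API and the vocabulary filtering are assumptions, not the project's actual code.

from gensim.models import KeyedVectors

def expand_terms(terms, kv, vocab, k):
    # Map each term to its top-k nearest neighbours that also appear in vocab.
    # kv is a gensim KeyedVectors instance; vocab is a set of allowed words.
    expanded = {}
    for term in terms:
        if term not in kv:
            expanded[term] = []  # out-of-vocabulary terms get no expansion
            continue
        expanded[term] = [w for w, _ in kv.most_similar(term, topn=k)
                          if w in vocab]
    return expanded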
Example #2
        word2vec_dataset = Word2VecDataset(data, ARGS)
        data_loader = DataLoader(word2vec_dataset,
                                 batch_size=ARGS.batch_size,
                                 shuffle=True,
                                 num_workers=2)

        print("Initializing model ...")
        model = Word2Vec(vocab, ARGS.embed_dim).to(ARGS.device)

        print("Train...")
        train(ARGS, data_loader, model)

    elif ARGS.mode in ["ret_words", "eval"]:
        model_path = os.path.join("models",
                                  f"ww_{ARGS.ww_size}_{ARGS.freq_thresh}")
        model = load_model(ARGS, model_path)
        print(model)

        print(
            f"Load docs: filtered_docs/filtered_docs_{ARGS.freq_thresh}.pkl..."
        )
        docs_by_id = data_processing.load_pickle(
            f"filtered_docs/filtered_docs_{ARGS.freq_thresh}.pkl")
        retriever = W2VRetrieval(ARGS, model, docs_by_id)

        if ARGS.mode == "ret_words":
            print(f"Search query: {ARGS.query}")
            if ARGS.eval_mode == "words":
                top_words = retriever.match_query_against_words(ARGS.query)
                print(f"Top {ARGS.top_n} words:", top_words)
Example #3
from data_loader import load_data
from word2vec import load_model
import random
import pickle
import os

w2v_model = load_model()


class MyData(object):
    def __init__(self, path):
        self.path = path
        self.all_data = self.load_data()
        self.sample_num = len(self.all_data)

    def load_data(self):
        if os.path.isfile('data.pkl'):
            with open('./data.pkl', 'rb') as f:
                results = pickle.load(f)
        else:
            data = load_data(self.path)
            results = []
            for each in data:
                text = each[-2] + ' ' + each[-3]
                label = int(each[-8])
                # Use a distinct name so the comprehension does not shadow
                # the outer loop's `each`
                matrix = [
                    w2v_model.wv[word] for word in text.split(' ')
                    if word in w2v_model.wv.vocab
                ]
                results.append([matrix, label])
            with open('./data.pkl', 'wb') as f:
                pickle.dump(results, f)
        return results
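Each matrix above has one row per in-vocabulary token, so samples differ in length. Before batching, such matrices are commonly padded or truncated to a fixed shape; a small NumPy sketch, where max_len=100 and dim=300 are illustrative values:

import numpy as np

def pad_matrix(matrix, max_len=100, dim=300):
    # Pad or truncate a list of word vectors to a fixed (max_len, dim) array.
    out = np.zeros((max_len, dim), dtype=np.float32)
    rows = np.asarray(matrix, dtype=np.float32)[:max_len]
    if rows.size:
        out[:len(rows)] = rows
    return out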
Example #4
import os
import random

import torch.nn as nn
import torch.optim as optim
from torch.utils import data
from tqdm import tqdm

from word2vec import load_model
from extract_func import process_file
from config import key_libs, SEED

random.seed(SEED)
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
log_size = 10
clip = 1
n_epoch = 50
n_tolerance = 2
idx2word, vectors, wv_model = load_model()
word2idx = {word: i for i, word in enumerate(idx2word)}


def create_dataset():
    path = '/home/gezhang/data/jupyter/target'
    files = [f for f in os.listdir(path) if f.endswith('.py')]

    all_funcs = []
    for file in tqdm(files):
        try:
            funcs, _ = process_file(os.path.join(path, file))
            all_funcs.append(funcs)
        except Exception:
            # Skip files that fail to parse
            pass
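The snippet cuts off before all_funcs is used, but the word2idx map defined above suggests the next step is encoding each function's tokens as vocabulary indices. A plausible continuation (the unknown-token fallback index is an assumption):

def encode_tokens(tokens, word2idx, unk_idx=0):
    # Map a token sequence to vocabulary indices, falling back to unk_idx.
    return [word2idx.get(tok, unk_idx) for tok in tokens]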
Example #5
                 'depth': cfg.NN_DEPTH,
                 'corpus_name': cfg.NN_CORPUS_NAME}
    nn_model = atn.build_nn_model(nn_params)
    return nn_model


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Trading arguments')
    # parser.add_argument('-a', '--nn_architect_file', help='Neural network model file name')
    parser.add_argument('-w', '--nn_weight_file', help='Neural network model file name')
    parser.add_argument('-v', '--w2v_file', help='word2vec model file name')
    args = parser.parse_args()

    # if args.nn_architect_file is None or args.nn_weight_file is None or args.w2v_file is None:
    if args.nn_weight_file is None or args.w2v_file is None:
        print('Please specify model files.')
    else:
        print('Loading models...')
        ts = time.time()
        # nn_model = atn.load_model(args.nn_architect_file, args.nn_weight_file)
        w2v_model = w2v.load_model(args.w2v_file)
        nn_model = reconstruct_nn(w2v_model)
        atn.load_model(nn_model, args.nn_weight_file)
        print('Models loaded ({:.1f} minutes)'.format((time.time() - ts) / 60))
        while True:
            source = raw_input("EN> ")
            if source == '__EOT__':
                break
            target = nn_model.predict([cp.sentence_to_one_hot(source)])
            print('CH> ' + cp.one_hot_to_sentence(w2v_model, target))
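cp.sentence_to_one_hot and cp.one_hot_to_sentence are project helpers that are not shown. A minimal sentence_to_one_hot over a word-level vocabulary might look like the following sketch; the whitespace tokenization and the word2idx mapping are assumptions.

import numpy as np

def sentence_to_one_hot(sentence, word2idx):
    # Build a (num_words, vocab_size) one-hot matrix for a sentence.
    words = sentence.split()
    matrix = np.zeros((len(words), len(word2idx)), dtype=np.float32)
    for row, word in enumerate(words):
        idx = word2idx.get(word)
        if idx is not None:
            matrix[row, idx] = 1.0
    return matrix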
Example #6
                dim) + '.model'
            word2index = word2vec.run_word2vec(tokens,
                                               embedding_size=dim,
                                               algo=algo,
                                               model_path=model_path)

if do_indexing:

    print('Start indexing')
    import word2vec

    for exp in EXPERIMENTS:
        print('Indexing:', exp)
        model_path = PATH_MODELS + 'model_' + exp['ALGO'] + '_' + str(
            exp['DIM']) + '.model'
        model = word2vec.load_model(model_path)
        tokens = pd.read_hdf(PATH_SELECTED + 'selected.h5')
        name = exp['NAME']
        word2vec.encode_tokens(tokens,
                               model,
                               exp['DIM'],
                               max_words=exp['VOCAB'],
                               name=name,
                               avg_vec=True,
                               store=True,
                               path=PATH_ENCODING)

if do_plots:

    print('Plotting')
    import matplotlib
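The avg_vec=True argument to encode_tokens suggests each document is reduced to the mean of its word vectors. A sketch of that reduction with a gensim KeyedVectors-style model, where the in-vocabulary filtering is an assumption:

import numpy as np

def average_vector(tokens, kv):
    # Mean of the vectors of in-vocabulary tokens; zeros if none match.
    vectors = [kv[tok] for tok in tokens if tok in kv]
    if not vectors:
        return np.zeros(kv.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)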
Example #8
def reload_word2vec():
    global WORD2VEC_MODEL
    log("Starting download and load word2vec pretrained model.....")
    WORD2VEC_MODEL = w2v.load_model(WORD2VEC_MODEL_NAME)
    log("Finished word2vec loading.")