def build_query(data_path, w2v_path, vocab_path, k):
    '''Build the expanded query list.

    Args:
        data_path  str  path to the query (topic) file
        w2v_path   str  path to the word2vec model
        vocab_path str  path to the vocabulary pickle
        k          int  number of nearest neighbours to return per term
    Returns:
        query_list list  the expanded query list
    '''
    # Load the word2vec model and the vocabulary.
    w2v_model = load_model(w2v_path)
    vocab = pickle_load(vocab_path)
    query_list = []

    # Parse the XML topic file into per-field lists.
    query_dict = {'disease': [], 'gene': [], 'demographic': [], 'other': []}
    query_dict = xml_parse(data_path, query_dict, 1)
    disease_field_list = query_dict['disease']
    gene_field_list = query_dict['gene']
    demographic_field_list = query_dict['demographic']
    other_field_list = query_dict['other']
    del query_dict

    # Iterate over the queries.
    for i in range(len(disease_field_list)):
        query_tmp_list = []
        # Collect the query terms of one query.
        disease_field_list[i] = preprocess(disease_field_list[i])
        disease_list = disease_field_list[i].split(' ')
        gene_field_list[i] = preprocess(gene_field_list[i])
        gene_list = gene_field_list[i].split(' ')
        other_field_list[i] = preprocess(other_field_list[i])
        other_list = other_field_list[i].split(' ')
        demographic_list = demographic_split(demographic_field_list[i])

        # Lemmatise the raw query terms and remove stop words.
        disease_clean_list = clean_data(disease_list)
        gene_clean_list = clean_data(gene_list)
        demographic_clean_list = clean_data(demographic_list)
        other_clean_list = clean_data(other_list)

        # Query expansion (includes stemming and stop-word removal).
        query_tmp_list.append(
            query_extension(disease_clean_list, w2v_model, vocab, k))
        query_tmp_list.append(
            query_extension(gene_clean_list, w2v_model, vocab, k))
        query_tmp_list.append(
            query_extension(other_clean_list, w2v_model, vocab, k))
        tmp_dict = {}
        for tmp in demographic_clean_list:
            tmp_dict[tmp] = []
        query_tmp_list.append(tmp_dict)
        query_list.append(query_tmp_list)
    return query_list
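# --- Usage sketch (not from the source project): the paths and k below are
# --- hypothetical placeholders showing how build_query might be called.
if __name__ == '__main__':
    expanded = build_query(data_path='data/topics.xml',    # hypothetical path
                           w2v_path='models/w2v.model',    # hypothetical path
                           vocab_path='models/vocab.pkl',  # hypothetical path
                           k=5)
    # Each entry carries the expanded disease, gene and other fields plus a
    # dict of demographic terms.
    print(len(expanded), 'queries built')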
    word2vec_dataset = Word2VecDataset(data, ARGS)
    data_loader = DataLoader(word2vec_dataset,
                             batch_size=ARGS.batch_size,
                             shuffle=True,
                             num_workers=2)
    print("Initializing model ...")
    model = Word2Vec(vocab, ARGS.embed_dim).to(ARGS.device)
    print("Train...")
    train(ARGS, data_loader, model)
elif ARGS.mode in ["ret_words", "eval"]:
    model_path = os.path.join("models", f"ww_{ARGS.ww_size}_{ARGS.freq_thresh}")
    model = load_model(ARGS, model_path)
    print(model)
    print(f"Load docs: filtered_docs/filtered_docs_{ARGS.freq_thresh}.pkl...")
    docs_by_id = data_processing.load_pickle(
        f"filtered_docs/filtered_docs_{ARGS.freq_thresh}.pkl")
    retriever = W2VRetrieval(ARGS, model, docs_by_id)
    if ARGS.mode == "ret_words":
        print(f"Search query: {ARGS.query}")
        if ARGS.eval_mode == "words":
            top_words = retriever.match_query_against_words(ARGS.query)
            print(f"Top {ARGS.top_n} words:", top_words)
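# --- For context only: a generic cosine-similarity ranking over an embedding
# --- matrix. This is a sketch of the standard computation, not necessarily how
# --- W2VRetrieval.match_query_against_words is implemented; embed_matrix,
# --- query_vec and idx2word are assumed inputs.
import numpy as np

def top_n_by_cosine(query_vec, embed_matrix, idx2word, n=10):
    # Normalise the rows and the query, then rank by dot product (cosine similarity).
    row_norms = np.linalg.norm(embed_matrix, axis=1, keepdims=True)
    unit_rows = embed_matrix / np.clip(row_norms, 1e-8, None)
    unit_query = query_vec / max(np.linalg.norm(query_vec), 1e-8)
    sims = unit_rows @ unit_query
    best = np.argsort(-sims)[:n]
    return [(idx2word[i], float(sims[i])) for i in best]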
from data_loader import load_data
from word2vec import load_model
import random
import pickle
import os

w2v_model = load_model()


class MyData(object):
    def __init__(self, path):
        self.path = path
        self.all_data = self.load_data()
        self.sample_num = len(self.all_data)

    def load_data(self):
        # Reuse the cached dataset if it has already been built.
        if os.path.isfile('data.pkl'):
            with open('./data.pkl', 'rb') as f:
                results = pickle.load(f)
        else:
            data = load_data(self.path)
            results = []
            for each in data:
                text = each[-2] + ' ' + each[-3]
                label = int(each[-8])
                # Keep only words covered by the word2vec vocabulary.
                matrix = [
                    w2v_model.wv[word] for word in text.split(' ')
                    if word in w2v_model.wv.vocab
                ]
                results.append([matrix, label])
            with open('./data.pkl', 'wb') as f:
                pickle.dump(results, f)
        return results
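# --- Usage sketch (assumed file layout): build the dataset once, shuffle it,
# --- and take a 90/10 train/test split. The path below is hypothetical.
if __name__ == '__main__':
    dataset = MyData('data/raw_corpus.tsv')  # hypothetical path
    samples = list(dataset.all_data)
    random.shuffle(samples)
    split = int(0.9 * dataset.sample_num)
    train_set, test_set = samples[:split], samples[split:]
    print(len(train_set), 'training samples,', len(test_set), 'test samples')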
import os
import random

import torch.nn as nn
import torch.optim as optim
from torch.utils import data
from tqdm import tqdm

from word2vec import load_model
from extract_func import process_file
from config import key_libs, SEED

random.seed(SEED)
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

log_size = 10
clip = 1
n_epoch = 50
n_tolerance = 2

idx2word, vectors, wv_model = load_model()
word2idx = {word: i for i, word in enumerate(idx2word)}


def create_dataset():
    path = '/home/gezhang/data/jupyter/target'
    files = [f for f in os.listdir(path) if f.endswith('.py')]
    all_funcs = []
    for file in tqdm(files):
        try:
            funcs, _ = process_file(os.path.join(path, file))
            all_funcs.append(funcs)
        except Exception:
            # Skip files that fail to parse.
            pass
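# --- Sketch only: mapping a tokenised function to vocabulary indices with
# --- word2idx. It assumes out-of-vocabulary tokens can fall back to an
# --- unk_idx of 0, which may not match how the real vocabulary was built.
import torch

def tokens_to_tensor(tokens, unk_idx=0):
    # Look up each token; unknown tokens fall back to unk_idx.
    idxs = [word2idx.get(tok, unk_idx) for tok in tokens]
    return torch.tensor(idxs, dtype=torch.long)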
        'depth': cfg.NN_DEPTH,
        'corpus_name': cfg.NN_CORPUS_NAME}
    nn_model = atn.build_nn_model(nn_params)
    return nn_model


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Trading arguments')
    # parser.add_argument('-a', '--nn_architect_file', help='Neural network model file name')
    parser.add_argument('-w', '--nn_weight_file', help='Neural network model file name')
    parser.add_argument('-v', '--w2v_file', help='word2vec model file name')
    args = parser.parse_args()
    # if args.nn_architect_file is None or args.nn_weight_file is None or args.w2v_file is None:
    if args.nn_weight_file is None or args.w2v_file is None:
        print('Please specify model files.')
    else:
        print('Loading models...')
        ts = time.time()
        # nn_model = atn.load_model(args.nn_architect_file, args.nn_weight_file)
        w2v_model = w2v.load_model(args.w2v_file)
        nn_model = reconstruct_nn(w2v_model)
        atn.load_model(nn_model, args.nn_weight_file)
        print('Models loaded ({:.1f} minutes)'.format((time.time() - ts) / 60))
        while True:
            source = raw_input("EN> ")
            if source == '__EOT__':
                break
            target = nn_model.predict([cp.sentence_to_one_hot(source)])
            print('CH> ' + cp.one_hot_to_sentence(w2v_model, target))
            dim) + '.model'
        word2index = word2vec.run_word2vec(tokens,
                                           embedding_size=dim,
                                           algo=algo,
                                           model_path=model_path)

if do_indexing == True:
    print('Start indexing')
    import word2vec
    for exp in EXPERIMENTS:
        print('Indexing:', exp)
        model_path = PATH_MODELS + 'model_' + exp['ALGO'] + '_' + str(
            exp['DIM']) + '.model'
        model = word2vec.load_model(model_path)
        tokens = pd.read_hdf(PATH_SELECTED + 'selected.h5')
        name = exp['NAME']
        word2vec.encode_tokens(tokens,
                               model,
                               exp['DIM'],
                               max_words=exp['VOCAB'],
                               name=name,
                               avg_vec=True,
                               store=True,
                               path=PATH_ENCODING)

if do_plots == True:
    print('Plotting')
    import matplotlib
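# --- For context only: averaging word vectors into a single document vector,
# --- which is presumably what avg_vec=True requests. A generic sketch assuming
# --- a gensim-style model with a .wv attribute, not the actual implementation
# --- of word2vec.encode_tokens.
import numpy as np

def average_vector(words, model, dim):
    # Mean of the vectors of in-vocabulary words; zeros if none are covered.
    vecs = [model.wv[w] for w in words if w in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)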
def reload_word2vec():
    global WORD2VEC_MODEL
    log("Starting to download and load the pretrained word2vec model...")
    WORD2VEC_MODEL = w2v.load_model(WORD2VEC_MODEL_NAME)
    log("Finished loading word2vec.")
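# --- Usage sketch: lazy access to the global model. It assumes WORD2VEC_MODEL
# --- is initialised to None elsewhere; get_word2vec is a hypothetical helper,
# --- not part of the original module.
def get_word2vec():
    # Load the pretrained model on first use and reuse it afterwards.
    if WORD2VEC_MODEL is None:
        reload_word2vec()
    return WORD2VEC_MODEL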