def gridSearch(c_values, k_values, per_epoch=200):
    # Grid search over the number of clusters and the sequence truncation length
    re = {}
    for ci, c_num in enumerate(c_values):
        re[c_num] = {}
        for ki, k_num in enumerate(k_values):
            print(ci * len(k_values) + ki + 1, "/", len(c_values) * len(k_values))

            mng = PathManager("virushare-20-original")

            # findOptK(mng.WordEmbedMatrix(), k_range=(2, 100))
            apiCluster(mng.WordEmbedMatrix(),
                       mng.DataRoot() + "MarkovClusterMapping.json",
                       cluster_num=c_num)
            makeClusteredData(json_path=mng.Folder(),
                              cluster_path=mng.DataRoot() + "MarkovClusterMapping.json",
                              word_map_path=mng.WordIndexMap(),
                              dump_path=mng.DataRoot() + "MarkovClusteredData.npy",
                              max_len=k_num)
            a = scoreMarkovEpisode(clustered_data_path=mng.DataRoot() + "MarkovClusteredData.npy",
                                   epoch=per_epoch,
                                   n_cluster=c_num,
                                   maxlen=k_num,
                                   verbose=False)
            re[c_num][k_num] = a
    return re
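# Usage sketch (illustrative, not part of the original script): runs the grid search
# and persists the result the same way the commented-out driver code in this repo does.
# The helper name and the value ranges below are placeholders; PathManager and dumpJson
# are the repo's own utilities and are assumed to be importable here.
def _gridSearchExample():
    mng = PathManager("virushare-20-original")
    result = gridSearch(c_values=list(range(5, 31, 5)),          # hypothetical candidate cluster counts
                        k_values=[i * 50 for i in range(1, 11)],  # candidate truncation lengths
                        per_epoch=1000)
    # persist the nested {cluster_num: {max_len: score}} dict for later inspection
    dumpJson(result, mng.DataRoot() + "GridSearchExampleResult.json")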
else:
    model_cfg = TrainingConfigManager('../run/runConfig.json')
    modelParams = model_cfg.modelParams()

dataset = SeqFileDataset(path_man.FileData(), path_man.FileSeqLen(), N=N)
dataloader = DataLoader(dataset, batch_size=N, collate_fn=batchSequenceWithoutPad)

if model_name != 'Random':
    state_dict = t.load(path_man.Model() + '_v%s.0' % version)
    word_matrix = state_dict['Embedding.weight']
else:
    word_matrix = t.Tensor(np.load(path_man.WordEmbedMatrix(), allow_pickle=True))

loss_fn = t.nn.NLLLoss().cuda()

if model_name == 'SIMPLE':
    model = SIMPLE(word_matrix, **modelParams)
    model.load_state_dict(state_dict)
elif model_name == 'FT':
    model = FT(class_n, loss_fn, word_matrix, **modelParams)
    model.load_state_dict(state_dict)
elif model_name == 'Random':
    model = FT(class_n, loss_fn, word_matrix, **modelParams)

model = model.cuda()
model.eval()
                              test_path_manager.FileSeqLen(),
                              N)

expand = True if loss_func_name == 'mse' else False

test_task = AdaptEpisodeTask(k, qk, n, N, test_dataset, cuda=True, expand=expand)

stat = TestStatManager(report_cycle=100)

################################################
# ---------------- Model definition and initialization ----------------
################################################
printState('init model...')

word_matrix = t.Tensor(np.load(test_path_manager.WordEmbedMatrix(), allow_pickle=True))
loss = t.nn.NLLLoss().cuda() if loss_func_name == 'nll' else t.nn.MSELoss().cuda()

model = FT(n=n, loss_fn=loss, pretrained_matrix=word_matrix, **modelParams)
# model.load_state_dict(state_dict)
model = model.cuda()

statParamNumber(model)

if os.path.exists(test_path_manager.Doc()):
    deleteDir(test_path_manager.Doc())
os.mkdir(test_path_manager.Doc())

shutil.copy('../models/FT.py', test_path_manager.Doc() + "FT.py")
#                   dump_path=mng.DataRoot() + "MarkovClusteredData.npy",
#                   max_len=seq_len)
# scoreMarkovEpisode(clustered_data_path=mng.DataRoot() + "MarkovClusteredData.npy",
#                    epoch=2000,
#                    n_cluster=n_cluster,
#                    maxlen=seq_len)

# re = gridSearch(c_values=list(range(*n_range)),
#                 k_values=[i * 50 for i in range(1, 11)],
#                 per_epoch=1000)
# dumpJson(re, mng.DataRoot() + "GSs/GridSearchResult-%dshot-%dway-virushare20.json" % (k, n))

# re = loadJson(mng.DataRoot() + "GSs/GridSearchResult-%dshot-%dway-virushare20.json" % (k, n))
# n_cluster, seq_len = extractBestParam(re)
# n_cluster = int(n_cluster)
# seq_len = int(seq_len)

apiCluster(mng.WordEmbedMatrix(),
           mng.DataRoot() + "MarkovClusterMapping.json",
           cluster_num=n_cluster)

makeClusteredData(json_path=mng.Folder(),
                  cluster_path=mng.DataRoot() + "MarkovClusterMapping.json",
                  word_map_path=mng.WordIndexMap(),
                  dump_path=mng.DataRoot() + "MarkovClusteredData.npy",
                  max_len=seq_len)

scoreMarkovEpisode(clustered_data_path=mng.DataRoot() + "MarkovClusteredData.npy",
                   epoch=epoch,
                   n_cluster=n_cluster,
                   maxlen=seq_len)
    if padding:
        pad_matrix = np.zeros((1, model.wv.vectors.shape[1]))
        matrix = np.concatenate((pad_matrix, matrix), axis=0)

    for i, w in enumerate(model.wv.index2word):
        # Index 0 is reserved for the padding token, so all word indices are shifted by 1
        word2index[w] = i + 1 if padding else i
    word2index['<PAD>'] = 0

    if save_matrix_path:
        np.save(save_matrix_path, matrix)
    if save_word2index_path:
        dumpJson(word2index, save_word2index_path)
    if save_matrix_path is None and save_word2index_path is None:
        return matrix, word2index

    printBulletin('Done')


if __name__ == '__main__':
    manager = PathManager(dataset='HKS-api', d_type='all')
    # print(manager.FileData())
    seqs = aggregateApiSequences(manager.Folder())
    trainW2Vmodel(seqs,
                  save_matrix_path=manager.WordEmbedMatrix(),
                  save_word2index_path=manager.WordIndexMap(),
                  size=128)
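# Illustrative sketch (not part of the original script): the artifacts saved above can
# be loaded back the way the test scripts in this repo do, i.e. the embedding matrix
# with np.load(..., allow_pickle=True) and the word-to-index map with loadJson. The
# helper name below is hypothetical.
def _loadW2VArtifactsExample():
    manager = PathManager(dataset='HKS-api', d_type='all')
    # embedding matrix of shape (vocab_size + 1, size); row 0 is the <PAD> vector when padding is enabled
    matrix = np.load(manager.WordEmbedMatrix(), allow_pickle=True)
    word2index = loadJson(manager.WordIndexMap())
    return matrix, word2index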
stat = TestStatManager()

################################################
# ---------------- Model definition and initialization ----------------
################################################
printState('init model...')

if not MODEL_RANDOM_STATE:
    state_dict = t.load(test_path_manager.Model(type=cfg.loadBest()))
    if model_type in ADAPTED_MODELS:
        word_matrix = state_dict['Learner.Embedding.weight']
    else:
        word_matrix = state_dict['Embedding.weight']
else:
    word_matrix = t.Tensor(np.load(test_path_manager.WordEmbedMatrix(), allow_pickle=True))
print("loading done...")

loss = t.nn.NLLLoss().cuda() if loss_func == 'nll' else t.nn.MSELoss().cuda()

if model_type == 'ProtoNet':
    model = ProtoNet(pretrained_matrix=word_matrix, **modelParams)
elif model_type == 'InductionNet':
    model = InductionNet(pretrained_matrix=word_matrix, **modelParams)
elif model_type == 'MetaSGD':
    model = MetaSGD(n=n,
                    loss_fn=loss,
                    pretrained_matrix=word_matrix,
                    **modelParams)
elif model_type == 'ATAML':
    model = ATAML(n=n,