import numpy as np
from pprint import pprint

import config
from builder import buildPlot
from utils.file import loadJson
from utils.manager import PathManager

dataset = 'virushare-20'
version = 32
model = 'ProtoNet'
report_iter = 100
val_episode = 50

pm = PathManager(dataset, version=version, model_name=model)

train_stat_data = loadJson(pm.doc() + 'train_stat.json')
train_config = loadJson(pm.doc() + 'train.json')

config.reloadArbitraryConfig(train_config, reload_config_list=['plot'])
config.plot.Enabled = True
vis = buildPlot(config.plot)

pprint(train_config)

train_metric = np.array(train_stat_data['train']['metrics']).reshape(
    (-1, report_iter))  # assumes metric_num = 1 here
train_loss = np.array(train_stat_data['train']['loss']).reshape(
    (-1, report_iter))

# the total number of validation records depends on validate_episode
val_metric = np.array(train_stat_data['validate']['metrics']).reshape(
print(f"device: {cfg.deviceId()}")
print('*' * 50)

################################################
#----------------------define data----------------------
################################################

expand = False if loss_func == 'nll' else True
loss = t.nn.NLLLoss().cuda() \
    if loss_func == 'nll' else \
    t.nn.MSELoss().cuda()

printState('init managers...')
train_path_manager = PathManager(dataset=data_folder,
                                 d_type='train',
                                 model_name=model_name,
                                 version=version)
val_path_manager = PathManager(dataset=data_folder,
                               d_type='validate',
                               model_name=model_name,
                               version=version)

train_dataset = SeqFileDataset(train_path_manager.FileData(),
                               train_path_manager.FileSeqLen(),
                               N)
val_dataset = SeqFileDataset(val_path_manager.FileData(),
                             val_path_manager.FileSeqLen(),
                             N)
# train_dataset = ImageFileDataset(train_path_manager.FileData(), N, rd_crop_size=224)
# val_dataset = ImageFileDataset(val_path_manager.FileData(), N, rd_crop_size=224)
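# Added sketch (not part of the original script): the `expand` flag above suggests
# that episode labels are converted to one-hot targets when MSELoss is used, while
# NLLLoss consumes raw class indices. The helper name `expandLabels` below is
# hypothetical and only illustrates that conversion.
import torch.nn.functional as F

def expandLabels(labels, n, expand):
    # one-hot float targets for MSELoss, unchanged index targets for NLLLoss
    return F.one_hot(labels, num_classes=n).float() if expand else labels

# e.g. expandLabels(t.arange(5), n=5, expand=True) has shape (5, 5)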
print("Fine-tuning")
print("FineTuning Epoch:", ft_epoch)
print('Used dataset: %s' % data_folder)
print('Version: %d' % version)
print(f"{k}-shot {n}-way")
print(f"device: {cfg.deviceId()}")
print('*' * 50)

################################################
#----------------------define data----------------------
################################################

printState('init managers...')
test_path_manager = PathManager(dataset=data_folder,
                                d_type=USED_SUB_DATASET,
                                version=version)

################################################
#----------------------load model parameters----------------------
################################################

model_cfg = TrainingConfigManager('./runConfig.json')
modelParams = model_cfg.modelParams()

LRDecayIters, LRDecayGamma, optimizer_type, \
weight_decay, loss_func_name, default_lr, lrs, \
taskBatchSize, criteria = model_cfg.trainingParams()

test_dataset = SeqFileDataset(test_path_manager.FileData(),
# splitDatas(src=man.DatasetBase()+'train/',
#            dest=man.DatasetBase()+'validate/',
#            ratio=30,
#            mode='x',
#            is_dir=True)
# splitDatas(src=man.DatasetBase()+'train/',
#            dest=man.DatasetBase()+'test/',
#            ratio=30,
#            mode='x',
#            is_dir=True)

################################################################
# build the index-based data files
################################################################
for d_type in ['train', 'validate', 'test']:
    manager = PathManager(dataset='virushare-20-3gram-tfidf', d_type=d_type)

    makeDataFile(json_path=manager.Folder(),
                 w2idx_path=manager.WordIndexMap(),
                 seq_length_save_path=manager.FileSeqLen(),
                 data_save_path=manager.FileData(),
                 idx2cls_mapping_save_path=manager.FileIdx2Cls(),
                 num_per_class=20,
                 max_seq_len=700)

################################################################

# renameItemFolder('/home/asichurter/datasets/JSONs/LargePE-100-original/')

# collect statistics on the sequence length distribution
################################################################
# apiStat('/home/asichurter/datasets/JSONs/HKS/all/',
    for ck, cv in re.items():
        for kk, kv in cv.items():
            if kv > best_acc:
                best_acc = kv
                best_c = ck
                best_k = kk

    return best_c, best_k


if __name__ == "__main__":
    epoch = 5000
    seq_len = 50
    n_cluster = 30
    n_range = (15, 30)

    mng = PathManager("HKS-api")
    #
    # # findOptK(mng.WordEmbedMatrix(), k_range=(2,100))
    # apiCluster(mng.WordEmbedMatrix(), mng.DataRoot()+"MarkovClusterMapping.json", cluster_num=n_cluster)
    # makeClusteredData(json_path=mng.Folder(),
    #                   cluster_path=mng.DataRoot()+"MarkovClusterMapping.json",
    #                   word_map_path=mng.WordIndexMap(),
    #                   dump_path=mng.DataRoot()+"MarkovClusteredData.npy",
    #                   max_len=seq_len)
    # scoreMarkovEpisode(clustered_data_path=mng.DataRoot()+"MarkovClusteredData.npy",
    #                    epoch=2000,
    #                    n_cluster=n_cluster,
    #                    maxlen=seq_len)
    # re = gridSearch(c_values=list(range(*n_range)),
    #                 k_values=[i*50 for i in range(1,11)],
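    # Added illustration (hypothetical values): the selection loop at the top of this
    # file expects `re` to be a nested dict of {cluster_count: {k: accuracy}}.
    # For example, with
    #     re = {15: {50: 0.61, 100: 0.64},
    #           20: {50: 0.66, 100: 0.63}}
    # the loop would return best_c=20, best_k=50.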
#                 continue
#
#             items = random.sample(candidates, num_constrain)
#
#             for item in items:
#                 shutil.copy(src_path+folder+'/'+item,
#                             dst_path+folder+'/'+item)
#
#             reporter.logSuccess()
#
#         except Exception as e:
#             reporter.logError(entity=folder+'/'+item,
#                               msg=str(e))


if __name__ == '__main__':
    manager = PathManager(dataset='virushare_20', d_type='all')

    '''
    Call order: extract -> mapping -> removeRedundance -> (ngram) -> apiStat -> stat_classes -> collect
    '''

    # According to the PE split, pick the selected files out of the JSON dataset and collect them into folders
    #----------------------------------------------------------------
    # src_path = '/home/asichurter/datasets/JSONs/jsons - 副本/'
    # json_list_path = '/home/asichurter/datasets/PEs/virushare-20-after-increm/all/'
    # dst_path = '/home/asichurter/datasets/JSONs/virushare-50-original/'
    #
    # item_list = []
    # for folder in os.listdir(json_list_path):
    #     item_list += os.listdir(json_list_path+folder+'/')
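    # Added sketch (illustrative, not in the original script): a possible continuation
    # of the commented collection snippet above, mirroring the copy loop earlier in
    # this file and using only os/shutil.
    # for folder in os.listdir(src_path):
    #     for item in os.listdir(src_path + folder + '/'):
    #         if item in item_list:
    #             os.makedirs(dst_path + folder + '/', exist_ok=True)
    #             shutil.copy(src_path + folder + '/' + item,
    #                         dst_path + folder + '/' + item)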
    if padding:
        pad_matrix = np.zeros((1, model.wv.vectors.shape[1]))
        matrix = np.concatenate((pad_matrix, matrix), axis=0)

    for i, w in enumerate(model.wv.index2word):
        # index 0 is reserved for padding, so all word indices are shifted by 1
        word2index[w] = i + 1 if padding else i
    word2index['<PAD>'] = 0

    if save_matrix_path:
        np.save(save_matrix_path, matrix)
    if save_word2index_path:
        dumpJson(word2index, save_word2index_path)
    if save_matrix_path is None and save_word2index_path is None:
        return matrix, word2index

    printBulletin('Done')


if __name__ == '__main__':
    manager = PathManager(dataset='HKS-api', d_type='all')
    # print(manager.FileData())
    seqs = aggregateApiSequences(manager.Folder())
    trainW2Vmodel(seqs,
                  save_matrix_path=manager.WordEmbedMatrix(),
                  save_word2index_path=manager.WordIndexMap(),
                  size=128)
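    # Added sketch (not part of the original script): once the matrix and word map are
    # saved, embedding an API sequence is a plain index lookup. `encode`, `max_len`
    # and the example API names are illustrative; unknown tokens fall back to the
    # padding index here purely for simplicity.
    import numpy as np
    from utils.file import loadJson

    matrix = np.load(manager.WordEmbedMatrix())
    word2index = loadJson(manager.WordIndexMap())

    def encode(seq, max_len=300):
        idx = [word2index.get(w, 0) for w in seq][:max_len]
        idx += [0] * (max_len - len(idx))      # pad with the '<PAD>' row (index 0)
        return matrix[idx]                     # shape: (max_len, embedding_dim)

    print(encode(['NtCreateFile', 'NtClose']).shape)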
# -*- coding: utf-8 -*-

from preliminaries.dataset import makeDataFile
from utils.manager import PathManager

dataset_name = '...'
item_per_family = 20
max_seq_len = 300

for d_type in ['train', 'validate', 'test']:
    manager = PathManager(dataset=dataset_name, d_type=d_type)

    makeDataFile(json_path=manager.Folder(),
                 w2idx_path=manager.WordIndexMap(),
                 seq_length_save_path=manager.FileSeqLen(),
                 data_save_path=manager.FileData(),
                 idx2cls_mapping_save_path=manager.FileIdx2Cls(),
                 num_per_class=item_per_family,
                 max_seq_len=max_seq_len)
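# Added sanity check (a sketch, not in the original script). It assumes FileData()
# points to a .npy matrix of padded index sequences and FileSeqLen() to a JSON
# collection of per-sample lengths; both are assumptions about the on-disk formats.
import numpy as np
from utils.file import loadJson

for d_type in ['train', 'validate', 'test']:
    manager = PathManager(dataset=dataset_name, d_type=d_type)
    data = np.load(manager.FileData())
    lengths = loadJson(manager.FileSeqLen())
    print(d_type, data.shape, len(lengths))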
#         log_dump_path='/home/omnisky/NewAsichurter/FusionData/reports/Per40_3gram_stat.json')
#
# mapAndExtractTopKNgram(dir_path='/home/omnisky/NewAsichurter/FusionData/api/LargePE-API-Per40/',
#                        ngram_stat_log_path='/home/omnisky/NewAsichurter/FusionData/reports/Per40_3gram_stat.json',
#                        K=5000,
#                        N=3,
#                        class_dir=True,
#                        map_dump_path='/home/omnisky/NewAsichurter/FusionData/reports/Per40_3gram_map.json')

# makeDatasetDirStruct(base_path='/home/omnisky/NewAsichurter/FusionData/datasets/LargePE-Per35/')
#
# sampleClassWiseData(dst_path='/home/omnisky/NewAsichurter/FusionData/datasets/LargePE-Per35/all/api/',
#                     log_file_path="/home/omnisky/NewAsichurter/FusionData/reports/class_stat_after_processing_log.json",
#                     num_per_class=35)

# pm = PathManager(dataset="virushare-20")
# trainGloVe(base_path=pm.rootBase(),
#            dataset='virushare-20',
#            size=300,
#            type='all')

# renamePEbyMD5fromApi(api_dir_path='/home/omnisky/NewAsichurter/FusionData/datasets/LargePE-Per40/all/api/',
#                      pe_dir_path='/home/omnisky/NewAsichurter/FusionData/datasets/LargePE-Per40/all/pe/')

# convertDir2Image(dir_path='F:/FSL_mal_data/datasets/virushare-20/all/pe/',
#                  dst_path='F:/FSL_mal_data/datasets/virushare-20/all/img/')

# splitDataset(dataset_path=pm.datasetBase(),
#              validate_ratio=20,
#              test_ratio=20)
# }
# for ver, conf in tasks.items():
#     machine.addTask('train', flatten_update_config=conf)
#     machine.addTask('test', flatten_update_config={
#         'task|version': ver,
#         'load_type': 'best',
#     })
#     machine.addTask('test', flatten_update_config={
#         'task|version': ver,
#         'load_type': 'last',
#     })

s_ver, e_ver = 318, 326
for ver in range(s_ver, e_ver + 1):
    pm = PathManager(dataset='virushare-20', version=ver)
    if os.path.exists(pm.doc() + 'test_result.json'):
        os.remove(pm.doc() + 'test_result.json')

    machine.addTask('test', flatten_update_config={
        'task|version': ver,
        'load_type': 'best',
    })
    machine.addTask('test', flatten_update_config={
        'task|version': ver,
        'load_type': 'last',
    })

# machine.addTask('train',
#                 {
print('*' * 50)
print('Model Name: %s' % model_type)
print('Used dataset: %s' % data_folder)
print('Version: %d' % version)
print(f"{k}-shot {n}-way")
print(f"device: {cfg.deviceId()}")
print("Is Random Test:", MODEL_RANDOM_STATE)
print('*' * 50)

################################################
#----------------------define data----------------------
################################################

printState('init managers...')
test_path_manager = PathManager(dataset=data_folder,
                                d_type=USED_SUB_DATASET,
                                model_name=model_name,
                                version=version)

################################################
#----------------------load model parameters----------------------
################################################

param_config_path = 'runConfig.json' if MODEL_RANDOM_STATE else test_path_manager.Doc() + 'config.json'

model_cfg = TrainingConfigManager(param_config_path)
modelParams = model_cfg.modelParams()

LRDecayIters, LRDecayGamma, optimizer_type, \
weight_decay, loss_func, default_lr, lrs, \
taskBatchSize, criteria = model_cfg.trainingParams()
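# Added guard (illustrative, not in the original script): outside of random-test mode
# the parameters come from the per-version config saved at training time, so it is
# worth checking explicitly that the saved file exists.
import os
assert MODEL_RANDOM_STATE or os.path.exists(param_config_path), \
    f"saved training config not found: {param_config_path}"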
            for item in items:
                shutil.copy(pj(base_dataset_path, tp, folder, item),
                            pj(new_dataset_path, tp, folder, item))

    shutil.copy(base_dataset_path + 'data/matrix.npy',
                new_dataset_path + '/data/matrix.npy')
    shutil.copy(base_dataset_path + 'data/wordMap.json',
                new_dataset_path + '/data/wordMap.json')


if __name__ == '__main__':
    original_dataset_name = 'virushare-20-3gram-tfidf'
    new_dataset_name = 'virushare-20-3gram-tfidf-general'
    N = 10
    seq_len = 200

    pm = PathManager(dataset=original_dataset_name)
    makeGeneralTestDataset(base_dataset_path=pm.DatasetBase(),
                           new_dataset_path=pm.ParentBase() + new_dataset_name + '/',
                           train_num_per_class=N,
                           include_test=False)

    for d_type in ['train', 'validate', 'test']:
        manager = PathManager(dataset=new_dataset_name, d_type=d_type)

        makeDataFile(json_path=manager.Folder(),
                     w2idx_path=manager.WordIndexMap(),
                     seq_length_save_path=manager.FileSeqLen(),
                     data_save_path=manager.FileData(),
                     idx2cls_mapping_save_path=manager.FileIdx2Cls(),
                     num_per_class=N,
                     max_seq_len=seq_len)
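    # Added check (illustrative, not in the original script): makeGeneralTestDataset is
    # expected to copy the shared embedding matrix and word map into the new dataset,
    # so verify both files are present before the generated data files are used.
    import os
    for shared in ('data/matrix.npy', 'data/wordMap.json'):
        shared_path = pm.ParentBase() + new_dataset_name + '/' + shared
        assert os.path.exists(shared_path), 'missing shared file: ' + shared_path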