def gridSearch(c_values, k_values, per_epoch=200):
    # Grid search over the number of clusters and the truncation (max sequence) length
    re = {}

    for ci, c_num in enumerate(c_values):
        re[c_num] = {}
        for ki, k_num in enumerate(k_values):
            print(ci * len(k_values) + ki + 1, "/", len(c_values) * len(k_values))

            mng = PathManager("virushare-20-original")

            # findOptK(mng.WordEmbedMatrix(), k_range=(2, 100))
            apiCluster(mng.WordEmbedMatrix(),
                       mng.DataRoot() + "MarkovClusterMapping.json",
                       cluster_num=c_num)
            makeClusteredData(json_path=mng.Folder(),
                              cluster_path=mng.DataRoot() + "MarkovClusterMapping.json",
                              word_map_path=mng.WordIndexMap(),
                              dump_path=mng.DataRoot() + "MarkovClusteredData.npy",
                              max_len=k_num)
            a = scoreMarkovEpisode(clustered_data_path=mng.DataRoot() + "MarkovClusteredData.npy",
                                   epoch=per_epoch,
                                   n_cluster=c_num,
                                   maxlen=k_num,
                                   verbose=False)
            re[c_num][k_num] = a

    return re
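# Illustrative usage sketch (not part of the original script): the candidate grids and
# the output file name below are assumptions chosen for illustration; gridSearch,
# dumpJson and PathManager are the helpers already used in this file.
#
# mng = PathManager("virushare-20-original")
# result = gridSearch(c_values=list(range(5, 55, 5)),          # assumed cluster-count grid
#                     k_values=[i * 50 for i in range(1, 11)],  # truncation lengths 50..500
#                     per_epoch=200)
# dumpJson(result, mng.DataRoot() + "GridSearchResult.json")    # assumed output file name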
#            mode='x',
#            is_dir=True)
# splitDatas(src=man.DatasetBase()+'train/',
#            dest=man.DatasetBase()+'test/',
#            ratio=30,
#            mode='x',
#            is_dir=True)

################################################################
# Build the index-based dataset
################################################################
for d_type in ['train', 'validate', 'test']:
    manager = PathManager(dataset='virushare-20-3gram-tfidf', d_type=d_type)

    makeDataFile(json_path=manager.Folder(),
                 w2idx_path=manager.WordIndexMap(),
                 seq_length_save_path=manager.FileSeqLen(),
                 data_save_path=manager.FileData(),
                 idx2cls_mapping_save_path=manager.FileIdx2Cls(),
                 num_per_class=20,
                 max_seq_len=700)

################################################################
# renameItemFolder('/home/asichurter/datasets/JSONs/LargePE-100-original/')

# Collect sequence-length distribution statistics
################################################################
# apiStat('/home/asichurter/datasets/JSONs/HKS/all/',
#         ratio_stairs=[50, 100, 200, 400, 500, 1000, 2000, 5000],
#         dump_report_path=None,  # '/home/asichurter/datasets/reports/HKS_3gram_tfidf_api_report.json'
#         dump_apiset_path=None,  # '/home/asichurter/datasets/reports/HKS_3gram_tfidf_api_set.json'
#                   dump_path=mng.DataRoot()+"MarkovClusteredData.npy",
#                   max_len=seq_len)
# scoreMarkovEpisode(clustered_data_path=mng.DataRoot()+"MarkovClusteredData.npy",
#                    epoch=2000,
#                    n_cluster=n_cluster,
#                    maxlen=seq_len)

# re = gridSearch(c_values=list(range(*n_range)),
#                 k_values=[i*50 for i in range(1,11)],
#                 per_epoch=1000)
# dumpJson(re, mng.DataRoot()+"GSs/GridSearchResult-%dshot-%dway-virushare20.json"%(k,n))

# re = loadJson(mng.DataRoot()+"GSs/GridSearchResult-%dshot-%dway-virushare20.json"%(k,n))
# n_cluster, seq_len = extractBestParam(re)
# n_cluster = int(n_cluster)
# seq_len = int(seq_len)

apiCluster(mng.WordEmbedMatrix(),
           mng.DataRoot() + "MarkovClusterMapping.json",
           cluster_num=n_cluster)
makeClusteredData(json_path=mng.Folder(),
                  cluster_path=mng.DataRoot() + "MarkovClusterMapping.json",
                  word_map_path=mng.WordIndexMap(),
                  dump_path=mng.DataRoot() + "MarkovClusteredData.npy",
                  max_len=seq_len)
scoreMarkovEpisode(clustered_data_path=mng.DataRoot() + "MarkovClusteredData.npy",
                   epoch=epoch,
                   n_cluster=n_cluster,
                   maxlen=seq_len)
    if padding:
        pad_matrix = np.zeros((1, model.wv.vectors.shape[1]))
        matrix = np.concatenate((pad_matrix, matrix), axis=0)

    for i, w in enumerate(model.wv.index2word):
        # Index 0 is reserved for padding, so all word indices are shifted by 1
        word2index[w] = i + 1 if padding else i
    word2index['<PAD>'] = 0

    if save_matrix_path:
        np.save(save_matrix_path, matrix)
    if save_word2index_path:
        dumpJson(word2index, save_word2index_path)

    if save_matrix_path is None and save_word2index_path is None:
        return matrix, word2index

    printBulletin('Done')


if __name__ == '__main__':
    manager = PathManager(dataset='HKS-api', d_type='all')
    # print(manager.FileData())

    seqs = aggregateApiSequences(manager.Folder())
    trainW2Vmodel(seqs,
                  save_matrix_path=manager.WordEmbedMatrix(),
                  save_word2index_path=manager.WordIndexMap(),
                  size=128)
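    # Illustrative sketch (assumption, not part of the original pipeline): the artifacts
    # saved above can be loaded back with NumPy and the loadJson helper used elsewhere
    # in this project, e.g. before building the index-based dataset:
    #
    # matrix = np.load(manager.WordEmbedMatrix())      # embedding matrix, row 0 = <PAD>
    # word2index = loadJson(manager.WordIndexMap())    # maps API token -> row index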