Example #1
def gridSearch(c_values, k_values, per_epoch=200):  # grid search over cluster count and truncation length
    results = {}
    for ci, c_num in enumerate(c_values):
        results[c_num] = {}
        for ki, k_num in enumerate(k_values):
            # report progress as "current / total" over the whole grid
            print(ci * len(k_values) + ki + 1, "/",
                  len(c_values) * len(k_values))
            mng = PathManager("virushare-20-original")
            # findOptK(mng.WordEmbedMatrix(), k_range=(2,100))
            apiCluster(mng.WordEmbedMatrix(),
                       mng.DataRoot() + "MarkovClusterMapping.json",
                       cluster_num=c_num)
            makeClusteredData(
                json_path=mng.Folder(),
                cluster_path=mng.DataRoot() + "MarkovClusterMapping.json",
                word_map_path=mng.WordIndexMap(),
                dump_path=mng.DataRoot() + "MarkovClusteredData.npy",
                max_len=k_num)
            score = scoreMarkovEpisode(clustered_data_path=mng.DataRoot() +
                                       "MarkovClusteredData.npy",
                                       epoch=per_epoch,
                                       n_cluster=c_num,
                                       maxlen=k_num,
                                       verbose=False)
            results[c_num][k_num] = score

    return results
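
A usage sketch for gridSearch, mirroring the commented-out driver in Example #4 below; the value ranges here are placeholders, and PathManager/dumpJson are the repository's own helpers:

mng = PathManager("virushare-20-original")
results = gridSearch(c_values=list(range(2, 20)),              # placeholder cluster counts
                     k_values=[i * 50 for i in range(1, 11)],  # truncation lengths 50..500
                     per_epoch=1000)
dumpJson(results, mng.DataRoot() + "GridSearchResult.json")
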
Example #2
else:
    model_cfg = TrainingConfigManager('../run/runConfig.json')

modelParams = model_cfg.modelParams()

dataset = SeqFileDataset(path_man.FileData(), path_man.FileSeqLen(), N=N)
dataloader = DataLoader(dataset,
                        batch_size=N,
                        collate_fn=batchSequenceWithoutPad)

if model_name != 'Random':
    state_dict = t.load(path_man.Model() + '_v%s.0' % version)
    word_matrix = state_dict['Embedding.weight']
else:
    word_matrix = t.Tensor(
        np.load(path_man.WordEmbedMatrix(), allow_pickle=True))

loss_fn = t.nn.NLLLoss().cuda()

if model_name == 'SIMPLE':
    model = SIMPLE(word_matrix, **modelParams)
    model.load_state_dict(state_dict)
elif model_name == 'FT':
    model = FT(class_n, loss_fn, word_matrix, **modelParams)
    model.load_state_dict(state_dict)
elif model_name == 'Random':
    model = FT(class_n, loss_fn, word_matrix, **modelParams)

model = model.cuda()
model.eval()
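
The excerpt ends after switching the model to evaluation mode. A minimal follow-on loop, assuming (this is not shown above) that batchSequenceWithoutPad yields (x, y) batches and that the model returns log-probabilities compatible with NLLLoss:

# Hypothetical evaluation loop; the (x, y) batch layout is an assumption.
total_loss, correct, seen = 0.0, 0, 0
with t.no_grad():
    for x, y in dataloader:
        x, y = x.cuda(), y.cuda()
        log_probs = model(x)  # assumed to return log-probabilities
        total_loss += loss_fn(log_probs, y).item() * y.size(0)
        correct += (log_probs.argmax(dim=1) == y).sum().item()
        seen += y.size(0)
print('avg loss %.4f, acc %.4f' % (total_loss / seen, correct / seen))
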
Example #3
                               test_path_manager.FileSeqLen(),
                               N)

expand = (loss_func_name == 'mse')

test_task = AdaptEpisodeTask(k, qk, n, N, test_dataset, cuda=True, expand=expand)

stat = TestStatManager(report_cycle=100)

################################################
#---------------- model definition and initialization ----------------
################################################

printState('init model...')
word_matrix = t.Tensor(np.load(test_path_manager.WordEmbedMatrix(), allow_pickle=True))

loss = t.nn.NLLLoss().cuda() if loss_func_name == 'nll' else t.nn.MSELoss().cuda()

model = FT(n=n, loss_fn=loss,
           pretrained_matrix=word_matrix, **modelParams)

# model.load_state_dict(state_dict)
model = model.cuda()

statParamNumber(model)

# reset the report directory and snapshot the model source next to the results
if os.path.exists(test_path_manager.Doc()):
    deleteDir(test_path_manager.Doc())
os.mkdir(test_path_manager.Doc())
shutil.copy('../models/FT.py', test_path_manager.Doc() + "FT.py")
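
statParamNumber presumably reports the model's parameter count; a self-contained equivalent in plain PyTorch (count_parameters is our name for the helper, not the repository's):

import torch

def count_parameters(model: torch.nn.Module) -> int:
    # count trainable parameters only
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

demo = torch.nn.Linear(128, 10)  # toy module: 128*10 weights + 10 biases
print(count_parameters(demo))    # -> 1290
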
Example #4
    #                   dump_path=mng.DataRoot()+"MarkovClusteredData.npy",
    #                   max_len=seq_len)
    # scoreMarkovEpisode(clustered_data_path=mng.DataRoot()+"MarkovClusteredData.npy",
    #                    epoch=2000,
    #                    n_cluster=n_cluster,
    #                    maxlen=seq_len)

    # re = gridSearch(c_values=list(range(*n_range)),
    #                 k_values=[i*50 for i in range(1,11)],
    #                 per_epoch=1000)
    # dumpJson(re, mng.DataRoot()+"GSs/GridSearchResult-%dshot-%dway-virushare20.json"%(k,n))
    # re = loadJson(mng.DataRoot()+"GSs/GridSearchResult-%dshot-%dway-virushare20.json"%(k,n))
    # n_cluster, seq_len = extractBestParam(re)
    # n_cluster = int(n_cluster)
    # seq_len = int(seq_len)

    # re-cluster the API embedding space with the selected hyper-parameters
    apiCluster(mng.WordEmbedMatrix(),
               mng.DataRoot() + "MarkovClusterMapping.json",
               cluster_num=n_cluster)
    makeClusteredData(json_path=mng.Folder(),
                      cluster_path=mng.DataRoot() +
                      "MarkovClusterMapping.json",
                      word_map_path=mng.WordIndexMap(),
                      dump_path=mng.DataRoot() + "MarkovClusteredData.npy",
                      max_len=seq_len)
    scoreMarkovEpisode(clustered_data_path=mng.DataRoot() +
                       "MarkovClusteredData.npy",
                       epoch=epoch,
                       n_cluster=n_cluster,
                       maxlen=seq_len)
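
extractBestParam is not shown in this excerpt. A plausible self-contained sketch, following the nested {cluster count: {truncation length: score}} layout that gridSearch produces; the implementation is a guess, and the int() casts in the commented driver above hint that the real version sees string keys after a JSON round trip:

def extract_best_param(results):
    # return the (n_cluster, seq_len) pair with the highest score
    flat = ((c, k, s) for c, ks in results.items() for k, s in ks.items())
    best_c, best_k, _ = max(flat, key=lambda t: t[2])
    return best_c, best_k

demo = {5: {50: 0.61, 100: 0.72}, 10: {50: 0.68, 100: 0.70}}
print(extract_best_param(demo))  # -> (5, 100)
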
Example #5
    if padding:
        # row 0 of the matrix is reserved for the padding token
        pad_matrix = np.zeros((1, model.wv.vectors.shape[1]))
        matrix = np.concatenate((pad_matrix, matrix), axis=0)

    for i, w in enumerate(model.wv.index2word):
        # index 0 is reserved for padding, so every word index shifts by 1
        word2index[w] = i + 1 if padding else i
    if padding:
        word2index['<PAD>'] = 0

    if save_matrix_path:
        np.save(save_matrix_path, matrix)

    if save_word2index_path:
        dumpJson(word2index, save_word2index_path)

    if save_matrix_path is None and save_word2index_path is None:
        return matrix, word2index

    printBulletin('Done')


if __name__ == '__main__':
    manager = PathManager(dataset='HKS-api', d_type='all')

    # print(manager.FileData())

    seqs = aggregateApiSequences(manager.Folder())
    trainW2Vmodel(seqs,
                  save_matrix_path=manager.WordEmbedMatrix(),
                  save_word2index_path=manager.WordIndexMap(),
                  size=128)
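
The padding convention above (row 0 of the embedding matrix is the <PAD> vector, all word indices shifted by 1) can be exercised with a small self-contained example; the vocabulary and dimensions are illustrative only:

import numpy as np

# toy 3-word vocabulary with 4-dimensional vectors; PAD occupies row 0
matrix = np.concatenate((np.zeros((1, 4)), np.random.rand(3, 4)), axis=0)
word2index = {'<PAD>': 0, 'open': 1, 'read': 2, 'close': 3}

def encode(seq, max_len):
    # map words to indices, truncate, then right-pad with the PAD index
    idx = [word2index[w] for w in seq][:max_len]
    return idx + [word2index['<PAD>']] * (max_len - len(idx))

print(encode(['open', 'read'], max_len=4))        # -> [1, 2, 0, 0]
print(matrix[encode(['open', 'read'], 4)].shape)  # embedding lookup -> (4, 4)
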
Example #6
stat = TestStatManager()

################################################
#---------------- model definition and initialization ----------------
################################################

printState('init model...')
if not MODEL_RANDOM_STATE:
    state_dict = t.load(test_path_manager.Model(type=cfg.loadBest()))
    if model_type in ADAPTED_MODELS:
        word_matrix = state_dict['Learner.Embedding.weight']
    else:
        word_matrix = state_dict['Embedding.weight']
else:
    word_matrix = t.Tensor(
        np.load(test_path_manager.WordEmbedMatrix(), allow_pickle=True))
print("loading done...")

loss = t.nn.NLLLoss().cuda() if loss_func == 'nll' else t.nn.MSELoss().cuda()

if model_type == 'ProtoNet':
    model = ProtoNet(pretrained_matrix=word_matrix, **modelParams)
elif model_type == 'InductionNet':
    model = InductionNet(pretrained_matrix=word_matrix, **modelParams)
elif model_type == 'MetaSGD':
    model = MetaSGD(n=n,
                    loss_fn=loss,
                    pretrained_matrix=word_matrix,
                    **modelParams)
elif model_type == 'ATAML':
    model = ATAML(n=n,