Example #1
def gridSearch(c_values, k_values, per_epoch=200):  # grid search over the number of clusters and the sequence truncation length
    re = {}
    for ci, c_num in enumerate(c_values):
        re[c_num] = {}
        for ki, k_num in enumerate(k_values):
            print(ci * len(k_values) + ki + 1, "/",
                  len(c_values) * len(k_values))
            mng = PathManager("virushare-20-original")
            # findOptK(mng.WordEmbedMatrix(), k_range=(2,100))
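            # For each (c_num, k_num) pair: re-cluster the API embeddings into
            # c_num clusters, remap the sequences onto cluster indices truncated
            # to k_num, then score the Markov baseline on sampled episodes.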
            apiCluster(mng.WordEmbedMatrix(),
                       mng.DataRoot() + "MarkovClusterMapping.json",
                       cluster_num=c_num)
            makeClusteredData(
                json_path=mng.Folder(),
                cluster_path=mng.DataRoot() + "MarkovClusterMapping.json",
                word_map_path=mng.WordIndexMap(),
                dump_path=mng.DataRoot() + "MarkovClusteredData.npy",
                max_len=k_num)
            a = scoreMarkovEpisode(
                clustered_data_path=mng.DataRoot() + "MarkovClusteredData.npy",
                epoch=per_epoch,
                n_cluster=c_num,
                maxlen=k_num,
                verbose=False)
            re[c_num][k_num] = a

    return re
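
A minimal usage sketch for gridSearch, assuming the project's dumpJson helper (used in the other examples) and purely illustrative search ranges and dump path; the returned dict maps cluster count to truncation length to episode score:

if __name__ == '__main__':
    mng = PathManager("virushare-20-original")
    result = gridSearch(c_values=[10, 20, 30, 40],                # candidate cluster counts (illustrative)
                        k_values=[i * 50 for i in range(1, 11)],  # candidate truncation lengths
                        per_epoch=200)
    # Persist the nested {c_num: {k_num: score}} dict for later inspection.
    dumpJson(result, mng.DataRoot() + "GSs/GridSearchResult-virushare20.json")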
Example #2
#                     f.write(f"> {j+1}\n")
#                     f.write(fa_strs[j]+"\n")
#             except Exception as e:
#                 print(f"len={len(fa_strs)} i={i} j={j} msg={str(e)}")
#                 raise RuntimeError

if __name__ == '__main__':
    mng = PathManager("HKS-api")
    # apiCluster(mng.WordEmbedMatrix(), mng.DataRoot()+"CategoryMapping.json")
    # convertApiCategory(clst_path=mng.DataRoot()+"CategoryMapping.json",
    #                    word_map_path=mng.WordIndexMap(),
    #                    json_path=mng.DatasetBase()+'all-rmsub/',
    #                    str_dump_path=mng.DataRoot()+"CategorizedStringData(rmsub).json")
    # genFamilyProtoByMSA(str_path=mng.DataRoot()+"CategorizedStringData.json",
    #                     work_space="D:/datasets/virushare-20-original/data/family_protos/",
    #                     proto_dump_path=mng.DataRoot()+"FamilyProtos.txt")
    # scoreEpisodeAlignment(str_path=mng.DataRoot()+"CategorizedStringData(rmsub).json",
    #                       epoch=300,
    #                       log_path=mng.DataRoot()+'logs/runlog.txt',
    #                       acc_dump_path=mng.DataRoot()+"logs/Align-Virushare20-%dshot-%dway.json"%(k,n))
    multi_process_align(
        str_path=mng.DataRoot() + "CategorizedStringData(rmsub).json",
        epoch=1000,
        acc_dump_path=mng.DataRoot() + "logs/Align-HKS-%dshot-%dway.json" % (k, n),
        process_num=4)

    # accs = loadJson(mng.DataRoot()+"logs/Align-HKS-%dshot-%dway.json"%(k,n))['acc']
    # print("Avg acc:", sum(accs)/len(accs))
    # print("Interval:", calBeliefeInterval(accs))
    # print("Len:", len(accs))
Example #3
    #                   dump_path=mng.DataRoot()+"MarkovClusteredData.npy",
    #                   max_len=seq_len)
    # scoreMarkovEpisode(clustered_data_path=mng.DataRoot()+"MarkovClusteredData.npy",
    #                    epoch=2000,
    #                    n_cluster=n_cluster,
    #                    maxlen=seq_len)

    # re = gridSearch(c_values=list(range(*n_range)),
    #                 k_values=[i*50 for i in range(1,11)],
    #                 per_epoch=1000)
    # dumpJson(re, mng.DataRoot()+"GSs/GridSearchResult-%dshot-%dway-virushare20.json"%(k,n))
    # re = loadJson(mng.DataRoot()+"GSs/GridSearchResult-%dshot-%dway-virushare20.json"%(k,n))
    # n_cluster, seq_len = extractBestParam(re)
    # n_cluster = int(n_cluster)
    # seq_len = int(seq_len)

    apiCluster(mng.WordEmbedMatrix(),
               mng.DataRoot() + "MarkovClusterMapping.json",
               cluster_num=n_cluster)
    makeClusteredData(json_path=mng.Folder(),
                      cluster_path=mng.DataRoot() + "MarkovClusterMapping.json",
                      word_map_path=mng.WordIndexMap(),
                      dump_path=mng.DataRoot() + "MarkovClusteredData.npy",
                      max_len=seq_len)
    scoreMarkovEpisode(
        clustered_data_path=mng.DataRoot() + "MarkovClusteredData.npy",
        epoch=epoch,
        n_cluster=n_cluster,
        maxlen=seq_len)
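
The commented block above relies on extractBestParam to pick n_cluster and seq_len from the grid-search result. A minimal sketch of such a helper, assuming the {c_num: {k_num: score}} layout produced by gridSearch in Example #1; this is an illustration, not the project's actual implementation:

def extract_best_param(grid_result):
    # Return the (cluster count, truncation length) pair with the highest
    # episode score from a nested {c_num: {k_num: score}} dict.
    best_c, best_k, best_score = None, None, float('-inf')
    for c_num, by_len in grid_result.items():
        for k_num, score in by_len.items():
            if score > best_score:
                best_c, best_k, best_score = c_num, k_num, score
    return best_c, best_k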