def _splitDatas(src, dest, ratio, mode='x', is_dir=False):
    '''
    Randomly sample items from a source folder and move/copy them into a
    destination folder (used to build train/validation splits).

    :param src: source folder path (expected to end with a path separator)
    :param dest: destination folder path (expected to end with a path separator)
    :param ratio: split ratio if 0 < ratio < 1, an absolute item count if >= 1,
                  or any negative value to select everything
    :param mode: 'c' to copy, 'x' to move (cut)
    :param is_dir: whether each item is a directory (copied via copytree in copy mode)
    '''
    assert mode in ['c', 'x'], '选择的模式错误,只能复制c或者剪切x'

    all_items = os.listdir(src)

    # Resolve the requested sample size from the ratio flag.
    if ratio < 0:
        # negative flag means: take all items
        size = len(all_items)
    elif 1 > ratio > 0:
        size = int(len(all_items) * ratio)
    else:
        size = ratio

    assert len(all_items) >= size, '分割时,总数量没有要求的数量大!'

    random.seed(magicSeed())
    samples_names = random.sample(all_items, size)

    # Fix: iterate only the sampled names — the original scanned every item and
    # tested membership in a list (O(len(all_items) * size)) while keeping an
    # unused counter.
    for item in tqdm(samples_names):
        path = src + item
        if mode == 'x':
            shutil.move(path, dest)
        else:
            if is_dir:
                shutil.copytree(src=path, dst=dest + item)
            else:
                shutil.copy(src=path, dst=dest)
def parseAndSampleDataset(scale_report_path, base_path, dst_path, num_per_class, checked=True):
    """
    Sample num_per_class items for every family in the scale report that is
    large enough, copy them into a per-family folder under dst_path, and
    optionally verify that every resulting folder holds exactly num_per_class
    items.
    """
    scale_report = loadJson(scale_report_path)

    for family_name in tqdm(scale_report):
        members = scale_report[family_name]
        # skip families that cannot supply the required number of samples
        if len(members) < num_per_class:
            continue

        random.seed(magicSeed())
        candidates = random.sample(members, num_per_class)

        family_dir = dst_path + family_name + '/'
        if os.path.exists(family_dir):
            raise RuntimeError("%s 类的文件夹在目标路径中已存在!" % (family_name))
        os.mkdir(family_dir)

        for item in candidates:
            # item is "<folder>/<name>"; the copy is renamed "<name>.<folder>"
            folder_name, item_name = item.split("/")
            shutil.copy(base_path + item,
                        family_dir + item_name + '.' + folder_name)

    if checked:
        reporter = Reporter()
        for folder in os.listdir(dst_path):
            count = len(os.listdir(dst_path + folder + '/'))
            if count != num_per_class:
                reporter.logError(entity=folder,
                                  msg="数量不足预期: %d/%d" % (count, num_per_class))
            else:
                reporter.logSuccess()
        reporter.report()
def splitMetaBatch(meta_data, meta_label, batch_num, max_sample_num, sample_num, meta_len=None):
    '''
    Yield batch_num meta-mini-batches. The data is assumed to be laid out in
    contiguous class chunks of max_sample_num items; sample_num items are
    randomly drawn from each chunk per batch.

    :param meta_data: indexable data tensor/array
    :param meta_label: indexable label tensor/array (same length as meta_data)
    :param batch_num: number of batches to yield
    :param max_sample_num: items per class chunk in meta_data
    :param sample_num: items to draw from each chunk per batch
    :param meta_len: optional per-item lengths; when given, each batch also
                     yields the selected lengths as a list
    '''
    # Bug fix: the original unconditionally ran len(meta_len) and
    # t.LongTensor(meta_len), so the documented default meta_len=None crashed
    # before the later `is not None` check could ever matter.
    assert len(meta_data) == len(meta_label)
    if meta_len is not None:
        assert len(meta_data) == len(meta_len)
        meta_len = t.LongTensor(meta_len)

    index_pool = [i for i in range(len(meta_data))]

    for _ in range(batch_num):
        # for each meta-mini-batch, sample sample_num items per class chunk
        index = []
        for start_i in range(0, len(meta_data), max_sample_num):
            rd.seed(magicSeed())
            index += rd.sample(index_pool[start_i:start_i + max_sample_num],
                               sample_num)

        if meta_len is not None:
            yield meta_data[index], meta_label[index], meta_len[index].tolist()
        else:
            yield meta_data[index], meta_label[index]
def getTaskSampler(self, label_space, seed=None):
    """
    Build the support/query episode samplers for one task over label_space.
    Caches the sampling seed in self.SamplingSeedCache so the task can be
    replayed.
    """
    sampling_seed = seed if seed is not None else magicSeed()
    self.SamplingSeedCache = sampling_seed

    k, qk, n, N = self.readParams()

    # one per-class seed so support and query samplers agree on the split
    seed_for_each_class = randomList(num=len(label_space),
                                     seed=sampling_seed,
                                     allow_duplicate=True)

    support_sampler = EpisodeSamlper(k, qk, N, seed_for_each_class,
                                     mode='support',
                                     label_space=label_space,
                                     shuffle=False)
    # shuffle=False keeps query-set labels clustered per class
    query_sampler = EpisodeSamlper(k, qk, N, seed_for_each_class,
                                   mode='query',
                                   label_space=label_space,
                                   shuffle=False)
    return support_sampler, query_sampler
def getTaskSampler(self, label_space, isTrain, seed=None):
    """
    Build Reptile support/query samplers for one task; training tasks get no
    query sampler (query_sampler is None when isTrain is True).
    """
    task_seed = seed if seed is not None else magicSeed()
    k, qk, n, N = self.readParams()

    # one per-class seed shared by the support and query samplers
    seed_for_each_class = randomList(num=len(label_space),
                                     seed=task_seed,
                                     allow_duplicate=True)

    support_sampler = ReptileSamlper(N, k, seed_for_each_class,
                                     mode='support', label_space=label_space)
    query_sampler = None if isTrain else ReptileSamlper(
        N, k, seed_for_each_class, mode='query', label_space=label_space)

    return support_sampler, query_sampler
def _makeBatchSampler(self, task_seq_seed=None, sampling_seq_seed=None):
    # Precompute the whole training run's support/query index batches:
    # one label space and one index batch per epoch, all derived from two
    # top-level seeds so the full sequence is reproducible.
    k, qk, n, N = self._readParams()
    if task_seq_seed is None:
        task_seq_seed = magicSeed()
    if sampling_seq_seed is None:
        sampling_seq_seed = magicSeed()
    # sample one task seed per epoch
    task_seq_seeds = randList(self.TotalEpoch, seed=task_seq_seed)
    # sample one sampling seed per epoch
    sampling_seq_seeds = randList(self.TotalEpoch, seed=sampling_seq_seed)
    # use the task seeds to draw each epoch's label space
    label_space_seq = []
    for seed in task_seq_seeds:
        label_space = self._getLabelSpace(seed)
        label_space_seq.append(label_space)
    sampled_support_indexes_seq = []
    sampled_query_indexes_seq = []
    for label_space, sampling_seed in zip(label_space_seq, sampling_seq_seeds):
        # for one label space, draw a seed for every class in it
        class_wise_seeds = randList(num=len(label_space), seed=sampling_seed, allow_duplicate=True)
        support_episode_items = []
        query_episode_items = []
        for class_, class_seed in zip(label_space, class_wise_seeds):
            # permute the N in-class offsets; dataset indexes are class_*N + offset
            perm = randPermutation(N, class_seed)
            # first k entries of the permutation are the support-set offsets
            support_items = [class_ * N + i for i in perm[:k]]
            # entries k .. k+qk are the query-set offsets (disjoint from support)
            query_items = [class_ * N + i for i in perm[k:k + qk]]
            support_episode_items.extend(support_items)
            query_episode_items.extend(query_items)
        sampled_support_indexes_seq.append(support_episode_items)
        sampled_query_indexes_seq.append(query_episode_items)
    support_batch_sampler = BatchSampler(sampled_support_indexes_seq)
    query_batch_sampler = BatchSampler(sampled_query_indexes_seq)
    return support_batch_sampler, query_batch_sampler
def __iter__(self):
    """Yield absolute dataset indexes (class * N + offset), optionally shuffled."""
    # flatten the per-class instance offsets into absolute indexes
    batch = [self.N * c + i
             for c, instances in self.instances.items()
             for i in instances]
    if self.shuffle:
        rd.seed(magicSeed())
        rd.shuffle(batch)
    return iter(batch)
def getLabelSpace(self, seed=None):
    """
    Randomly pick n classes from the dataset to form the task's label space.
    The seed actually used is cached in self.TaskSeedCache for replay.
    """
    if seed is None:
        seed = magicSeed()
    self.TaskSeedCache = seed
    rd.seed(seed)
    all_classes = list(range(self.Dataset.ClassNum))
    return rd.sample(all_classes, self.Params['n'])
def getRandomColor(num, more=False):
    """
    Sample num distinct colors. With more=True, draw from the full named-color
    table (cnames); otherwise draw from the smaller predefined colors list.
    """
    rd.seed(magicSeed())
    pool = list(cnames.keys()) if more else colors
    return rd.sample(pool, num)
def _getLabelSpace(self, seed=None):
    '''
    Randomly sample n classes from the dataset to form a task label space.

    The seed actually used is cached in self.TaskSeedCache so the same task
    can be replayed later.

    :param seed: RNG seed; a fresh magicSeed() is drawn when None
    :return: list of self.Params['n'] sampled class ids
    '''
    # Cleanup: removed the large commented-out alternative implementation and
    # the stale debug print that previously cluttered this method.
    seed = magicSeed() if seed is None else seed
    self.TaskSeedCache = seed
    rd.seed(seed)
    classes_list = [i for i in range(self.Dataset.TotalClassNum)]
    sampled_classes = rd.sample(classes_list, self.Params['n'])
    return sampled_classes
def __init__(self, k, qk, N, class_seeds, mode, label_space, shuffle=False):
    '''
    Sampler producing the support (sample) set or query (test) set indexes of
    an episode. Support and query items come from the same classes and are
    both obtained by seeded sampling, so the query sampler can reproduce the
    support draw and exclude it.

    :param k: support items per class
    :param qk: query items per class
    :param N: maximum number of samples per class
    :param class_seeds: one RNG seed per class in label_space
    :param mode: 'support' or 'query'
    :param label_space: the classes selected for this episode
    :param shuffle: whether to shuffle the emitted order
    '''
    self.LabelSpace = label_space
    self.N = N
    self.shuffle = shuffle
    self.instances = dict.fromkeys(label_space)
    if mode == 'support':
        # draw each class's support indexes from that class's seed
        for cla, seed in zip(label_space, class_seeds):
            rd.seed(seed)
            self.instances[cla] = set(rd.sample(range(N), k))
    elif mode == 'query':
        for cla, seed in zip(label_space, class_seeds):
            # reuse the support seed so the support draw is reproduced exactly
            rd.seed(seed)
            train_instances = set(rd.sample(range(N), k))
            # query candidates are everything NOT in the support set
            test_instances = set(range(N)).difference(train_instances)
            rd.seed(magicSeed())
            # Bug fix: random.sample() on a set is deprecated since Python 3.9
            # and raises TypeError since 3.11; sort for a deterministic
            # population order under the injected seed.
            self.instances[cla] = rd.sample(sorted(test_instances), qk)
    else:
        raise ValueError('不支持的类型: %s' % mode)
def getTaskSampler(label_space, k, qk, N):
    """
    Create the support/query episode samplers for one task; the query sampler
    shuffles its output, the support sampler does not.
    """
    # a shared per-class seed list keeps both samplers' splits consistent
    seed_for_each_class = randomList(num=len(label_space),
                                     seed=magicSeed(),
                                     allow_duplicate=True)

    support_sampler = EpisodeSamlper(k, qk, N, seed_for_each_class,
                                     mode='support',
                                     label_space=label_space,
                                     shuffle=False)
    query_sampler = EpisodeSamlper(k, qk, N, seed_for_each_class,
                                   mode='query',
                                   label_space=label_space,
                                   shuffle=True)
    return support_sampler, query_sampler
def _getTaskSampler(self, label_space, seed=None):
    """
    Build the support and query EpisodeSampler instances for label_space,
    caching the sampling seed in self.SamplingSeedCache for replay.
    """
    sampling_seed = seed if seed is not None else magicSeed()
    self.SamplingSeedCache = sampling_seed

    k, qk, n, N = self._readParams()

    # draw one sampling seed per class in the label space
    class_seeds = randList(num=len(label_space),
                           seed=sampling_seed,
                           allow_duplicate=True)

    # query mode keeps labels clustered per class
    return (EpisodeSampler(k, qk, N, label_space, class_seeds, mode='support'),
            EpisodeSampler(k, qk, N, label_space, class_seeds, mode='query'))
def collectScaleClasses(data_path, dst_path, num_per_class=50, least_api_len=10, exception=('SINGLETON',)):
    '''
    For every family folder under data_path (except those in `exception`),
    keep only the reports whose 'apis' sequence is at least least_api_len long;
    if enough remain, copy num_per_class randomly-chosen ones into a matching
    folder under dst_path.

    :param data_path: root folder of per-family report folders
    :param dst_path: destination root (per-family folders are created here)
    :param num_per_class: number of reports to copy per qualifying family
    :param least_api_len: minimum length of a report's 'apis' list
    :param exception: family folder names to skip entirely
        (fix: default changed from a mutable list to an equivalent tuple —
        callers passing their own sequence are unaffected)
    '''
    for folder in tqdm(os.listdir(data_path)):
        if folder in exception:
            continue
        src_dir = data_path + folder + '/'
        # filter out samples below the minimum api-sequence length
        item_list = []
        for item in os.listdir(src_dir):
            report = loadJson(src_dir + item)
            if len(report['apis']) >= least_api_len:
                item_list.append(item)
        # only keep families that can supply enough qualified samples
        if len(item_list) >= num_per_class:
            rd.seed(magicSeed())
            candidate_list = rd.sample(item_list, num_per_class)
            os.mkdir(dst_path + folder + '/')
            for item in candidate_list:
                shutil.copy(src_dir + item,
                            dst_path + folder + '/' + item)
from utils.magic import magicSeed # *********************************************************** data_dataset_name = "HKS" model_dataset_name = "HKS" dataset_subtype = 'test' model_name = 'ProtoNet' version = 308 N = 20 plot_option = 'entire'#'entire' k, n, qk = 10, 5, 10 figsize = (6,6) task_seed = magicSeed()#4160148##4488524#4524396 sampling_seed = 5791326#magicSeed()#4160164##4488540#4524414 # SIMPLE seed: 5331044 axis_off = True plot_support_independently = False max_plot_class = 20 selected_idxes = [6,8,10,16,18] reducer = 'tsne' # *************************************************************************** data_path_man = PathManager(dataset=data_dataset_name, d_type=dataset_subtype) model_path_man = PathManager(dataset=model_dataset_name, version=version, model_name=model_name)
def scoreEpisodeAlignment(matrix, acc_queue, process_id, epoch=1000, log_path=None, verbose=False):
    '''
    Worker that repeatedly samples few-shot episodes from `matrix`, scores
    every query against every support item via align(), predicts the class
    with the highest summed score, and pushes each episode's accuracy into
    acc_queue.

    Relies on module-level episode parameters N, n, k, qk and helpers
    (magicSeed, nRandom, sample, align, StepTimer).

    :param matrix: sequence of items, laid out in contiguous class chunks of N
    :param acc_queue: queue receiving one accuracy float per episode
    :param process_id: worker id used only for log prefixes
    :param epoch: number of episodes to run
    :param log_path: optional file path for align() output (stdout when None)
    :param verbose: print per-comparison progress
    '''
    # Cleanup: removed the dead commented-out accuracy-dump code.
    class_pool = list(range(len(matrix) // N))
    item_pool = set(range(N))

    out = sys.stdout if log_path is None else open(log_path, "w")
    tm = StepTimer(epoch)
    tm.begin()
    try:
        for i in range(epoch):
            print("Process", process_id, ":", "Epoch", i)
            supports = []
            queries = []

            task_seed = magicSeed()
            sampling_seed = magicSeed()
            class_seeds = nRandom(n, sampling_seed)
            label_space = sample(class_pool, n, task_seed)

            for cls, cseed in zip(label_space, class_seeds):
                support_idxes = sample(item_pool, k, cseed, return_set=True)
                # query items are drawn from the remainder of the class chunk
                query_idxes = sample(item_pool.difference(support_idxes), qk, cseed, return_set=True)
                support_idxes = np.array(list(support_idxes)) + N * cls
                query_idxes = np.array(list(query_idxes)) + N * cls
                supports += [matrix[i] for i in support_idxes]
                queries += [matrix[i] for i in query_idxes]

            correct_count = 0
            for qi, query in enumerate(queries):
                scores = []
                for si, support in enumerate(supports):
                    if verbose:
                        print("Process", process_id, ":", qi * n * k + si, "/", n * qk * k * n)
                    scores.append(align(support, query, out))
                # sum the k support scores within each class; best class wins
                scores = np.array(scores).reshape(n, k).sum(-1)
                predict = np.argmax(scores)
                # queries are grouped per class, qk at a time
                correct_count += (predict == (qi // qk))

            epoch_acc = correct_count / (n * qk)
            acc_queue.put(epoch_acc)
            tm.step()
    finally:
        # Fix: close the log file opened above — the original leaked it.
        if out is not sys.stdout:
            out.close()
def sampleLabelSpace(dataset, n):
    """Randomly choose n class ids from the dataset's class range."""
    rd.seed(magicSeed())
    return rd.sample(list(range(dataset.ClassNum)), n)
def scoreMarkovEpisode(clustered_data_path, epoch=300, n_cluster=10, maxlen=1000, verbose=True):
    # Evaluate few-shot classification accuracy with a two-level Markov
    # transition model built per episode. Relies on module-level episode
    # parameters N, n, k, qk and helpers (magicSeed, nRandom, sample,
    # makeTranMatrix, traverse, scoreSeqOnTranMats, StepTimer,
    # calBeliefeInterval). Returns the mean accuracy over all epochs.
    acc_hist = []
    matrix = np.load(clustered_data_path)
    class_pool = list(range(len(matrix) // N))
    item_pool = set(range(N))
    tm = StepTimer(epoch)
    tm.begin()
    for i in range(epoch):
        if verbose:
            print("Epoch", i)
        supports = []
        queries = []
        task_seed = magicSeed()
        sampling_seed = magicSeed()
        class_seeds = nRandom(n, sampling_seed)
        label_space = sample(class_pool, n, task_seed)
        for cls, cseed in zip(label_space, class_seeds):
            support_idxes = sample(item_pool, k, cseed, return_set=True)
            # query indexes are drawn from the class chunk minus the support set
            query_idxes = sample(item_pool.difference(support_idxes), qk, cseed, return_set=True)
            support_idxes = np.array(list(support_idxes)) + N * cls
            query_idxes = np.array(list(query_idxes)) + N * cls
            supports += [matrix[i] for i in support_idxes]
            queries += [matrix[i] for i in query_idxes]
        supports = np.array(supports).reshape(n, k, -1)
        queries = np.array(queries).reshape(n, qk, -1)
        # build each class's cluster-transition matrix from the raw
        # cluster-id sequences of its support items
        cluster_tran_mats = []
        for cls in range(n):
            cluster_tran_mats.append(
                makeTranMatrix(supports[cls], n_cluster=n_cluster, maxlen=maxlen))
        cluster_tran_mats = np.stack(cluster_tran_mats, axis=0)
        # translate each support sequence into a class-transition sequence by
        # following the highest-probability state transitions of each class matrix
        class_tran_seqs = []
        for cls in range(n):
            for support in supports[cls]:
                class_tran_seqs.append(
                    traverse(support, cluster_tran_mats, maxlen))
        class_tran_seqs = np.stack(class_tran_seqs, axis=0).reshape(n, k, -1)
        # from the class-transition sequences, build each class's
        # class-transition matrix
        class_tran_mats = []
        for cls in range(n):
            # these sequences transition between classes, so the number of
            # "clusters" equals the number of classes n
            class_tran_mats.append(
                makeTranMatrix(class_tran_seqs[cls], n_cluster=n, maxlen=maxlen))
        class_tran_mats = np.stack(class_tran_mats, axis=0).reshape(n, n, n)
        query_class_tran_seqs = []
        for cls in range(n):
            for query in queries[cls]:
                query_class_tran_seqs.append(
                    traverse(query, cluster_tran_mats, maxlen))
        acc_count = 0
        for qi, query in enumerate(query_class_tran_seqs):
            # the class whose transition matrices give the highest total score
            # is the prediction; queries are grouped per class, qk at a time
            predict = np.argmax(scoreSeqOnTranMats(query, class_tran_mats))
            acc_count += (predict == (qi // qk))
        epoch_acc = acc_count / (qk * n)
        if verbose:
            print("Acc:", epoch_acc)
        tm.step(prt=verbose)
        acc_hist.append(epoch_acc)
    if verbose:
        print("\n")
        print("*" * 50)
        print("Avg acc:", sum(acc_hist) / epoch)
        print("95%% belief interval:", calBeliefeInterval(acc_hist))
        print("Total consuming time:", tm.step(prt=False, end=True))
    return sum(acc_hist) / epoch