Exemple #1
0
def _splitDatas(src, dest, ratio, mode='x', is_dir=False):
    '''
    Randomly sample a portion of the generated items and move/copy them to
    the destination folder (used to build train/validation splits).

    :param src: source folder (expected to end with a path separator,
                since paths are built by plain concatenation)
    :param dest: destination folder
    :param ratio: split ratio when 0 < ratio < 1, an absolute item count
                  when >= 1, or any negative value meaning "select all"
    :param mode: 'c' to copy, 'x' to move (cut)
    :param is_dir: items are directories (copy mode then uses copytree)
    '''
    assert mode in ['c', 'x'], '选择的模式错误,只能复制c或者剪切x'

    all_items = os.listdir(src)

    if ratio < 0:  # negative flag: select every item
        size = len(all_items)
    elif 1 > ratio > 0:
        size = int(len(all_items) * ratio)
    else:
        size = ratio

    assert len(all_items) >= size, '分割时,总数量没有要求的数量大!'

    random.seed(magicSeed())
    # bug fix: membership test below was O(n) per item against a list,
    # making the loop O(n*m); a set gives O(1) lookups
    samples_names = set(random.sample(all_items, size))
    num = 0
    for item in tqdm(all_items):
        if item in samples_names:
            num += 1
            path = src + item
            if mode == 'x':
                shutil.move(path, dest)
            else:
                if is_dir:
                    shutil.copytree(src=path, dst=dest + item)
                else:
                    shutil.copy(src=path, dst=dest)
Exemple #2
0
def parseAndSampleDataset(scale_report_path,
                          base_path,
                          dst_path,
                          num_per_class,
                          checked=True):
    '''
    Read a per-family scale report, sample num_per_class items from every
    family that has enough members, and copy them into per-family folders
    under dst_path. Optionally verify the resulting folder sizes afterwards.
    '''
    scale_report = loadJson(scale_report_path)

    for family_name in tqdm(scale_report):
        members = scale_report[family_name]
        # skip families that cannot supply enough samples
        if len(members) < num_per_class:
            continue

        random.seed(magicSeed())
        candidates = random.sample(members, num_per_class)

        family_dir = dst_path + family_name + '/'
        if os.path.exists(family_dir):
            raise RuntimeError("%s 类的文件夹在目标路径中已存在!" % (family_name))
        os.mkdir(family_dir)

        for item in candidates:
            folder_name, item_name = item.split("/")
            full_item_name = item_name + '.' + folder_name
            shutil.copy(base_path + item, family_dir + full_item_name)

    if checked:
        # sanity pass: every produced folder must contain exactly
        # num_per_class items
        reporter = Reporter()
        for folder in os.listdir(dst_path):
            count = len(os.listdir(dst_path + folder + '/'))
            if count != num_per_class:
                reporter.logError(entity=folder,
                                  msg="数量不足预期: %d/%d" % (count, num_per_class))
            else:
                reporter.logSuccess()
        reporter.report()
Exemple #3
0
def splitMetaBatch(meta_data,
                   meta_label,
                   batch_num,
                   max_sample_num,
                   sample_num,
                   meta_len=None):
    '''
    Generator yielding `batch_num` meta-mini-batches. The data is assumed to
    be grouped by class in runs of `max_sample_num` consecutive items; for
    every run, `sample_num` indexes are drawn at random, and the gathered
    data/labels (plus lengths when provided) are yielded.

    Bug fix: the original unconditionally evaluated len(meta_len) in the
    assert and called t.LongTensor(meta_len), both of which raise TypeError
    for the documented default meta_len=None; the None case is now handled.

    :param meta_data: indexable data container (e.g. tensor)
    :param meta_label: indexable label container, same length as meta_data
    :param batch_num: number of meta-mini-batches to yield
    :param max_sample_num: items per class (run length) in meta_data
    :param sample_num: items sampled per class per batch
    :param meta_len: optional per-item sequence lengths
    '''
    if meta_len is not None:
        assert len(meta_data) == len(meta_label) == len(meta_len)
        meta_len = t.LongTensor(meta_len)
    else:
        assert len(meta_data) == len(meta_label)

    index_pool = [i for i in range(len(meta_data))]

    for i in range(batch_num):

        # for each meta-mini-batch, sample certain items per class
        index = []

        for start_i in range(0, len(meta_data), max_sample_num):
            rd.seed(magicSeed())
            class_batch_index = rd.sample(
                index_pool[start_i:start_i + max_sample_num], sample_num)
            index += class_batch_index

        if meta_len is not None:
            yield meta_data[index], meta_label[index], meta_len[index].tolist()
        else:
            yield meta_data[index], meta_label[index]
Exemple #4
0
    def getTaskSampler(self, label_space, seed=None):
        '''
        Build the (support, query) EpisodeSamlper pair for one task over the
        given label space. The sampling seed used is cached on the instance.
        '''
        sampling_seed = seed if seed is not None else magicSeed()

        self.SamplingSeedCache = sampling_seed

        k, qk, n, N = self.readParams()

        # one sampling seed per class in the label space
        seed_for_each_class = randomList(num=len(label_space),
                                         seed=sampling_seed,
                                         allow_duplicate=True)

        samplers = []
        for sampler_mode in ('support', 'query'):
            samplers.append(
                EpisodeSamlper(k, qk, N, seed_for_each_class,
                               mode=sampler_mode,
                               label_space=label_space,
                               shuffle=False))  # query labels stay clustered

        return samplers[0], samplers[1]
Exemple #5
0
    def getTaskSampler(self, label_space, isTrain, seed=None):
        '''
        Build the ReptileSamlper pair for one task. During training only a
        support sampler is produced and the query sampler is None.
        '''
        task_seed = seed if seed is not None else magicSeed()
        k, qk, n, N = self.readParams()

        # one seed per class so support/query selections stay aligned
        seed_for_each_class = randomList(num=len(label_space),
                                         seed=task_seed,
                                         allow_duplicate=True)

        def _make(sampler_mode):
            # small factory to avoid repeating the argument list
            return ReptileSamlper(N, k, seed_for_each_class,
                                  mode=sampler_mode,
                                  label_space=label_space)

        support_sampler = _make('support')
        query_sampler = None if isTrain else _make('query')

        return support_sampler, query_sampler
Exemple #6
0
    def _makeBatchSampler(self, task_seq_seed=None, sampling_seq_seed=None):
        '''
        Pre-compute per-epoch support/query index batches and wrap them in
        BatchSampler objects. One label space is drawn per epoch from the
        task-seed sequence; item offsets inside each class come from the
        sampling-seed sequence.
        '''
        k, qk, n, N = self._readParams()

        if task_seq_seed is None:
            task_seq_seed = magicSeed()
        if sampling_seq_seed is None:
            sampling_seq_seed = magicSeed()

        # per-epoch seed sequences for task (label-space) and item sampling
        task_seq_seeds = randList(self.TotalEpoch, seed=task_seq_seed)
        sampling_seq_seeds = randList(self.TotalEpoch, seed=sampling_seq_seed)

        # one label space per epoch, driven by the task seed sequence
        label_space_seq = [self._getLabelSpace(s) for s in task_seq_seeds]

        sampled_support_indexes_seq = []
        sampled_query_indexes_seq = []
        for label_space, sampling_seed in zip(label_space_seq,
                                              sampling_seq_seeds):
            # for one label space, derive an item-sampling seed per class
            class_wise_seeds = randList(num=len(label_space),
                                        seed=sampling_seed,
                                        allow_duplicate=True)
            support_episode_items = []
            query_episode_items = []
            for class_, class_seed in zip(label_space, class_wise_seeds):
                perm = randPermutation(N, class_seed)

                # first k permuted offsets form the support set,
                # offsets k..k+qk form the query set
                support_episode_items.extend(class_ * N + i for i in perm[:k])
                query_episode_items.extend(
                    class_ * N + i for i in perm[k:k + qk])

            sampled_support_indexes_seq.append(support_episode_items)
            sampled_query_indexes_seq.append(query_episode_items)

        return (BatchSampler(sampled_support_indexes_seq),
                BatchSampler(sampled_query_indexes_seq))
Exemple #7
0
 def __iter__(self):
     '''Flatten the per-class instance offsets into absolute dataset
     indexes (class * N + offset), optionally shuffling the order.'''
     batch = [self.N * c + i
              for c, instances in self.instances.items()
              for i in instances]
     if self.shuffle:
         rd.seed(magicSeed())
         rd.shuffle(batch)
     return iter(batch)
Exemple #8
0
    def getLabelSpace(self, seed=None):
        '''
        Sample self.Params['n'] distinct class ids from the dataset's
        classes. The seed that was used is cached on the instance.
        '''
        if seed is None:
            seed = magicSeed()
        self.TaskSeedCache = seed

        rd.seed(seed)
        all_classes = list(range(self.Dataset.ClassNum))
        return rd.sample(all_classes, self.Params['n'])
Exemple #9
0
def getRandomColor(num, more=False):
    '''
    Return `num` randomly chosen color names. With more=True the full
    cnames palette is the candidate pool, otherwise the smaller colors list.
    '''
    rd.seed(magicSeed())
    pool = list(cnames.keys()) if more else colors
    return rd.sample(pool, num)
Exemple #10
0
    def _getLabelSpace(self, seed=None):
        '''
        Sample self.Params['n'] distinct class ids out of all dataset
        classes. The seed that was used is cached on the instance.
        '''
        if seed is None:
            seed = magicSeed()
        self.TaskSeedCache = seed

        rd.seed(seed)
        candidate_classes = list(range(self.Dataset.TotalClassNum))
        return rd.sample(candidate_classes, self.Params['n'])
Exemple #11
0
    def __init__(self,
                 k,
                 qk,
                 N,
                 class_seeds,
                 mode,
                 label_space,
                 shuffle=False):
        '''
        Sampler that builds the sample/query (train) or support/test (eval)
        index sets of an episode. Support and query items come from the same
        classes, and the query items are drawn from the complement of the
        support items so the two sets never overlap within a class.

        :param k: support samples per class
        :param qk: query samples per class
        :param N: maximum number of samples per class
        :param class_seeds: one RNG seed per class, shared between the
                            support and query samplers so the support
                            selection can be reproduced exactly
        :param mode: 'support' or 'query'
        :param label_space: the classes selected for this episode
        :param shuffle: whether to randomly shuffle the emitted order
        '''
        self.LabelSpace = label_space
        self.N = N
        self.shuffle = shuffle
        self.instances = dict.fromkeys(label_space)

        if mode == 'support':
            # draw each class's support offsets from its dedicated seed
            for cla, seed in zip(label_space, class_seeds):
                rd.seed(seed)
                self.instances[cla] = set(rd.sample([i for i in range(N)], k))

        elif mode == 'query':
            for cla, seed in zip(label_space, class_seeds):
                rd.seed(seed)
                # replay the support sampling with the same seed so the
                # query set can exclude exactly those items
                train_instances = set(rd.sample([i for i in range(N)], k))
                test_instances = set(range(N)).difference(train_instances)

                rd.seed(magicSeed())
                # bug fix: random.sample() rejects sets on Python 3.11+,
                # so sample from a sorted list of the complement instead
                self.instances[cla] = rd.sample(sorted(test_instances), qk)

        else:
            raise ValueError('不支持的类型: %s' % mode)
Exemple #12
0
def getTaskSampler(label_space, k, qk, N):
    '''
    Build the (support, query) EpisodeSamlper pair for one task. Both
    samplers share the same per-class seeds so their support selections
    agree; only the query sampler shuffles its output order.
    '''
    task_seed = magicSeed()

    # one seed per class of the label space
    seed_for_each_class = randomList(num=len(label_space),
                                     seed=task_seed,
                                     allow_duplicate=True)

    support_sampler = EpisodeSamlper(k, qk, N, seed_for_each_class,
                                     mode='support',
                                     label_space=label_space,
                                     shuffle=False)
    query_sampler = EpisodeSamlper(k, qk, N, seed_for_each_class,
                                   mode='query',
                                   label_space=label_space,
                                   shuffle=True)

    return support_sampler, query_sampler
Exemple #13
0
    def _getTaskSampler(self, label_space, seed=None):
        '''
        Build the (support, query) EpisodeSampler pair for the given label
        space. The sampling seed that was used is cached on the instance.
        '''
        sampling_seed = seed if seed is not None else magicSeed()

        self.SamplingSeedCache = sampling_seed

        k, qk, n, N = self._readParams()

        # derive one item-sampling seed per class of the label space
        seed_for_each_class = randList(num=len(label_space),
                                       seed=sampling_seed,
                                       allow_duplicate=True)

        def _build(sampler_mode):
            return EpisodeSampler(k, qk, N, label_space,
                                  seed_for_each_class, mode=sampler_mode)

        # query sampler keeps labels clustered together
        return _build('support'), _build('query')
Exemple #14
0
def collectScaleClasses(data_path,
                        dst_path,
                        num_per_class=50,
                        least_api_len=10,
                        exception=['SINGLETON']):
    '''
    For every class folder (except those in `exception`), keep only items
    whose API sequence reaches least_api_len; when enough items remain,
    randomly sample num_per_class of them into a same-named folder under
    dst_path.
    '''
    for folder in tqdm(os.listdir(data_path)):
        if folder in exception:
            continue

        src_dir = data_path + folder + '/'
        # keep only reports whose API sequence is long enough
        item_list = [item for item in os.listdir(src_dir)
                     if len(loadJson(src_dir + item)['apis']) >= least_api_len]

        if len(item_list) < num_per_class:
            continue

        rd.seed(magicSeed())
        candidate_list = rd.sample(item_list, num_per_class)
        os.mkdir(dst_path + folder + '/')

        for item in candidate_list:
            shutil.copy(src_dir + item, dst_path + folder + '/' + item)
from utils.magic import magicSeed

# ***********************************************************
# Script configuration: which dataset/model checkpoint and episode to
# visualize. Values below are consumed by the plotting code further down.
data_dataset_name = "HKS"     # dataset the plotted samples are drawn from
model_dataset_name = "HKS"    # dataset the model checkpoint was trained on
dataset_subtype = 'test'      # dataset split to sample from

model_name = 'ProtoNet'       # model whose embeddings are plotted

version = 308                 # checkpoint version id for PathManager
N = 20                        # presumably samples per class — TODO confirm
plot_option = 'entire'#'entire'
k, n, qk = 10, 5, 10          # episode shape: support size, way, query size
figsize = (6,6)               # matplotlib figure size

# seeds: task seed is fresh each run; sampling seed pinned for reproducibility
task_seed = magicSeed()#4160148##4488524#4524396
sampling_seed = 5791326#magicSeed()#4160164##4488540#4524414   # SIMPLE seed: 5331044

axis_off = True               # hide plot axes
plot_support_independently = False
max_plot_class = 20           # upper bound on classes drawn in one figure
selected_idxes = [6,8,10,16,18]   # NOTE(review): looks like the subset of classes to plot — confirm against plotting code
reducer = 'tsne'              # dimensionality-reduction method for embedding
# ***************************************************************************


data_path_man = PathManager(dataset=data_dataset_name,
                           d_type=dataset_subtype)
model_path_man = PathManager(dataset=model_dataset_name,
                             version=version,
                             model_name=model_name)
Exemple #16
0
def scoreEpisodeAlignment(matrix,
                          acc_queue,
                          process_id,
                          epoch=1000,
                          log_path=None,
                          verbose=False):
    '''
    Worker that scores `epoch` randomly sampled few-shot episodes by
    pairwise sequence alignment and pushes each episode's accuracy into
    acc_queue.

    Per episode: sample an n-class label space, then per class sample k
    support and qk query indexes (disjoint within a class). Every query is
    aligned against every support; the class with the highest summed
    alignment score is the prediction.

    Relies on module-level globals N (items per class), n, k, qk and on the
    helpers magicSeed/nRandom/sample/align/StepTimer.

    :param matrix: flat sequence container, N consecutive items per class
    :param acc_queue: multiprocessing queue receiving one accuracy per epoch
    :param process_id: id used to tag progress prints
    :param epoch: number of episodes to evaluate
    :param log_path: alignment log file path (stdout when None)
    :param verbose: print per-alignment progress
    '''
    class_pool = list(range(len(matrix) // N))
    item_pool = set(range(N))
    out = sys.stdout if log_path is None else open(log_path, "w")
    tm = StepTimer(epoch)

    try:
        tm.begin()
        for i in range(epoch):
            print("Process", process_id, ":", "Epoch", i)
            supports = []
            queries = []

            task_seed = magicSeed()
            sampling_seed = magicSeed()
            class_seeds = nRandom(n, sampling_seed)

            label_space = sample(class_pool, n, task_seed)

            for cls, cseed in zip(label_space, class_seeds):
                support_idxes = sample(item_pool, k, cseed, return_set=True)
                # queries come from the complement of the supports
                query_idxes = sample(item_pool.difference(support_idxes),
                                     qk,
                                     cseed,
                                     return_set=True)

                support_idxes = np.array(list(support_idxes)) + N * cls
                query_idxes = np.array(list(query_idxes)) + N * cls

                supports += [matrix[i] for i in support_idxes]
                queries += [matrix[i] for i in query_idxes]

            correct_count = 0

            for qi, query in enumerate(queries):
                scores = []
                for si, support in enumerate(supports):
                    if verbose:
                        print("Process", process_id, ":", qi * n * k + si, "/",
                              n * qk * k * n)
                    scores.append(align(support, query, out))

                # sum the k per-support scores of each class, predict argmax
                scores = np.array(scores).reshape(n, k).sum(-1)
                predict = np.argmax(scores)
                # queries are grouped by class: query qi belongs to qi // qk
                correct_count += (predict == (qi // qk))

            epoch_acc = correct_count / (n * qk)
            acc_queue.put(epoch_acc)
            tm.step()
    finally:
        # bug fix: the log file handle opened above was never closed
        if out is not sys.stdout:
            out.close()
Exemple #17
0
def sampleLabelSpace(dataset, n):
    '''Randomly pick n distinct class ids from the dataset's classes.'''
    rd.seed(magicSeed())
    return rd.sample(list(range(dataset.ClassNum)), n)
Exemple #18
0
def scoreMarkovEpisode(clustered_data_path,
                       epoch=300,
                       n_cluster=10,
                       maxlen=1000,
                       verbose=True):
    '''
    Evaluate `epoch` randomly sampled few-shot episodes with a Markov
    transition-matrix classifier and return the average accuracy.

    Per episode: sample an n-class label space and disjoint support/query
    indexes per class; build per-class cluster-transition matrices from the
    supports, convert sequences to class-transition sequences via traverse(),
    build per-class class-transition matrices, then score each query against
    them and predict the argmax class.

    Relies on module-level globals N (items per class), n, k, qk and on the
    helpers magicSeed/nRandom/sample/makeTranMatrix/traverse/
    scoreSeqOnTranMats/calBeliefeInterval/StepTimer.

    :param clustered_data_path: .npy file of clustered sequences,
                                N consecutive rows per class
    :param epoch: number of episodes to evaluate
    :param n_cluster: number of cluster states for the transition matrices
    :param maxlen: maximum sequence length considered
    :param verbose: print per-epoch progress and the final summary
    :return: mean accuracy over all epochs
    '''
    acc_hist = []
    matrix = np.load(clustered_data_path)
    class_pool = list(range(len(matrix) // N))
    item_pool = set(range(N))
    tm = StepTimer(epoch)

    tm.begin()
    for i in range(epoch):
        if verbose:
            print("Epoch", i)
        supports = []
        queries = []

        task_seed = magicSeed()
        sampling_seed = magicSeed()
        class_seeds = nRandom(n, sampling_seed)

        label_space = sample(class_pool, n, task_seed)

        for cls, cseed in zip(label_space, class_seeds):
            support_idxes = sample(item_pool, k, cseed, return_set=True)
            # queries come from the complement of the supports
            query_idxes = sample(item_pool.difference(support_idxes),
                                 qk,
                                 cseed,
                                 return_set=True)

            support_idxes = np.array(list(support_idxes)) + N * cls
            query_idxes = np.array(list(query_idxes)) + N * cls

            supports += [matrix[i] for i in support_idxes]
            queries += [matrix[i] for i in query_idxes]

        supports = np.array(supports).reshape(n, k, -1)
        queries = np.array(queries).reshape(n, qk, -1)

        # build one cluster-transition matrix per class from its supports
        cluster_tran_mats = []
        for cls in range(n):
            cluster_tran_mats.append(
                makeTranMatrix(supports[cls],
                               n_cluster=n_cluster,
                               maxlen=maxlen))
        cluster_tran_mats = np.stack(cluster_tran_mats, axis=0)

        # convert each support sequence into a class-transition sequence
        # using the per-class cluster-transition matrices
        class_tran_seqs = []
        for cls in range(n):
            for support in supports[cls]:
                class_tran_seqs.append(
                    traverse(support, cluster_tran_mats, maxlen))
        class_tran_seqs = np.stack(class_tran_seqs, axis=0).reshape(n, k, -1)

        # per-class class-transition matrices; the state count equals the
        # class count n because these are class-level sequences
        class_tran_mats = []
        for cls in range(n):
            class_tran_mats.append(
                makeTranMatrix(class_tran_seqs[cls],
                               n_cluster=n,
                               maxlen=maxlen))
        class_tran_mats = np.stack(class_tran_mats, axis=0).reshape(n, n, n)

        query_class_tran_seqs = []
        for cls in range(n):
            for query in queries[cls]:
                query_class_tran_seqs.append(
                    traverse(query, cluster_tran_mats, maxlen))

        acc_count = 0
        for qi, query in enumerate(query_class_tran_seqs):
            # the class whose transition matrices score highest is predicted
            predict = np.argmax(scoreSeqOnTranMats(query, class_tran_mats))
            acc_count += (predict == (qi // qk))

        epoch_acc = acc_count / (qk * n)
        if verbose:
            print("Acc:", epoch_acc)
        tm.step(prt=verbose)
        acc_hist.append(epoch_acc)

    if verbose:
        print("\n")
        print("*" * 50)
        print("Avg acc:", sum(acc_hist) / epoch)
        # bug fix: "%%" is only an escape under %-formatting; in a plain
        # print() the original rendered literally as "95%%"
        print("95% belief interval:", calBeliefeInterval(acc_hist))
        print("Total consuming time:", tm.step(prt=False, end=True))

    return sum(acc_hist) / epoch