Python Dataset Examples, dataset.dataset.Dataset Python Examples

Example #1

0

Show file

File: ins.py Project: sushantadh/neuraxis

    def start(self):
        """Import the configured algorithm, load its datasets and start it.

        Reads from ``self.config``:

        * ``algorithm`` -- used both as the module name appended to
          ``Instance.ALGO_PATH`` and as the attribute fetched from that module.
        * ``datasets`` -- iterable of mappings with a ``path`` (joined onto
          ``Instance.DATASETS_PATH``) and optional ``type`` / ``sanitizer``.
        * ``parameters`` -- passed through to the algorithm unchanged.

        The created algorithm object is stored on ``self.algo_instance``.
        """
        algo_name = self.config['algorithm']
        algo_path = Instance.ALGO_PATH + '.' + algo_name

        module = importlib.import_module(algo_path)
        algorithm = getattr(module, algo_name)

        logger.info("Imported module " + algo_name)

        datasets = []
        for item in self.config['datasets']:
            data = dataset.Dataset()
            path = os.path.join(Instance.DATASETS_PATH, item['path'])
            if 'type' in item:
                kind = item['type']
                if kind == "tsv":
                    data.load_from_tsv(path)
                elif kind == "csv":
                    data.load_from_csv(path)
                elif kind == 'pkl':
                    data.load_from_pkl(path)
                else:
                    # Previously an unknown type was skipped silently, which
                    # handed an unloaded dataset to the algorithm.
                    logger.warning(
                        "Unsupported dataset type %r for %s; nothing loaded",
                        kind, path)

            if 'sanitizer' in item:
                # Round-trip through pandas so the sanitizer pipeline can run.
                clean = sanitizer.Sanitizer(data.to_pandas())
                data.load_from_pandas(clean.pipeline(item['sanitizer']))
            datasets.append(data)

        # TODO: check if schema of algo and dataset is valid

        context = self.init_spark_context()
        self.algo_instance = algorithm(context, tuple(datasets),
                                       self.config['parameters'])
        # The leading space was missing, which glued the algorithm name to
        # "engine" in the log output.
        logger.info("A new instance of " + algo_name + " engine started")

Example #2

0

Show file

    def dataset(self, config):
        """Iterate a few batches of the train and test splits as a sanity check.

        Stores ``config`` on ``self.config``, builds the dataset, and for each
        of ``ds.train`` / ``ds.test`` wraps it in an ``ItemGenerator`` and a
        ``DataLoader``. For the first batches of each split it prints image
        statistics and saves the image tensor to ``<out_folder>/img.npy``.

        Args:
            config: configuration mapping; this method reads
                ``config['dataset']`` (optional ``sampler`` entry) and
                ``config['out_folder']``.
        """
        self.config = config
        local_config = {"augs_list": ["GaussNoise"], "aug_p": 1}
        batch_size = 2
        ds = dataset.Dataset(config=config)
        ds()
        for split_idx, df in enumerate([ds.train, ds.test]):
            # Only the first split (train) is asked to include targets.
            local_config["include_target"] = split_idx == 0
            items = generator.ItemGenerator(df, config, local_config)

            if self.config['dataset'].get("sampler", False):
                custom_sampler = utils.load_class(".".join(
                    ["dataset", self.config['dataset']['sampler']]))
            else:
                custom_sampler = RandomSampler
            train_loader = DataLoader(items,
                                      batch_size=batch_size,
                                      sampler=custom_sampler(data_source=df),
                                      num_workers=0,
                                      pin_memory=False)

            for batch_idx, dict_ in enumerate(train_loader):
                # The "Max"/"Min" labels used to be attached to the opposite
                # statistic; each label now matches the value next to it.
                print("Min", dict_['img'].min().cpu().numpy(), "Max",
                      dict_['img'].max().cpu().numpy(), "Mean",
                      dict_['img'].mean().cpu().numpy(), "Target",
                      dict_.get('target', "No target"), "Shape",
                      dict_['img'].shape, dict_['id'])
                # Every batch overwrites the same file, so only the last
                # inspected batch remains on disk.
                save_path = os.path.join(self.config['out_folder'], "img.npy")
                np.save(save_path, dict_['img'].cpu().numpy())
                if batch_idx > 5:
                    break

Example #3

0

Show file

 def create_testloader(self):
     """Build a DataLoader over the candidate-pair test set.

     The dataset lives under ``DEFAULT_DATAROOT/<self.dataset>`` and is
     opened for ``self.split`` with ``self.candidatespairs`` as the pair
     source; batches hold 100 items and are loaded by 4 workers.
     """
     data_dir = os.path.join(self.DEFAULT_DATAROOT, self.dataset)
     return DataLoader(
         dset.Dataset(data_dir,
                      self.split,
                      pairs=self.candidatespairs,
                      klass=BasicTestingExample),
         batch_size=100,
         num_workers=4,
     )

Example #4

0

Show file

    def setup_data(self):
        """Create ``self.trainloader`` and the ``self.testloaders`` list.

        Driven by ``self.opts``: ``dataroot``, ``batch_size``, ``train_size``,
        ``do_validation``, ``split_zeroshot`` and ``val``. When validation is
        disabled ``self.testloaders`` is an empty list.
        """
        opts = self.opts
        root = opts.dataroot

        # Training data: shuffled, loaded by 4 workers.
        train_set = dset.Dataset(root, 'train', pairs='annotated')
        self.trainloader = dat.DataLoader(train_set,
                                          batch_size=opts.batch_size,
                                          shuffle=True,
                                          num_workers=4)

        if opts.train_size:
            # --N: report a fixed length so only the first N items are used.
            # NOTE(review): this is assigned on the *class*, so it appears to
            # change len() for every instance of it, including the test set
            # built below -- confirm that this is intended.
            def _fixed_length(_instance):
                return self.opts.train_size

            train_set.__class__.__len__ = _fixed_length

        if not opts.do_validation:  # --noval
            self.testloaders = []
            return

        test_set = dset.Dataset(root, 'test', pairs='annotated')
        if opts.split_zeroshot:
            # Separate loaders for the seen and the zero-shot parts.
            parts = zeroshot.Splitter(train_set, test_set).split()
        else:
            # One unified test set.
            parts = [test_set]
        # Each part is served as a single batch covering the whole part.
        self.testloaders = [
            dat.DataLoader(part,
                           batch_size=len(part),
                           num_workers=NUM_WORKERS) for part in parts
        ]

        if opts.val:
            # Validate on a fraction of the primary test set only: the sampler
            # draws from its first ``count`` indices, the rest stays unused.
            primary = self.testloaders[0].dataset
            count = int(len(primary) * opts.val)
            self.testloaders[0] = dat.DataLoader(
                primary,
                batch_size=count,
                sampler=dat.SubsetRandomSampler(torch.arange(count)),
                num_workers=NUM_WORKERS)

Example #5

0

Show file

    def test_svm_gaussian(self):
        """Train and evaluate the SVM on gaussian-kernel features.

        Uses the same four points for training and testing and expects every
        one of them to be predicted correctly.
        """
        points = [[2, 0], [4, 0], [0, 2], [0, 4]]
        labels = [0, 1, 0, 1]
        to_indices = d.Dataset.transform_y_from_label_values_to_label_indices
        gaussian = svm.SupportVectorMachine.apply_gaussian_kernel

        # --- training -------------------------------------------------------
        train_x, train_min, train_max = d.Dataset().normalize(
            np.array(points), 5)
        train_x = gaussian(train_x, gamma=1)
        train_y = to_indices(np.array(labels), 2)

        model = svm.SupportVectorMachine(number_input_features=1,
                                         number_labels=2,
                                         delta=5,
                                         random_seed=0)
        model.train(train_x,
                    train_y,
                    learning_rate=1,
                    regularization_value=0.0,
                    iterations=10000)

        # --- evaluation -----------------------------------------------------
        # Same inputs as training. A disabled variant used
        # [[3.05, 0], [2.04, 0], [0, 2.04], [0, 3.05]] with labels [1, 0, 0, 1].
        # The test set is normalized with the min/max obtained from training.
        test_x, _, _ = d.Dataset().normalize(np.array(points), train_min,
                                             train_max)
        test_x = gaussian(test_x, gamma=1)
        test_y = to_indices(np.array(labels), 2)

        outputs = model.predict(test_x)
        _, percentage = model.get_predicted_correctly(outputs, test_y)
        assert percentage == 100

Example #6

0

Show file

    def test_svm_separates_classes_through_middle(self):
        """Check that the learned boundary sits midway between the classes.

        An SVM keeps converging after the classes are merely separated, until
        the margin between them is large. The classifier is trained on four
        points and then tested on four points that lie much closer to the
        middle; predicting those correctly means the boundary ended up near
        the middle (close to maximum margin on both sides).
        """
        to_indices = d.Dataset.transform_y_from_label_values_to_label_indices

        # --- training -------------------------------------------------------
        train_x, train_min, train_max = d.Dataset().normalize(
            np.array([[2, 0], [4, 0], [0, 2], [0, 4]]), 5)
        train_y = to_indices(np.array([0, 1, 0, 1]), 2)

        model = svm.SupportVectorMachine(number_input_features=2,
                                         number_labels=2,
                                         delta=5,
                                         random_seed=0)
        model.train(train_x,
                    train_y,
                    learning_rate=0.1,
                    regularization_value=0.0,
                    iterations=10000)

        # --- evaluation: points just either side of the midpoint -------------
        test_x, _, _ = d.Dataset().normalize(
            np.array([[3.01, 0], [2.99, 0], [0, 2.99], [0, 3.01]]), train_min,
            train_max)
        test_y = to_indices(np.array([1, 0, 0, 1]), 2)

        outputs = model.predict(test_x)
        _, percentage = model.get_predicted_correctly(outputs, test_y)
        assert percentage == 100

Example #7

0

Show file

def main(_):
    """Train an Inception-ResNet model on the ``train`` subset.

    Args:
        _: ignored.

    Raises:
        AssertionError: if the dataset reports no data files.
    """
    # The local must not be called ``dataset``: assigning to that name makes
    # it local to this function, so looking up the ``dataset`` module on the
    # right-hand side would raise UnboundLocalError before anything runs.
    train_data = dataset.Dataset(subset='train')
    assert train_data.data_files()
    ir_train = inception_resnet_v2.InceptionResnet('InceptionResnet')
    ir_train.train(train_data,
                   'train.ckpt',
                   0.001,
                   num_classes=2,
                   batch_size=32,
                   max_steps=1000000000000,
                   train_dir=cfg.TRAIN.train_dir,
                   tower_name=cfg.TRAIN.tower_name,
                   optname='adam',
                   decay=0.9,
                   momentum=0.9,
                   epsilon=0.0000008,
                   beta1=0.9,
                   beta2=0.999,
                   num_epoch_per_decay=30,
                   lr_decay_factor=0.16)

Example #8

0

Show file

    def build_train_and_test(x_train, y_train, with_bias):
        """Fit a LinearClassifier and score it on its own training data.

        Args:
            x_train: training inputs, normalized before use.
            y_train: label values, converted to label indices (2 labels).
            with_bias: forwarded to ``LinearClassifier``.

        Returns:
            The percentage value reported by ``get_predicted_correctly`` for
            predictions made on the training inputs.
        """
        features, _, _ = d.Dataset().normalize(x_train)
        targets = d.Dataset.transform_y_from_label_values_to_label_indices(
            y_train, 2)

        # Layer sizes follow the second dimension of the inputs and targets.
        layer_sizes = [features.shape[1], targets.shape[1]]
        model = LinearClassifier(layer_sizes,
                                 with_bias=with_bias,
                                 regularization_value=0.01,
                                 random_seed=0)
        model.train(features,
                    targets,
                    learning_rate=0.1,
                    iterations=500,
                    debug_enabled=True)

        # The network should be able to predict all training cases correctly.
        outputs = model.predict(features)
        _, pct_correct = model.get_predicted_correctly(outputs, targets)
        return pct_correct

Example #9

0

Show file

File: config.py Project: zllz4/Face-Recognition

    def generate(self):
        """Materialise the objects described by ``self.config``, in place.

        Walks the configuration section by section and replaces declarative
        entries with live objects:

        * global    -- creates the model save dir, dumps the config to
          ``config.txt`` there and requires an empty ``tmp/<result_dir>``.
        * gpu       -- exports ``CUDA_VISIBLE_DEVICES`` and sets
          ``config.global_.device`` to ``"cuda"`` or ``"cpu"``.
        * transform -- builds the train/test pipelines and stores them on
          ``config.train.transform`` / ``config.test.transform``.
        * dataset   -- replaces ``config.train.dataset`` and
          ``config.test.dataset`` with dataset objects and adds ``loader``
          entries next to them.
        * model     -- replaces the ``config.model.backbone`` and
          ``config.model.loss`` names with module instances.
        * optimizer -- replaces ``config.train.optimizer`` with the optimizer.
        * megaface  -- when enabled, builds the probe and distractor datasets.

        Generated list files are recorded in ``self.delete_list`` and removed
        at the end when ``config.global_.auto_clear`` is set.

        Raises:
            AssertionError: if a result/feature directory is not empty or a
                MegaFace directory is missing.
            RuntimeError: for an unknown backbone, loss or optimizer name.
        """
        print("=> generate config")

        self.delete_list = []

        # global
        self._print_title("global")
        # create model save dir
        os.makedirs(self.config.model.save_dir, exist_ok=True)
        # save config
        config_path = os.path.join(self.config.model.save_dir, "config.txt")
        with open(config_path, "w") as f:
            json.dump(self.config, f, indent=2)
        print("config saved in " + config_path)
        # create result dir
        result_dir = os.path.join("tmp", self.config.global_.result_dir)
        os.makedirs(result_dir, exist_ok=True)
        assert not os.listdir(result_dir), f"{result_dir} is not empty!"

        # gpu
        self._print_title("gpu")
        os.environ['CUDA_VISIBLE_DEVICES'] = self.config.global_.gpu
        print("Use", torch.cuda.device_count(), "GPUs!")
        if torch.cuda.is_available():
            self.config.global_.device = "cuda"
            cudnn.benchmark = True
            print("cuda available, set config.global_.device to cuda")
        else:
            self.config.global_.device = "cpu"
            print("cuda not available, set config.global_.device to cpu")

        # transform
        self._print_title("transform")
        # train
        train_transform_list = [
            transforms.Resize(self.config.model.input_size[1:])
        ]
        if self.config.train.transform['random_horizontal_flip']:
            train_transform_list.append(transforms.RandomHorizontalFlip())
        train_transform_list.extend(
            [transforms.Grayscale(),
             transforms.ToTensor()])
        if self.config.train.transform['normalize']:
            train_transform_list.append(
                transforms.Normalize(mean=[0.5], std=[0.5]))
        train_transform = transforms.Compose(train_transform_list)
        print(f"train_transform:\n{train_transform}")
        # test (no random flip; reuses the *train* 'normalize' switch)
        test_transform_list = [
            transforms.Resize(self.config.model.input_size[1:])
        ]
        test_transform_list.extend(
            [transforms.Grayscale(),
             transforms.ToTensor()])
        if self.config.train.transform['normalize']:
            test_transform_list.append(
                transforms.Normalize(mean=[0.5], std=[0.5]))
        test_transform = transforms.Compose(test_transform_list)
        print(f"test_transform:\n{test_transform}")
        # Only now overwrite the transform options with the built pipelines;
        # the option dict was still being read above.
        self.config.train.transform = train_transform
        self.config.test.transform = test_transform

        # dataset
        self._print_title("dataset")
        # train
        image_list, class_name_map = dataset.list_all_image(
            self.config.train.dataset.path,
            f"tmp/{self.config.global_.result_dir}/webface.txt")
        train_dataset = dataset.Dataset(image_list,
                                        class_name_path=class_name_map,
                                        transform=self.config.train.transform)
        train_dataset_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.config.train.batch_size,
            shuffle=True)
        print(f"train_dataset: {train_dataset}")
        print(f"image_list file path: {image_list}")
        print(f"class_name_map file path: {class_name_map}")
        self.config.train.dataset = train_dataset
        self.config.train.loader = train_dataset_loader
        self.delete_list.extend([image_list, class_name_map])
        # test
        test_dataset_dict = edict()
        test_loader_dict = edict()
        for test_pair in self.config.test.dataset:
            test_dataset = dataset.Dataset(
                test_pair['path'], transform=self.config.test.transform)
            test_dataset_loader = torch.utils.data.DataLoader(
                test_dataset,
                batch_size=self.config.test.batch_size,
                shuffle=False)
            test_dataset_dict[test_pair['name']] = test_dataset
            test_loader_dict[test_pair['name']] = test_dataset_loader
            print(f"{test_pair['name']}: {test_dataset}")
        self.config.test.dataset = test_dataset_dict
        self.config.test.loader = test_loader_dict

        # model
        self._print_title("model")
        # backbone: option name -> class name in ``resnet_ir``. The class is
        # looked up lazily so only the selected one has to exist.
        backbone_classes = {
            "resnet_18_ir": "ResNet18_IR",
            "resnet_34_ir": "ResNet34_IR",
            "resnet_50_ir": "ResNet50_IR",
        }
        if self.config.model.backbone not in backbone_classes:
            raise RuntimeError(
                f"Invalid Backbone Option {self.config.model.backbone}")
        backbone_cls = getattr(resnet_ir,
                               backbone_classes[self.config.model.backbone])
        self.config.model.backbone = backbone_cls(
            self.config.model.input_size, self.config.model.feature_dim)
        self.config.model.backbone.to(self.config.global_.device)
        if self.config.global_.device == "cuda":
            self.config.model.backbone = torch.nn.DataParallel(
                self.config.model.backbone)
        structure_path = os.path.join(self.config.model.save_dir,
                                      "model_structure.txt")
        with open(structure_path, "w") as f:
            f.write(str(self.config.model.backbone))
        print("model structure saved in " + structure_path)
        # loss function: option name -> class name in ``loss``. The option
        # spellings are kept exactly as they were so existing configs still
        # match.
        loss_classes = {
            "arcloss": "ArcLoss",
            "cosloss": "CosLoss",
            "mixloss": "MixLoss",
            "sphereloss": "SphereLoss",
            "normsoftmaxsloss": "NormSoftmaxLoss",
            "softmaxsloss": "SoftmaxLoss",
        }
        if self.config.model.loss not in loss_classes:
            raise RuntimeError(f"Invalid Loss Option {self.config.model.loss}")
        loss_cls = getattr(loss, loss_classes[self.config.model.loss])
        # All loss heads share one constructor signature.
        self.config.model.loss = loss_cls(
            self.config.model.feature_dim,
            len(self.config.train.dataset.classes),
            **self.config.model.loss_param)
        self.config.model.loss.to(self.config.global_.device)
        if self.config.global_.device == "cuda":
            self.config.model.loss = torch.nn.DataParallel(
                self.config.model.loss)
        print(f"Loss: {self.config.model.loss}")
        backbone_params = sum(
            [p.numel() for p in self.config.model.backbone.parameters()])
        loss_params = sum(
            [p.numel() for p in self.config.model.loss.parameters()])
        print(
            f"model param {backbone_params/1024/1024:.8f}M margin param {loss_params/1024/1024:.8f}M"
        )

        # optimizer
        self._print_title("optimizer")
        if self.config.train.optimizer == "SGD":
            optimizer = torch.optim.SGD(
                [{
                    'params': self.config.model.backbone.parameters()
                }, {
                    'params': self.config.model.loss.parameters()
                }],
                lr=1e-5,
                weight_decay=self.config.train.weight_decay,
                momentum=self.config.train.momentum)
        else:
            raise RuntimeError(
                f"Invalid Optimizer Option {self.config.train.optimizer}")
        print(f"optimizer: {optimizer}")
        self.config.train.optimizer = optimizer

        # megaface
        if self.config.megaface.enable:
            self._print_title("megaface")

            # init
            assert os.path.isdir(
                self.config.megaface.devkit_dir
            ), f"{self.config.megaface.devkit_dir} not exist or is not a directory!"
            assert os.path.isdir(
                self.config.megaface.megaface_dataset_dir
            ), f"{self.config.megaface.megaface_dataset_dir} not exist or is not a directory!"
            # The message used to reference an undefined bare name, so a
            # failing check raised NameError instead of AssertionError.
            assert os.path.isdir(
                self.config.megaface.facescrub_dataset_dir
            ), f"{self.config.megaface.facescrub_dataset_dir} not exist or is not a directory!"
            os.makedirs(self.config.megaface.feature_save_dir, exist_ok=True)
            os.makedirs(self.config.megaface.result_save_dir, exist_ok=True)
            assert not os.listdir(
                self.config.megaface.feature_save_dir
            ), f"{self.config.megaface.feature_save_dir} is not empty!"
            assert not os.listdir(
                self.config.megaface.result_save_dir
            ), f"{self.config.megaface.result_save_dir} is not empty!"

            # probe
            if self.config.megaface.no_noise:
                probe_list_file = os.path.join(
                    self.config.megaface.devkit_dir,
                    "templatelists/facescrub_features_list_no_noise.json")
            else:
                probe_list_file = os.path.join(
                    self.config.megaface.devkit_dir,
                    "templatelists/facescrub_features_list.json")

            # Expand the devkit's relative paths into a flat list file that
            # the dataset class can read. Both files are closed even if
            # reading or writing fails.
            probe_path = f"tmp/{self.config.global_.result_dir}/probe.txt"
            with open(probe_path, 'w') as probe_file, \
                    open(probe_list_file) as f:
                probe_list = json.load(f)['path']
                for line in probe_list:
                    probe_file.write(
                        os.path.join(
                            self.config.megaface.facescrub_dataset_dir, line) +
                        '\n')

            self.delete_list.append(probe_path)

            probe_dataset = dataset.Dataset(
                probe_path, transform=self.config.test.transform)
            print(len(probe_dataset))
            probe_dataset_loader = torch.utils.data.DataLoader(
                probe_dataset,
                batch_size=self.config.test.batch_size,
                shuffle=False)
            print(len(probe_dataset_loader))
            probe = edict()
            probe.file = probe_list_file
            probe.dataset = probe_dataset
            probe.loader = probe_dataset_loader
            self.config.megaface.probe = probe
            print(f"probe dataset: {self.config.megaface.probe.dataset}")

            # distractor: one dataset per configured scale, keyed by str(scale)
            self.config.megaface.distractor = edict()
            for s in self.config.megaface.scale:
                if self.config.megaface.no_noise:
                    distractor_list_file = os.path.join(
                        self.config.megaface.devkit_dir,
                        f"templatelists/megaface_features_list_no_noise.json_{s}_1"
                    )
                else:
                    distractor_list_file = os.path.join(
                        self.config.megaface.devkit_dir,
                        f"templatelists/megaface_features_list.json_{s}_1")

                distractor_path = (
                    f"tmp/{self.config.global_.result_dir}/distractor_{s}.txt")
                with open(distractor_path, 'w') as distractor_file, \
                        open(distractor_list_file) as f:
                    distractor_list = json.load(f)['path']
                    for line in distractor_list:
                        distractor_file.write(
                            os.path.join(
                                self.config.megaface.megaface_dataset_dir,
                                line) + '\n')

                self.delete_list.append(distractor_path)

                distractor_dataset = dataset.Dataset(
                    distractor_path, transform=self.config.test.transform)
                distractor_dataset_loader = torch.utils.data.DataLoader(
                    distractor_dataset,
                    batch_size=self.config.test.batch_size,
                    shuffle=False)
                distractor = edict()
                distractor.file = distractor_list_file
                distractor.dataset = distractor_dataset
                distractor.loader = distractor_dataset_loader
                self.config.megaface.distractor[str(s)] = distractor
                print(f"distractor dataset scale {s}: {distractor_dataset}")

        # delete
        self._print_title("delete list")
        for line in self.delete_list:
            print(line)
            if self.config.global_.auto_clear:
                os.remove(line)
        print("Remark: The generated files above will be auto cleared if set")

        print()

Example #10

0

Show file

    def recall_from_matlab(self, model):
        """Score both test sets with ``model`` and evaluate recall via MATLAB.

        The model (as handed over by the runner) is installed with
        ``update_model`` and put in eval mode. For each of the ``annotated``
        and ``Lu-candidates`` test sets the scores are accumulated in
        ``self.prediction`` and written to ``<SCORES_PATH>/<setting>.mat``
        under the key ``scores``. ``run_recall.sh`` is then run and the tail
        of its output is parsed.

        Args:
            model: model to evaluate.

        Returns:
            dict mapping ``seen_/unseen_`` + ``predicate/phrase/relationship``
            to the values parsed from the script output (kept as strings).

        Raises:
            IndexError: if fewer than six recall values could be parsed.
        """
        self.update_model(model)
        self.model.eval()
        # Settings should always be in this order for testing.
        settings = ['annotated', 'Lu-candidates']
        for setting in settings:
            testset = dset.Dataset(os.path.join(self.DEFAULT_DATAROOT,
                                                self.dataset),
                                   'test',
                                   pairs=setting,
                                   klass=BasicTestingExample)
            loader = DataLoader(testset, batch_size=100, num_workers=4)
            # Run prediction batch by batch, appending to what is already
            # stored for this setting.
            for testbatch in loader:
                with torch.no_grad():
                    scores = self.model(
                        torch.autograd.Variable(testbatch['X'].cuda()))
                cur_prediction = self.prediction.get(setting, None)
                if isinstance(cur_prediction, torch.Tensor):
                    self.prediction[setting] = torch.cat(
                        (cur_prediction, scores.cpu()), 0)
                else:
                    self.prediction[setting] = scores.cpu()

            scores_np = self.prediction[setting].data.numpy()
            scipy.io.savemat(os.path.join(self.SCORES_PATH, f'{setting}.mat'),
                             {'scores': scores_np})
            # Start the next setting from an empty accumulator.
            self.prediction = {}

        # use subprocess to run
        print('starting matlab...')

        # NOTE(review): the command is a shell string built from instance
        # paths; keep those paths trusted and free of shell metacharacters.
        # The context manager closes the pipes and waits for the child, so no
        # zombie process or open file descriptors are left behind.
        with Popen(
                f"{self.UNREL_PATH}/run_recall.sh baseline full {self.SCORES_PATH}",
                shell=True,
                stdin=PIPE,
                stdout=PIPE,
                stderr=STDOUT,
                close_fds=True) as rc:
            rc_out = str(rc.stdout.read(), 'utf-8')

        # Only the last lines of the output are inspected. The final token of
        # each is taken as a value unless it starts with 'z'.
        results = []
        for line in rc_out.split('\n')[-8:-1]:
            fields = line.split()
            if not fields:
                # Blank lines carry no value (they used to raise IndexError).
                continue
            value = fields[-1]
            if value[0] != 'z':
                results.append(value)

        names = ('seen_predicate', 'seen_phrase', 'seen_relationship',
                 'unseen_predicate', 'unseen_phrase', 'unseen_relationship')
        if len(results) < len(names):
            raise IndexError(
                f"expected {len(names)} recall values in run_recall.sh "
                f"output, found {len(results)}")

        return dict(zip(names, results))