def start(self):
    """Import the configured algorithm, load its datasets and start it.

    Reads ``self.config['algorithm']``, ``self.config['datasets']`` and
    ``self.config['parameters']``.  The created engine is stored on
    ``self.algo_instance``.
    """
    algo_name = self.config['algorithm']
    # The algorithm class is looked up under the same name as its module.
    module = importlib.import_module(Instance.ALGO_PATH + '.' + algo_name)
    algorithm = getattr(module, algo_name)
    logger.info("Imported module " + algo_name)

    datasets = []
    for item in self.config['datasets']:
        data = dataset.Dataset()
        path = os.path.join(Instance.DATASETS_PATH, item['path'])

        # An entry without a (known) 'type' leaves the dataset unloaded.
        file_type = item.get('type')
        if file_type == "tsv":
            data.load_from_tsv(path)
        elif file_type == "csv":
            data.load_from_csv(path)
        elif file_type == 'pkl':
            data.load_from_pkl(path)

        # Optional cleaning step: round-trip through pandas and the
        # sanitizer pipeline named in the config entry.
        if 'sanitizer' in item:
            clean = sanitizer.Sanitizer(data.to_pandas())
            data.load_from_pandas(clean.pipeline(item['sanitizer']))

        datasets.append(data)

    # TODO: check if schema of algo and dataset is valid
    context = self.init_spark_context()
    self.algo_instance = algorithm(context, tuple(datasets),
                                   self.config['parameters'])
    # Fixed: the original message lacked the space before "engine".
    logger.info("A new instance of " + algo_name + " engine started")
def dataset(self, config):
    """Iterate a few batches of the train and test splits as a smoke test.

    Stores ``config`` on ``self.config``, builds an ``ItemGenerator`` and a
    ``DataLoader`` for each split, prints basic statistics of every batch
    and saves the batch image array to ``<out_folder>/img.npy`` (the file is
    overwritten on every batch).  Iteration of a split stops once the batch
    index exceeds 5.

    Args:
        config: configuration mapping; ``config['dataset']`` may name a
            custom ``sampler`` and ``config['out_folder']`` is where the
            image dump is written.
    """
    self.config = config
    local_config = {"augs_list": ["GaussNoise"], "aug_p": 1}
    batch_size = 2

    splits = dataset.Dataset(config=config)
    splits()

    for split_idx, df in enumerate([splits.train, splits.test]):
        # Only the first (train) split carries targets.
        local_config["include_target"] = split_idx == 0
        items = generator.ItemGenerator(df, config, local_config)

        sampler_name = self.config['dataset'].get("sampler", False)
        if sampler_name:
            custom_sampler = utils.load_class(
                ".".join(["dataset", sampler_name]))
        else:
            custom_sampler = RandomSampler

        loader = DataLoader(items,
                            batch_size=batch_size,
                            sampler=custom_sampler(data_source=df),
                            num_workers=0,
                            pin_memory=False)

        for batch_idx, batch in enumerate(loader):
            img = batch['img']
            # Fixed: the original printed min() under "Max" and max() under
            # "Min".
            print("Max", img.max().cpu().numpy(),
                  "Min", img.min().cpu().numpy(),
                  "Mean", img.mean().cpu().numpy(),
                  "Target", batch.get('target', "No target"),
                  "Shape", img.shape, batch['id'])
            save_path = os.path.join(self.config['out_folder'], "img.npy")
            np.save(save_path, img.cpu().numpy())
            if batch_idx > 5:
                break
def create_testloader(self):
    """Return a DataLoader over the configured split and candidate pairs.

    The dataset is read from ``DEFAULT_DATAROOT/<self.dataset>`` using
    ``self.split`` and ``self.candidatespairs``; batches hold 100 examples
    and are loaded with 4 workers.
    """
    data_dir = os.path.join(self.DEFAULT_DATAROOT, self.dataset)
    testset = dset.Dataset(data_dir,
                           self.split,
                           pairs=self.candidatespairs,
                           klass=BasicTestingExample)
    return DataLoader(testset, batch_size=100, num_workers=4)
def setup_data(self):
    """Create ``self.trainloader`` and ``self.testloaders`` from ``self.opts``.

    * The train loader always covers the annotated train set.
    * With ``opts.train_size`` the dataset length is overridden so only the
      first N items are used.
    * With ``opts.do_validation`` one loader per test set is built (seen /
      zero-shot when ``opts.split_zeroshot`` is set, otherwise a single
      unified one), each delivering its whole set as one batch.
    * With ``opts.val`` the first test loader is replaced by one that
      samples only the leading ``val`` fraction of its dataset.
    * Without validation ``self.testloaders`` is an empty list.
    """
    opts = self.opts
    root = opts.dataroot

    # --- train set -------------------------------------------------------
    train_data = dset.Dataset(root, 'train', pairs='annotated')
    self.trainloader = dat.DataLoader(train_data,
                                      batch_size=opts.batch_size,
                                      shuffle=True,
                                      num_workers=4)

    if opts.train_size:
        # --N: report a shorter length so only the first N items are used.
        # NOTE(review): the override is installed on the class, not on this
        # instance, so it also applies to the test Dataset created below
        # (same class) -- confirm that this is intended.
        def train_size(unused):
            return self.opts.train_size

        train_data.__class__.__len__ = train_size

    # --- test set(s) -----------------------------------------------------
    if not opts.do_validation:  # --noval
        self.testloaders = []
        return

    test_data = dset.Dataset(root, 'test', pairs='annotated')
    if opts.split_zeroshot:
        # Separate loaders for the seen and zero-shot parts.
        parts = zeroshot.Splitter(train_data, test_data).split()
    else:
        # One unified test set.
        parts = [test_data]
    self.testloaders = [
        dat.DataLoader(part, batch_size=len(part), num_workers=NUM_WORKERS)
        for part in parts
    ]

    if opts.val:
        # Validate on only a fraction of the primary test set; the rest is
        # not used at this time.
        primary = self.testloaders[0].dataset
        n = int(len(primary) * opts.val)
        subset_sampler = dat.SubsetRandomSampler(torch.arange(n))
        self.testloaders[0] = dat.DataLoader(primary,
                                             batch_size=n,
                                             sampler=subset_sampler,
                                             num_workers=NUM_WORKERS)
def test_svm_gaussian(self):
    """An SVM trained on Gaussian-kernel features must fit its own data.

    The same four points are used for training and testing; the test inputs
    are normalised with the min/max obtained from the training inputs.
    """
    points = [[2, 0], [4, 0], [0, 2], [0, 4]]
    labels = [0, 1, 0, 1]
    to_indices = d.Dataset.transform_y_from_label_values_to_label_indices
    gaussian = svm.SupportVectorMachine.apply_gaussian_kernel

    # Training data: normalise, apply the kernel, convert labels to indices.
    x_train, train_min, train_max = d.Dataset().normalize(
        np.array(points), 5)
    x_train = gaussian(x_train, gamma=1)
    y_train = to_indices(np.array(labels), 2)

    classifier = svm.SupportVectorMachine(number_input_features=1,
                                          number_labels=2,
                                          delta=5,
                                          random_seed=0)
    classifier.train(x_train,
                     y_train,
                     learning_rate=1,
                     regularization_value=0.0,
                     iterations=10000)

    # Test data: identical points, pushed through the same pipeline.
    x_test, _, _ = d.Dataset().normalize(np.array(points), train_min,
                                         train_max)
    x_test = gaussian(x_test, gamma=1)
    y_test = to_indices(np.array(labels), 2)

    predicted = classifier.predict(x_test)
    _, percentage = classifier.get_predicted_correctly(predicted, y_test)
    assert percentage == 100
def test_svm_separates_classes_through_middle(self):
    """The SVM boundary should end up close to the middle of the classes.

    An SVM does not stop converging as soon as the classes are separated;
    it keeps going until the margin between them is large.  Training uses
    four points, testing uses four points much closer to the middle.
    Classifying those correctly means the boundary was placed near the
    middle (close to the maximum margin on both sides).
    """
    to_indices = d.Dataset.transform_y_from_label_values_to_label_indices

    train_points = np.array([[2, 0], [4, 0], [0, 2], [0, 4]])
    train_labels = np.array([0, 1, 0, 1])
    x_train, train_min, train_max = d.Dataset().normalize(train_points, 5)
    y_train = to_indices(train_labels, 2)

    classifier = svm.SupportVectorMachine(number_input_features=2,
                                          number_labels=2,
                                          delta=5,
                                          random_seed=0)
    classifier.train(x_train,
                     y_train,
                     learning_rate=0.1,
                     regularization_value=0.0,
                     iterations=10000)

    # Points just either side of 3, the midpoint between 2 and 4.
    test_points = np.array([[3.01, 0], [2.99, 0], [0, 2.99], [0, 3.01]])
    test_labels = np.array([1, 0, 0, 1])
    x_test, _, _ = d.Dataset().normalize(test_points, train_min, train_max)
    y_test = to_indices(test_labels, 2)

    predicted = classifier.predict(x_test)
    _, percentage = classifier.get_predicted_correctly(predicted, y_test)
    assert percentage == 100
def main(_):
    """Train an InceptionResnet model on the 'train' subset.

    The single positional argument is ignored.  Output locations and the
    tower name come from ``cfg.TRAIN``.
    """
    # Fixed: the original bound the result to a local called ``dataset``,
    # which made the name local to the function and raised UnboundLocalError
    # when ``dataset.Dataset`` was evaluated.
    train_data = dataset.Dataset(subset='train')
    assert train_data.data_files()

    ir_train = inception_resnet_v2.InceptionResnet('InceptionResnet')
    ir_train.train(train_data,
                   'train.ckpt',
                   0.001,
                   num_classes=2,
                   batch_size=32,
                   max_steps=1000000000000,
                   train_dir=cfg.TRAIN.train_dir,
                   tower_name=cfg.TRAIN.tower_name,
                   optname='adam',
                   decay=0.9,
                   momentum=0.9,
                   epsilon=0.0000008,
                   beta1=0.9,
                   beta2=0.999,
                   num_epoch_per_decay=30,
                   lr_decay_factor=0.16)
def build_train_and_test(x_train, y_train, with_bias):
    """Train a LinearClassifier and score it on its own training data.

    Args:
        x_train: training inputs; normalised before use.
        y_train: label values; converted to label indices for 2 labels.
        with_bias: forwarded to ``LinearClassifier``.

    Returns:
        The percentage reported by ``get_predicted_correctly`` for the
        predictions on the training inputs.
    """
    features, _, _ = d.Dataset().normalize(x_train)
    targets = d.Dataset.transform_y_from_label_values_to_label_indices(
        y_train, 2)

    layer_sizes = [features.shape[1], targets.shape[1]]
    classifier = LinearClassifier(layer_sizes,
                                  with_bias=with_bias,
                                  regularization_value=0.01,
                                  random_seed=0)
    classifier.train(features,
                     targets,
                     learning_rate=0.1,
                     iterations=500,
                     debug_enabled=True)

    # The network should be able to predict all training cases correctly.
    predictions = classifier.predict(features)
    _, percentage = classifier.get_predicted_correctly(predictions, targets)
    return percentage
def generate(self):
    """Turn the declarative ``self.config`` into live training objects.

    Works through the sections in order -- global, gpu, transform, dataset,
    model, optimizer and (optionally) megaface -- and replaces config
    entries such as ``config.model.backbone``, ``config.model.loss``,
    ``config.train.dataset`` or ``config.train.optimizer`` with the objects
    built from them.  Paths of generated list files are collected in
    ``self.delete_list`` and removed at the end when
    ``config.global_.auto_clear`` is set.

    Raises:
        AssertionError: if the result directory or a megaface output
            directory is not empty, or a megaface input directory is
            missing.
        RuntimeError: for an unknown backbone, loss or optimizer option.
    """
    print("=> generate config")
    self.delete_list = []
    config = self.config  # mutated in place, never rebound

    # ---------------------------------------------------------------- global
    self._print_title("global")
    # create model save dir
    save_dir = config.model.save_dir
    os.makedirs(save_dir, exist_ok=True)
    # save config (before its entries are replaced by live objects)
    config_path = os.path.join(save_dir, "config.txt")
    with open(config_path, "w") as f:
        json.dump(config, f, indent=2)
    print("config saved in " + config_path)
    # create result dir; it must start out empty
    result_dir = os.path.join("tmp", config.global_.result_dir)
    os.makedirs(result_dir, exist_ok=True)
    assert not os.listdir(result_dir), f"{result_dir} is not empty!"
    # Prefix used for the generated list files (kept as a plain string so
    # the recorded paths are identical to the ones used before).
    list_prefix = f"tmp/{config.global_.result_dir}"

    # ------------------------------------------------------------------- gpu
    self._print_title("gpu")
    os.environ['CUDA_VISIBLE_DEVICES'] = config.global_.gpu
    print("Use", torch.cuda.device_count(), "GPUs!")
    if torch.cuda.is_available():
        config.global_.device = "cuda"
        cudnn.benchmark = True
        print("cuda available, set config.global_.device to cuda")
    else:
        config.global_.device = "cpu"
        print("cuda not available, set config.global_.device to cpu")

    # ------------------------------------------------------------- transform
    self._print_title("transform")
    transform_opts = config.train.transform
    image_size = config.model.input_size[1:]
    # train
    train_transform_list = [transforms.Resize(image_size)]
    if transform_opts['random_horizontal_flip']:
        train_transform_list.append(transforms.RandomHorizontalFlip())
    train_transform_list.extend(
        [transforms.Grayscale(), transforms.ToTensor()])
    if transform_opts['normalize']:
        train_transform_list.append(
            transforms.Normalize(mean=[0.5], std=[0.5]))
    train_transform = transforms.Compose(train_transform_list)
    print(f"train_transform:\n{train_transform}")
    # test (same as train but never flipped)
    test_transform_list = [
        transforms.Resize(image_size),
        transforms.Grayscale(),
        transforms.ToTensor(),
    ]
    if transform_opts['normalize']:
        test_transform_list.append(
            transforms.Normalize(mean=[0.5], std=[0.5]))
    test_transform = transforms.Compose(test_transform_list)
    print(f"test_transform:\n{test_transform}")
    config.train.transform = train_transform
    config.test.transform = test_transform

    # --------------------------------------------------------------- dataset
    self._print_title("dataset")
    # train
    image_list, class_name_map = dataset.list_all_image(
        config.train.dataset.path, f"{list_prefix}/webface.txt")
    train_dataset = dataset.Dataset(image_list,
                                    class_name_path=class_name_map,
                                    transform=config.train.transform)
    train_dataset_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.train.batch_size, shuffle=True)
    print(f"train_dataset: {train_dataset}")
    print(f"image_list file path: {image_list}")
    print(f"class_name_map file path: {class_name_map}")
    config.train.dataset = train_dataset
    config.train.loader = train_dataset_loader
    self.delete_list.extend([image_list, class_name_map])
    # test
    test_dataset_dict = edict()
    test_loader_dict = edict()
    for test_pair in config.test.dataset:
        test_dataset = dataset.Dataset(test_pair['path'],
                                       transform=config.test.transform)
        test_dataset_loader = torch.utils.data.DataLoader(
            test_dataset, batch_size=config.test.batch_size, shuffle=False)
        test_dataset_dict[test_pair['name']] = test_dataset
        test_loader_dict[test_pair['name']] = test_dataset_loader
        print(f"{test_pair['name']}: {test_dataset}")
    config.test.dataset = test_dataset_dict
    config.test.loader = test_loader_dict

    # ----------------------------------------------------------------- model
    self._print_title("model")
    # backbone: option name -> class name in ``resnet_ir`` (resolved lazily
    # so only the selected class is looked up)
    backbone_classes = {
        "resnet_18_ir": "ResNet18_IR",
        "resnet_34_ir": "ResNet34_IR",
        "resnet_50_ir": "ResNet50_IR",
    }
    backbone_option = config.model.backbone
    backbone_class = (backbone_classes.get(backbone_option)
                      if isinstance(backbone_option, str) else None)
    if backbone_class is None:
        raise RuntimeError(f"Invalid Backbone Option {backbone_option}")
    config.model.backbone = getattr(resnet_ir, backbone_class)(
        config.model.input_size, config.model.feature_dim)
    config.model.backbone.to(config.global_.device)
    if config.global_.device == "cuda":
        config.model.backbone = torch.nn.DataParallel(config.model.backbone)
    structure_path = os.path.join(save_dir, "model_structure.txt")
    with open(structure_path, "w") as f:
        f.write(str(config.model.backbone))
    print("model structure saved in " + structure_path)

    # loss function: option name -> class name in ``loss``; the option
    # spellings are the ones existing configs use and are kept verbatim
    loss_classes = {
        "arcloss": "ArcLoss",
        "cosloss": "CosLoss",
        "mixloss": "MixLoss",
        "sphereloss": "SphereLoss",
        "normsoftmaxsloss": "NormSoftmaxLoss",
        "softmaxsloss": "SoftmaxLoss",
    }
    loss_option = config.model.loss
    loss_class = (loss_classes.get(loss_option)
                  if isinstance(loss_option, str) else None)
    if loss_class is None:
        raise RuntimeError(f"Invalid Loss Option {loss_option}")
    config.model.loss = getattr(loss, loss_class)(
        config.model.feature_dim, len(config.train.dataset.classes),
        **config.model.loss_param)
    config.model.loss.to(config.global_.device)
    if config.global_.device == "cuda":
        config.model.loss = torch.nn.DataParallel(config.model.loss)
    print(f"Loss: {config.model.loss}")
    backbone_params = sum(
        p.numel() for p in config.model.backbone.parameters())
    loss_params = sum(p.numel() for p in config.model.loss.parameters())
    print(f"model param {backbone_params/1024/1024:.8f}M "
          f"margin param {loss_params/1024/1024:.8f}M")

    # ------------------------------------------------------------- optimizer
    self._print_title("optimizer")
    if config.train.optimizer == "SGD":
        optimizer = torch.optim.SGD(
            [{
                'params': config.model.backbone.parameters()
            }, {
                'params': config.model.loss.parameters()
            }],
            lr=1e-5,
            weight_decay=config.train.weight_decay,
            momentum=config.train.momentum)
    else:
        raise RuntimeError(
            f"Invalid Optimizer Option {config.train.optimizer}")
    print(f"optimizer: {optimizer}")
    config.train.optimizer = optimizer

    # -------------------------------------------------------------- megaface
    if config.megaface.enable:
        self._print_title("megaface")
        megaface = config.megaface
        # init: input directories must exist, output directories must be
        # empty
        assert os.path.isdir(
            megaface.devkit_dir
        ), f"{megaface.devkit_dir} not exist or is not a directory!"
        assert os.path.isdir(
            megaface.megaface_dataset_dir
        ), f"{megaface.megaface_dataset_dir} not exist or is not a directory!"
        # Fixed: the message used the undefined bare name
        # ``facescrub_dataset_dir`` and raised NameError instead of the
        # intended AssertionError.
        assert os.path.isdir(
            megaface.facescrub_dataset_dir
        ), f"{megaface.facescrub_dataset_dir} not exist or is not a directory!"
        os.makedirs(megaface.feature_save_dir, exist_ok=True)
        os.makedirs(megaface.result_save_dir, exist_ok=True)
        assert not os.listdir(
            megaface.feature_save_dir
        ), f"{megaface.feature_save_dir} is not empty!"
        assert not os.listdir(
            megaface.result_save_dir
        ), f"{megaface.result_save_dir} is not empty!"

        # probe
        if megaface.no_noise:
            probe_list_file = os.path.join(
                megaface.devkit_dir,
                "templatelists/facescrub_features_list_no_noise.json")
        else:
            probe_list_file = os.path.join(
                megaface.devkit_dir,
                "templatelists/facescrub_features_list.json")
        with open(probe_list_file) as f:
            probe_list = json.load(f)['path']
        # Write one absolute image path per line; ``with`` guarantees the
        # file is closed even if writing fails.
        probe_path = f"{list_prefix}/probe.txt"
        with open(probe_path, 'w') as probe_file:
            probe_file.writelines(
                os.path.join(megaface.facescrub_dataset_dir, line) + '\n'
                for line in probe_list)
        self.delete_list.append(probe_path)
        probe_dataset = dataset.Dataset(probe_path,
                                        transform=config.test.transform)
        print(len(probe_dataset))
        probe_dataset_loader = torch.utils.data.DataLoader(
            probe_dataset, batch_size=config.test.batch_size, shuffle=False)
        print(len(probe_dataset_loader))
        probe = edict()
        probe.file = probe_list_file
        probe.dataset = probe_dataset
        probe.loader = probe_dataset_loader
        megaface.probe = probe
        print(f"probe dataset: {megaface.probe.dataset}")

        # distractor: one list file / dataset / loader per configured scale
        megaface.distractor = edict()
        for s in megaface.scale:
            if megaface.no_noise:
                distractor_list_file = os.path.join(
                    megaface.devkit_dir,
                    f"templatelists/megaface_features_list_no_noise.json_{s}_1"
                )
            else:
                distractor_list_file = os.path.join(
                    megaface.devkit_dir,
                    f"templatelists/megaface_features_list.json_{s}_1")
            with open(distractor_list_file) as f:
                distractor_list = json.load(f)['path']
            distractor_path = f"{list_prefix}/distractor_{s}.txt"
            with open(distractor_path, 'w') as distractor_file:
                distractor_file.writelines(
                    os.path.join(megaface.megaface_dataset_dir, line) + '\n'
                    for line in distractor_list)
            self.delete_list.append(distractor_path)
            distractor_dataset = dataset.Dataset(
                distractor_path, transform=config.test.transform)
            distractor_dataset_loader = torch.utils.data.DataLoader(
                distractor_dataset,
                batch_size=config.test.batch_size,
                shuffle=False)
            distractor = edict()
            distractor.file = distractor_list_file
            distractor.dataset = distractor_dataset
            distractor.loader = distractor_dataset_loader
            megaface.distractor[str(s)] = distractor
            print(f"distractor dataset scale {s}: {distractor_dataset}")

    # ---------------------------------------------------------------- delete
    self._print_title("delete list")
    for line in self.delete_list:
        print(line)
        if config.global_.auto_clear:
            os.remove(line)
    print("Remark: The generated files above will be auto cleared if set")
    print()
def recall_from_matlab(self, model):
    """Score both test sets, dump the scores to .mat and collect recalls.

    The given model (supplied by the runner) is installed via
    ``update_model`` and put in eval mode.  For each setting
    (``'annotated'`` then ``'Lu-candidates'``, always in this order) the
    test set is run through the model on the GPU and the concatenated
    scores are written to ``SCORES_PATH/<setting>.mat`` under the key
    ``'scores'``.  Afterwards ``UNREL_PATH/run_recall.sh`` is executed and
    the recall figures are parsed from the tail of its output.

    Args:
        model: the model to evaluate.

    Returns:
        dict mapping ``'seen_predicate'``, ``'seen_phrase'``,
        ``'seen_relationship'``, ``'unseen_predicate'``, ``'unseen_phrase'``
        and ``'unseen_relationship'`` to the value strings printed by the
        script.

    Raises:
        IndexError: if fewer than six recall values are found in the script
            output.
    """
    # update model and put in eval() mode
    self.update_model(model)
    self.model.eval()

    # settings, should always be in this order for testing
    settings = ['annotated', 'Lu-candidates']
    data_dir = os.path.join(self.DEFAULT_DATAROOT, self.dataset)

    for setting in settings:
        testset = dset.Dataset(data_dir,
                               'test',
                               pairs=setting,
                               klass=BasicTestingExample)
        loader = DataLoader(testset, batch_size=100, num_workers=4)

        # Run prediction batch by batch, accumulating the scores on the CPU.
        for testbatch in loader:
            with torch.no_grad():
                scores = self.model(testbatch['X'].cuda())
            cur_prediction = self.prediction.get(setting, None)
            if isinstance(cur_prediction, torch.Tensor):
                self.prediction[setting] = torch.cat(
                    (cur_prediction, scores.cpu()), 0)
            else:
                self.prediction[setting] = scores.cpu()

        scores_np = self.prediction[setting].data.numpy()
        scipy.io.savemat(os.path.join(self.SCORES_PATH, f'{setting}.mat'),
                         {'scores': scores_np})
        # Drop the accumulated scores once they are on disk.
        self.prediction = {}

    # use subprocess to run
    print('starting matlab...')
    # The context manager closes the pipes and waits for the child on exit,
    # so the process is always reaped (the original never waited on it).
    with Popen(
            f"{self.UNREL_PATH}/run_recall.sh baseline full {self.SCORES_PATH}",
            shell=True,
            stdin=PIPE,
            stdout=PIPE,
            stderr=STDOUT,
            close_fds=True) as rc:
        rc_out = str(rc.stdout.read(), 'utf-8')

    # The recall figures are the last token of the trailing output lines;
    # lines whose last token starts with 'z' are skipped, as are blank
    # lines.
    results = []
    for line in rc_out.split('\n')[-8:-1]:
        tokens = line.split()
        if not tokens:
            continue
        value = tokens[-1]
        if value[0] != 'z':
            results.append(value)

    keys = [
        'seen_predicate',
        'seen_phrase',
        'seen_relationship',
        'unseen_predicate',
        'unseen_phrase',
        'unseen_relationship',
    ]
    if len(keys) > len(results):
        raise IndexError(
            f"expected {len(keys)} recall values in run_recall.sh output, "
            f"found {len(results)}")
    # zip stops after the six keys, so surplus values are ignored exactly
    # as before.
    return dict(zip(keys, results))