def main(args, _=None):
    """Encode every image listed in ``args.in_csv`` with a ResNet encoder
    and dump the stacked feature matrix to ``args.out_npy``.

    NOTE(review): ``IMG_SIZE`` is published as a module global — presumably
    consumed by ``dict_transformer``; confirm against that helper.
    """
    global IMG_SIZE
    IMG_SIZE = (args.img_size, args.img_size)

    encoder = ResnetEncoder(arch=args.arch, pooling=args.pooling)
    encoder = encoder.eval()
    encoder, device = UtilsFactory.prepare_model(encoder)

    # CSV rows -> list of per-row dicts for the loader.
    frame = pd.read_csv(args.in_csv)
    frame = frame.reset_index().drop("index", axis=1)
    rows = list(frame.to_dict("index").values())

    open_fn = ImageReader(
        input_key=args.img_col, output_key="image", datapath=args.datapath)

    loader = UtilsFactory.create_loader(
        rows,
        open_fn,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dict_transform=dict_transformer)
    if args.verbose:
        loader = tqdm(loader)

    chunks = []
    with torch.no_grad():
        for batch in loader:
            out = encoder(batch["image"].to(device))
            chunks.append(out.cpu().detach().numpy())

    np.save(args.out_npy, np.concatenate(chunks, axis=0))
def main(args):
    """Tag each image in ``args.in_csv`` with its top-``n_keywords`` predicted
    labels and write the input CSV back out with a new keywords column.

    Fixes: the labels file was read via a bare ``open(...).read()`` which
    leaked the file handle — now opened in a context manager and parsed with
    ``json.load``. Also reads ``args.in_csv`` once instead of twice.
    """
    net = models.__dict__[args.arch](pretrained=True)
    net = net.eval()
    net, device = UtilsFactory.prepare_model(net)

    # Class-index -> label mapping used to turn model outputs into keywords.
    with open(args.labels) as fin:
        labels = json.load(fin)

    i2k = Images2Keywords(net, args.n_keywords, labels)

    # Single CSV read: ``out_df`` receives the new column at the end,
    # ``rows`` feeds the loader (reset_index/drop return new frames,
    # so ``out_df`` is untouched).
    out_df = pd.read_csv(args.in_csv)
    rows_df = out_df.reset_index().drop("index", axis=1)
    rows = list(rows_df.to_dict("index").values())

    open_fn = ImageReader(
        input_key=args.img_col, output_key="image", datapath=args.datapath)

    loader = UtilsFactory.create_loader(
        rows,
        open_fn,
        batch_size=args.batch_size,
        workers=args.n_workers,
        dict_transform=dict_transformer)
    if args.verbose:
        loader = tqdm(loader)

    keywords = []
    with torch.no_grad():
        for batch in loader:
            keywords += i2k(batch["image"].to(device))

    out_df[args.keywords_col] = keywords
    out_df.to_csv(args.out_csv, index=False)
def main(args):
    """Embed the text column of ``args.in_csv`` with fasttext or word2vec
    vectors and save the stacked feature matrix to ``args.out_npy``."""
    records = pd.read_csv(args.in_csv)
    records = records.reset_index().drop("index", axis=1)
    records = list(records.to_dict("index").values())

    # Choose the embedding backend; exactly one model path must be supplied.
    if args.fasttext_model is not None:
        encode_fn = create_fasttext_encode_fn(
            args.fasttext_model, normalize=args.normalize)
    elif args.w2v_model is not None:
        encode_fn = create_gensim_encode_fn(
            args.w2v_model, sep=args.txt_sep, normalize=args.normalize)
    else:
        raise NotImplementedError

    open_fn = LambdaReader(
        input_key=args.txt_col, output_key="txt", encode_fn=encode_fn)

    loader = UtilsFactory.create_loader(
        records,
        open_fn,
        batch_size=args.batch_size,
        workers=args.n_workers)
    if args.verbose:
        loader = tqdm(loader)

    # No model forward pass here — the reader already produced the vectors.
    chunks = [batch["txt"] for batch in loader]
    np.save(args.out_npy, np.concatenate(chunks, axis=0))
def create_loaders(self, train_df, val_df):
    """Build the train and valid loaders over the given dataframes.

    NOTE(review): the validation loader is also created with shuffle=True —
    confirm this is intentional.
    """
    loaders = collections.OrderedDict()
    for name, frame in (("train", train_df), ("valid", val_df)):
        loaders[name] = UtilsFactory.create_loader(
            frame,
            open_fn=self.get_input_pair,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=True)
    return loaders
def create_test_loaders(self, test_df):
    """Build the test loader over ``test_df``.

    Fixes: the constructed ``test_loader`` was discarded and the raw
    dataframe was stored under ``loaders['test']`` instead — callers
    iterating the "loader" would have received dataframe rows, not batches.

    NOTE(review): shuffle=True on a test loader is unusual but kept to match
    the original call — confirm whether shuffle=False is intended.
    """
    test_loader = UtilsFactory.create_loader(
        test_df,
        open_fn=self.get_input_pair,
        batch_size=self.batch_size,
        num_workers=self.num_workers,
        shuffle=True)
    loaders = collections.OrderedDict()
    loaders['test'] = test_loader
    return loaders
def create_loaders(self, train_df, val_df):
    """Build class-balanced train and valid loaders.

    Rows with ``mask_pxl == 0`` form one class, all others the second; a
    BalanceClassSampler upsamples the minority class in each split.
    """
    loaders = collections.OrderedDict()
    for name, frame in (("train", train_df), ("valid", val_df)):
        # 1 marks empty-mask rows, 0 the rest (``* 1`` coerces the bool).
        labels = [(row["mask_pxl"] == 0) * 1 for row in frame]
        sampler = BalanceClassSampler(labels, mode="upsampling")
        loaders[name] = UtilsFactory.create_loader(
            frame,
            open_fn=self.get_input_pair,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=sampler is None,
            sampler=sampler)
    return loaders
def prepare_loaders(*, mode: str, stage: str = None, n_workers: int = None,
                    batch_size: int = None, datapath=None, in_csv=None,
                    in_csv_train=None, in_csv_valid=None, in_csv_infer=None,
                    train_folds=None, valid_folds=None, tag2class=None,
                    class_column=None, tag_column=None, folds_seed=42,
                    n_folds=5):
    """Build train/valid/infer loaders from CSV descriptions.

    CSV parsing and fold splitting are delegated to ``parse_in_csvs``; each
    split gets a loader only when its dataframe is non-empty. The train split
    is class-balanced with an upsampling sampler.
    """
    loaders = collections.OrderedDict()

    df, df_train, df_valid, df_infer = parse_in_csvs(
        in_csv=in_csv,
        in_csv_train=in_csv_train,
        in_csv_valid=in_csv_valid,
        in_csv_infer=in_csv_infer,
        train_folds=train_folds,
        valid_folds=valid_folds,
        tag2class=tag2class,
        class_column=class_column,
        tag_column=tag_column,
        folds_seed=folds_seed,
        n_folds=n_folds)

    # Each sample dict gets an "image" and an integer "targets" entry.
    open_fn = ReaderCompose(readers=[
        ImageReader(
            input_key="filepath", output_key="image", datapath=datapath),
        ScalarReader(
            input_key="class", output_key="targets",
            default_value=-1, dtype=np.int64),
    ])

    def _log_sizes(title, loader):
        # Shared size printout for every split.
        print(title + " samples", len(loader) * batch_size)
        print(title + " batches", len(loader))

    if len(df_train) > 0:
        labels = [row["class"] for row in df_train]
        sampler = BalanceClassSampler(labels, mode="upsampling")
        train_loader = UtilsFactory.create_loader(
            data_source=df_train,
            open_fn=open_fn,
            dict_transform=DataSource.prepare_transforms(
                mode="train", stage=stage),
            dataset_cache_prob=-1,
            batch_size=batch_size,
            workers=n_workers,
            shuffle=sampler is None,
            sampler=sampler)
        _log_sizes("Train", train_loader)
        loaders["train"] = train_loader

    if len(df_valid) > 0:
        valid_loader = UtilsFactory.create_loader(
            data_source=df_valid,
            open_fn=open_fn,
            dict_transform=DataSource.prepare_transforms(
                mode="valid", stage=stage),
            dataset_cache_prob=-1,
            batch_size=batch_size,
            workers=n_workers,
            shuffle=False,
            sampler=None)
        _log_sizes("Valid", valid_loader)
        loaders["valid"] = valid_loader

    if len(df_infer) > 0:
        # Inference transforms are built with stage=None regardless of ``stage``.
        infer_loader = UtilsFactory.create_loader(
            data_source=df_infer,
            open_fn=open_fn,
            dict_transform=DataSource.prepare_transforms(
                mode="infer", stage=None),
            dataset_cache_prob=-1,
            batch_size=batch_size,
            workers=n_workers,
            shuffle=False,
            sampler=None)
        _log_sizes("Infer", infer_loader)
        loaders["infer"] = infer_loader

    return loaders
def prepare_loaders(
        *,
        mode: str,
        stage: str = None,
        n_workers: int = None,
        batch_size: int = None,
        datapath=None,
        in_csv=None,
        in_csv_train=None,
        in_csv_valid=None,
        in_csv_infer=None,
        train_folds=None,
        valid_folds=None,
        tag2class=None,
        class_column=None,
        tag_column=None,
        folds_seed=42,
        n_folds=5):
    """Build train/valid/infer loaders from CSV descriptions.

    ``parse_in_csvs`` splits the input CSV(s) into train/valid/infer frames;
    a loader is created only for non-empty splits. The train loader is
    class-balanced with an upsampling BalanceClassSampler.
    """
    loaders = collections.OrderedDict()

    df, df_train, df_valid, df_infer = parse_in_csvs(
        in_csv=in_csv,
        in_csv_train=in_csv_train,
        in_csv_valid=in_csv_valid,
        in_csv_infer=in_csv_infer,
        train_folds=train_folds,
        valid_folds=valid_folds,
        tag2class=tag2class,
        class_column=class_column,
        tag_column=tag_column,
        folds_seed=folds_seed,
        n_folds=n_folds)

    # NOTE(review): this variant passes ``row_key``/``dict_key`` to the
    # readers while a sibling prepare_loaders in this file uses
    # ``input_key``/``output_key`` — presumably a different reader API
    # version; confirm which signature the installed readers expect.
    open_fn = [
        ImageReader(
            row_key="filepath", dict_key="image", datapath=datapath),
        ScalarReader(
            row_key="class",
            dict_key="targets",
            default_value=-1,
            dtype=np.int64)
    ]
    open_fn = ReaderCompose(readers=open_fn)

    if len(df_train) > 0:
        # Balance classes by upsampling; sampler is always set here, so
        # shuffle evaluates to False (sampler and shuffle are exclusive).
        labels = [x["class"] for x in df_train]
        sampler = BalanceClassSampler(labels, mode="upsampling")
        train_loader = UtilsFactory.create_loader(
            data_source=df_train,
            open_fn=open_fn,
            dict_transform=DataSource.prepare_transforms(
                mode="train", stage=stage),
            dataset_cache_prob=-1,
            batch_size=batch_size,
            workers=n_workers,
            shuffle=sampler is None,
            sampler=sampler)
        print("Train samples", len(train_loader) * batch_size)
        print("Train batches", len(train_loader))
        loaders["train"] = train_loader

    if len(df_valid) > 0:
        # No sampler and no shuffling for validation.
        sampler = None
        valid_loader = UtilsFactory.create_loader(
            data_source=df_valid,
            open_fn=open_fn,
            dict_transform=DataSource.prepare_transforms(
                mode="valid", stage=stage),
            dataset_cache_prob=-1,
            batch_size=batch_size,
            workers=n_workers,
            shuffle=False,
            sampler=sampler)
        print("Valid samples", len(valid_loader) * batch_size)
        print("Valid batches", len(valid_loader))
        loaders["valid"] = valid_loader

    if len(df_infer) > 0:
        # Inference transforms are built with stage=None regardless of ``stage``.
        infer_loader = UtilsFactory.create_loader(
            data_source=df_infer,
            open_fn=open_fn,
            dict_transform=DataSource.prepare_transforms(
                mode="infer", stage=None),
            dataset_cache_prob=-1,
            batch_size=batch_size,
            workers=n_workers,
            shuffle=False,
            sampler=None)
        print("Infer samples", len(infer_loader) * batch_size)
        print("Infer batches", len(infer_loader))
        loaders["infer"] = infer_loader

    return loaders
def prepare_loaders(
        *,
        mode,
        stage=None,
        n_workers=None,
        batch_size=None,
        train_folder=None,  # all train data, folder with files like 00fj49fd.jpg [.pth]
        train_csv=None,  # csv with whale ids
        train_ext=".jpg",  # replace extension of train files with train_ext if needed
        infer_folder=None,  # all test images, if None - dont create infer loader
        folds_seed=42,
        n_folds=5,
        train_folds=None,
        valid_folds=None,
):
    """Build siamese train/valid/infer loaders.

    ``parse_train_csv`` splits the labeled data into train/valid lists; a
    SiameseSampler draws pair indices over the concatenated lists, so the
    index bookkeeping below (train indices first, then valid/infer offsets)
    must match the concatenation order passed as ``data_source``.
    """
    loaders = collections.OrderedDict()

    all_list, train_list, valid_list = parse_train_csv(
        train_csv,
        train_ext=train_ext,
        folds_seed=folds_seed,
        n_folds=n_folds,
        train_folds=train_folds,
        valid_folds=valid_folds)

    train_len = len(train_list)
    train_labels = [x["Id"] for x in train_list]
    train_idxs = list(range(train_len))

    valid_len = len(valid_list)
    valid_labels = [x["Id"] for x in valid_list]
    # Valid samples sit after the train samples in ``train_list + valid_list``.
    valid_idxs = list(range(train_len, train_len + valid_len))

    # train on train-train samples
    if train_len > 0:
        sampler = SiameseSampler(
            mode="train",
            train_idxs=train_idxs,
            train_labels=train_labels,
            size=train_len,
        )
        loader = UtilsFactory.create_loader(
            data_source=np.array(train_list),  # wrap in ndarray to enable indexing with list
            open_fn=SiameseDataSource._get_train_open_fn(train_folder),
            dict_transform=SiameseDataSource.prepare_transforms(
                mode="train", stage=stage),
            dataset_cache_prob=-1,
            batch_size=batch_size,
            workers=n_workers,
            shuffle=False,  # ordering is fully controlled by the sampler
            sampler=sampler,
        )
        print("train samples:", len(loader) * batch_size)
        print("train batches:", len(loader))
        loaders["train"] = loader

    if len(valid_list) > 0:
        # Valid pairs are drawn across train+valid, hence the combined source.
        sampler = SiameseSampler(
            mode="valid",
            train_idxs=train_idxs,
            train_labels=train_labels,
            valid_idxs=valid_idxs,
            valid_labels=valid_labels,
            size=valid_len,
        )
        loader = UtilsFactory.create_loader(
            data_source=np.array(train_list + valid_list),  # wrap in ndarray to enable indexing with list
            open_fn=SiameseDataSource._get_train_open_fn(train_folder),
            dict_transform=SiameseDataSource.prepare_transforms(
                mode="valid", stage=stage),
            dataset_cache_prob=-1,
            batch_size=batch_size,
            workers=n_workers,
            shuffle=False,
            sampler=sampler,
        )
        print("valid samples:", len(loader) * batch_size)
        print("valid batches:", len(loader))
        loaders["valid"] = loader

    if infer_folder is not None:
        infer_list = parse_infer_folder(infer_folder)
        # NOTE(review): ``all_labels`` is computed but never used below —
        # possibly meant to be passed to the infer SiameseSampler; confirm.
        all_labels = [x["Id"] for x in all_list]
        all_len = len(all_list)
        infer_len = len(infer_list)
        # Infer samples are indexed after every labeled sample.
        sampler = SiameseSampler(
            mode="infer",
            train_idxs=list(range(all_len)),
            infer_idxs=list(range(all_len, all_len + infer_len))
        )
        loader = UtilsFactory.create_loader(
            data_source=np.array(all_list + infer_list),
            open_fn=SiameseDataSource._get_infer_open_fn(train_folder, infer_folder),
            dict_transform=SiameseDataSource.prepare_transforms(
                mode="infer", stage=stage),
            dataset_cache_prob=-1,
            batch_size=batch_size,
            workers=n_workers,
            shuffle=False,
            sampler=sampler,
        )
        print("infer samples:", len(loader) * batch_size)
        print("infer batches:", len(loader))
        loaders["infer"] = loader

    return loaders