Example #1
def main(args, _=None):
    global IMG_SIZE

    IMG_SIZE = (args.img_size, args.img_size)

    model = ResnetEncoder(arch=args.arch, pooling=args.pooling)
    model = model.eval()
    model, device = UtilsFactory.prepare_model(model)

    images_df = pd.read_csv(args.in_csv)
    images_df = images_df.reset_index().drop("index", axis=1)
    images_df = list(images_df.to_dict("index").values())

    open_fn = ImageReader(
        input_key=args.img_col, output_key="image", datapath=args.datapath
    )

    dataloader = UtilsFactory.create_loader(
        images_df,
        open_fn,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dict_transform=dict_transformer
    )

    features = []
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for batch in dataloader:
            features_ = model(batch["image"].to(device))
            features_ = features_.cpu().detach().numpy()
            features.append(features_)

    features = np.concatenate(features, axis=0)
    np.save(args.out_npy, features)
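This snippet leans on two names it does not define: the global IMG_SIZE and the dict_transform callable dict_transformer, which IMG_SIZE is presumably consumed by. A minimal sketch of what such a transform could look like, assuming it only resizes the decoded image and converts it to a CHW float tensor (the real project code may do more, e.g. normalization or augmentation):

import cv2
import numpy as np
import torch

def dict_transformer(row):
    # Hypothetical per-sample transform: resize to the global IMG_SIZE
    # and convert HWC uint8 -> CHW float32 in [0, 1].
    image = cv2.resize(row["image"], IMG_SIZE, interpolation=cv2.INTER_LINEAR)
    image = torch.from_numpy(image.astype(np.float32) / 255.0)
    row["image"] = image.permute(2, 0, 1)
    return row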
Example #2
def main(args):
    model = models.__dict__[args.arch](pretrained=True)
    model = model.eval()
    model, device = UtilsFactory.prepare_model(model)

    with open(args.labels) as f:
        labels = json.load(f)

    i2k = Images2Keywords(model, args.n_keywords, labels)

    images_df = pd.read_csv(args.in_csv)
    images_df = images_df.reset_index().drop("index", axis=1)
    images_df = list(images_df.to_dict("index").values())

    open_fn = ImageReader(input_key=args.img_col,
                          output_key="image",
                          datapath=args.datapath)

    dataloader = UtilsFactory.create_loader(images_df,
                                            open_fn,
                                            batch_size=args.batch_size,
                                            workers=args.n_workers,
                                            dict_transform=dict_transformer)

    keywords = []
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for batch in dataloader:
            keywords_batch = i2k(batch["image"].to(device))
            keywords += keywords_batch

    input_csv = pd.read_csv(args.in_csv)
    input_csv[args.keywords_col] = keywords
    input_csv.to_csv(args.out_csv, index=False)
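Images2Keywords is used here as a black box that turns a batch of images into keyword strings, which are then appended per row to the output CSV. A rough, hypothetical re-implementation to show the intended contract (the actual class may differ in details such as label-key handling):

import torch.nn.functional as F

class Images2Keywords:
    # Hypothetical sketch: softmax over the classifier logits, then keep
    # the n most probable labels per image, joined into one keyword string.
    def __init__(self, model, n_keywords, labels):
        self.model = model
        self.n_keywords = n_keywords
        self.labels = labels  # assumed: index -> label name, loaded from JSON

    def __call__(self, images):
        probs = F.softmax(self.model(images), dim=1)
        top = probs.topk(self.n_keywords, dim=1).indices.cpu()
        return [
            ",".join(self.labels[str(i)] for i in row.tolist())
            for row in top
        ]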
Example #3
def main(args):
    images_df = pd.read_csv(args.in_csv)
    images_df = images_df.reset_index().drop("index", axis=1)
    images_df = list(images_df.to_dict("index").values())

    if args.fasttext_model is not None:
        encode_fn = create_fasttext_encode_fn(args.fasttext_model,
                                              normalize=args.normalize)
    elif args.w2v_model is not None:
        encode_fn = create_gensim_encode_fn(args.w2v_model,
                                            sep=args.txt_sep,
                                            normalize=args.normalize)
    else:
        raise NotImplementedError

    open_fn = LambdaReader(input_key=args.txt_col,
                           output_key="txt",
                           encode_fn=encode_fn)

    dataloader = UtilsFactory.create_loader(images_df,
                                            open_fn,
                                            batch_size=args.batch_size,
                                            workers=args.n_workers)

    features = []
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    for batch in dataloader:
        features_ = batch["txt"]
        features.append(features_)

    features = np.concatenate(features, axis=0)
    np.save(args.out_npy, features)
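Both encode_fn factories are external to this snippet. A plausible sketch of create_gensim_encode_fn, assuming it mean-pools word vectors from a gensim KeyedVectors model (the name is from the snippet, but the behavior shown is an assumption, not the project's actual code):

import numpy as np

def create_gensim_encode_fn(w2v_path, sep=" ", normalize=True):
    # Assumed behavior: load a gensim KeyedVectors model and return a
    # function that encodes a text row as the mean of its word vectors.
    from gensim.models import KeyedVectors
    w2v = KeyedVectors.load(w2v_path)

    def encode_fn(text):
        tokens = [t for t in text.split(sep) if t in w2v]
        if not tokens:
            return np.zeros(w2v.vector_size, dtype=np.float32)
        vec = np.mean([w2v[t] for t in tokens], axis=0)
        if normalize:
            vec /= (np.linalg.norm(vec) + 1e-8)
        return vec.astype(np.float32)

    return encode_fn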
Example #4
    def create_loaders(self, train_df, val_df):
        train_loader = UtilsFactory.create_loader(train_df,
                                                  open_fn=self.get_input_pair,
                                                  batch_size=self.batch_size,
                                                  num_workers=self.num_workers,
                                                  shuffle=True)

        valid_loader = UtilsFactory.create_loader(val_df,
                                                  open_fn=self.get_input_pair,
                                                  batch_size=self.batch_size,
                                                  num_workers=self.num_workers,
                                                  shuffle=True)

        loaders = collections.OrderedDict()
        loaders['train'] = train_loader
        loaders['valid'] = valid_loader

        return loaders
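A small design note: the validation loader above is created with shuffle=True, which is unusual. Validation data is typically left unshuffled so metrics are computed over a deterministic order from epoch to epoch (compare the shuffle=False validation loaders in the later examples).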
Example #5
    def create_test_loaders(self, test_df):
        test_loader = UtilsFactory.create_loader(test_df,
                                                 open_fn=self.get_input_pair,
                                                 batch_size=self.batch_size,
                                                 num_workers=self.num_workers,
                                                 shuffle=False)  # evaluation order should stay deterministic

        loaders = collections.OrderedDict()
        loaders['test'] = test_loader
        return loaders
Example #6
    def create_loaders(self, train_df, val_df):
        labels = [(x["mask_pxl"] == 0) * 1 for x in train_df]
        sampler = BalanceClassSampler(labels, mode="upsampling")
        train_loader = UtilsFactory.create_loader(train_df,
                                                  open_fn=self.get_input_pair,
                                                  batch_size=self.batch_size,
                                                  num_workers=self.num_workers,
                                                  shuffle=sampler is None,
                                                  sampler=sampler)

        labels = [(x["mask_pxl"] == 0) * 1 for x in val_df]
        sampler = BalanceClassSampler(labels, mode="upsampling")
        valid_loader = UtilsFactory.create_loader(val_df,
                                                  open_fn=self.get_input_pair,
                                                  batch_size=self.batch_size,
                                                  num_workers=self.num_workers,
                                                  shuffle=sampler is None,
                                                  sampler=sampler)

        loaders = collections.OrderedDict()
        loaders['train'] = train_loader
        loaders['valid'] = valid_loader

        return loaders
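Two things are worth unpacking here. First, shuffle=sampler is None: PyTorch's DataLoader raises an error if shuffle=True is combined with a custom sampler, hence the guard. Second, BalanceClassSampler(mode="upsampling") rebalances the binary mask labels. A rough functional equivalent of that upsampling mode, for illustration only (catalyst's implementation may differ):

import numpy as np
from torch.utils.data import Sampler

class UpsamplingSampler(Sampler):
    # Illustrative stand-in for BalanceClassSampler(mode="upsampling"):
    # every class is resampled with replacement up to the size of the
    # largest class, so one epoch sees a balanced label distribution.
    def __init__(self, labels):
        self.labels = np.asarray(labels)
        self.classes, counts = np.unique(self.labels, return_counts=True)
        self.samples_per_class = int(counts.max())

    def __iter__(self):
        indices = []
        for c in self.classes:
            pool = np.flatnonzero(self.labels == c)
            indices.extend(np.random.choice(pool, self.samples_per_class))
        np.random.shuffle(indices)
        return iter(int(i) for i in indices)

    def __len__(self):
        return self.samples_per_class * len(self.classes)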
Example #7
    def prepare_loaders(*,
                        mode: str,
                        stage: str = None,
                        n_workers: int = None,
                        batch_size: int = None,
                        datapath=None,
                        in_csv=None,
                        in_csv_train=None,
                        in_csv_valid=None,
                        in_csv_infer=None,
                        train_folds=None,
                        valid_folds=None,
                        tag2class=None,
                        class_column=None,
                        tag_column=None,
                        folds_seed=42,
                        n_folds=5):
        loaders = collections.OrderedDict()

        df, df_train, df_valid, df_infer = parse_in_csvs(
            in_csv=in_csv,
            in_csv_train=in_csv_train,
            in_csv_valid=in_csv_valid,
            in_csv_infer=in_csv_infer,
            train_folds=train_folds,
            valid_folds=valid_folds,
            tag2class=tag2class,
            class_column=class_column,
            tag_column=tag_column,
            folds_seed=folds_seed,
            n_folds=n_folds)

        open_fn = [
            ImageReader(input_key="filepath",
                        output_key="image",
                        datapath=datapath),
            ScalarReader(input_key="class",
                         output_key="targets",
                         default_value=-1,
                         dtype=np.int64)
        ]
        open_fn = ReaderCompose(readers=open_fn)

        if len(df_train) > 0:
            labels = [x["class"] for x in df_train]
            sampler = BalanceClassSampler(labels, mode="upsampling")

            train_loader = UtilsFactory.create_loader(
                data_source=df_train,
                open_fn=open_fn,
                dict_transform=DataSource.prepare_transforms(mode="train",
                                                             stage=stage),
                dataset_cache_prob=-1,
                batch_size=batch_size,
                workers=n_workers,
                shuffle=sampler is None,
                sampler=sampler)

            print("Train samples", len(train_loader) * batch_size)
            print("Train batches", len(train_loader))
            loaders["train"] = train_loader

        if len(df_valid) > 0:
            sampler = None

            valid_loader = UtilsFactory.create_loader(
                data_source=df_valid,
                open_fn=open_fn,
                dict_transform=DataSource.prepare_transforms(mode="valid",
                                                             stage=stage),
                dataset_cache_prob=-1,
                batch_size=batch_size,
                workers=n_workers,
                shuffle=False,
                sampler=sampler)

            print("Valid samples", len(valid_loader) * batch_size)
            print("Valid batches", len(valid_loader))
            loaders["valid"] = valid_loader

        if len(df_infer) > 0:
            infer_loader = UtilsFactory.create_loader(
                data_source=df_infer,
                open_fn=open_fn,
                dict_transform=DataSource.prepare_transforms(mode="infer",
                                                             stage=None),
                dataset_cache_prob=-1,
                batch_size=batch_size,
                workers=n_workers,
                shuffle=False,
                sampler=None)

            print("Infer samples", len(infer_loader) * batch_size)
            print("Infer batches", len(infer_loader))
            loaders["infer"] = infer_loader

        return loaders
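The open_fn here composes two readers: ImageReader decodes the image from the "filepath" column, and ScalarReader pulls the integer class label into "targets". A rough sketch of what a ReaderCompose-style combinator does, for illustration (catalyst's actual implementation may differ):

class ReaderCompose:
    # Illustrative sketch: run every reader on the same CSV row and
    # merge their per-key outputs into a single sample dict.
    def __init__(self, readers):
        self.readers = readers

    def __call__(self, row):
        sample = {}
        for reader in self.readers:
            sample.update(reader(row))
        return sample

Note also that the printed sample counts are len(loader) * batch_size, i.e. batch count times batch size, which slightly overestimates the true dataset size whenever the last batch is partial.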
Example #8
    def prepare_loaders(
            *,
            mode: str,
            stage: str = None,
            n_workers: int = None,
            batch_size: int = None,
            datapath=None,
            in_csv=None,
            in_csv_train=None, in_csv_valid=None, in_csv_infer=None,
            train_folds=None, valid_folds=None,
            tag2class=None, class_column=None, tag_column=None,
            folds_seed=42, n_folds=5):
        loaders = collections.OrderedDict()

        df, df_train, df_valid, df_infer = parse_in_csvs(
            in_csv=in_csv,
            in_csv_train=in_csv_train, in_csv_valid=in_csv_valid,
            in_csv_infer=in_csv_infer,
            train_folds=train_folds, valid_folds=valid_folds,
            tag2class=tag2class,
            class_column=class_column, tag_column=tag_column,
            folds_seed=folds_seed, n_folds=n_folds)

        open_fn = [
            ImageReader(
                row_key="filepath", dict_key="image",
                datapath=datapath),
            ScalarReader(
                row_key="class", dict_key="targets",
                default_value=-1, dtype=np.int64)
        ]
        open_fn = ReaderCompose(readers=open_fn)

        if len(df_train) > 0:
            labels = [x["class"] for x in df_train]
            sampler = BalanceClassSampler(labels, mode="upsampling")

            train_loader = UtilsFactory.create_loader(
                data_source=df_train,
                open_fn=open_fn,
                dict_transform=DataSource.prepare_transforms(
                    mode="train", stage=stage),
                dataset_cache_prob=-1,
                batch_size=batch_size,
                workers=n_workers,
                shuffle=sampler is None,
                sampler=sampler)

            print("Train samples", len(train_loader) * batch_size)
            print("Train batches", len(train_loader))
            loaders["train"] = train_loader

        if len(df_valid) > 0:
            sampler = None

            valid_loader = UtilsFactory.create_loader(
                data_source=df_valid,
                open_fn=open_fn,
                dict_transform=DataSource.prepare_transforms(
                    mode="valid", stage=stage),
                dataset_cache_prob=-1,
                batch_size=batch_size,
                workers=n_workers,
                shuffle=False,
                sampler=sampler)

            print("Valid samples", len(valid_loader) * batch_size)
            print("Valid batches", len(valid_loader))
            loaders["valid"] = valid_loader

        if len(df_infer) > 0:
            infer_loader = UtilsFactory.create_loader(
                data_source=df_infer,
                open_fn=open_fn,
                dict_transform=DataSource.prepare_transforms(
                    mode="infer", stage=None),
                dataset_cache_prob=-1,
                batch_size=batch_size,
                workers=n_workers,
                shuffle=False,
                sampler=None)

            print("Infer samples", len(infer_loader) * batch_size)
            print("Infer batches", len(infer_loader))
            loaders["infer"] = infer_loader

        return loaders
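This example is nearly identical to Example #7; the substantive difference is the reader keyword names (row_key/dict_key here versus input_key/output_key above), presumably reflecting a different version of the same reader API.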
Example #9
    def prepare_loaders(
        *,
        mode,
        stage=None,
        n_workers=None,
        batch_size=None,
        train_folder=None,  # all train data, folder with files like 00fj49fd.jpg [.pth]
        train_csv=None,  # csv with whale ids
        train_ext=".jpg",  # replace extension of train files with train_ext if needed
        infer_folder=None,  # all test images, if None - don't create infer loader
        folds_seed=42, n_folds=5,
        train_folds=None, valid_folds=None,
    ):
        loaders = collections.OrderedDict()

        all_list, train_list, valid_list = parse_train_csv(
            train_csv,
            train_ext=train_ext,
            folds_seed=folds_seed,
            n_folds=n_folds,
            train_folds=train_folds,
            valid_folds=valid_folds)

        train_len = len(train_list)
        train_labels = [x["Id"] for x in train_list]
        train_idxs = list(range(train_len))

        valid_len = len(valid_list)
        valid_labels = [x["Id"] for x in valid_list]
        valid_idxs = list(range(train_len, train_len + valid_len))

        # train on train-train samples
        if train_len > 0:
            sampler = SiameseSampler(
                mode="train",
                train_idxs=train_idxs,
                train_labels=train_labels,
                size=train_len,
            )
            loader = UtilsFactory.create_loader(
                data_source=np.array(train_list),  # wrap in ndarray to enable indexing with list
                open_fn=SiameseDataSource._get_train_open_fn(train_folder),
                dict_transform=SiameseDataSource.prepare_transforms(
                    mode="train", stage=stage),
                dataset_cache_prob=-1,
                batch_size=batch_size,
                workers=n_workers,
                shuffle=False,
                sampler=sampler,
            )
            print("train samples:", len(loader) * batch_size)
            print("train batches:", len(loader))
            loaders["train"] = loader

        if len(valid_list) > 0:
            sampler = SiameseSampler(
                mode="valid",
                train_idxs=train_idxs,
                train_labels=train_labels,
                valid_idxs=valid_idxs,
                valid_labels=valid_labels,
                size=valid_len,
            )
            loader = UtilsFactory.create_loader(
                data_source=np.array(train_list + valid_list),  # wrap in ndarray to enable indexing with list
                open_fn=SiameseDataSource._get_train_open_fn(train_folder),
                dict_transform=SiameseDataSource.prepare_transforms(
                    mode="valid", stage=stage),
                dataset_cache_prob=-1,
                batch_size=batch_size,
                workers=n_workers,
                shuffle=False,
                sampler=sampler,
            )
            print("valid samples:", len(loader) * batch_size)
            print("valid batches:", len(loader))
            loaders["valid"] = loader

        if infer_folder is not None:
            infer_list = parse_infer_folder(infer_folder)
            all_labels = [x["Id"] for x in all_list]
            all_len = len(all_list)
            infer_len = len(infer_list)
            sampler = SiameseSampler(
                mode="infer",
                train_idxs=list(range(all_len)),
                infer_idxs=list(range(all_len, all_len + infer_len))
            )
            loader = UtilsFactory.create_loader(
                data_source=np.array(all_list + infer_list),
                open_fn=SiameseDataSource._get_infer_open_fn(train_folder, infer_folder),
                dict_transform=SiameseDataSource.prepare_transforms(
                    mode="infer", stage=stage),
                dataset_cache_prob=-1,
                batch_size=batch_size,
                workers=n_workers,
                shuffle=False,
                sampler=sampler,
            )
            print("infer samples:", len(loader) * batch_size)
            print("infer batches:", len(loader))
            loaders["infer"] = loader

        return loaders
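For context, a hypothetical invocation, assuming the function is exposed as a static method of SiameseDataSource (all argument values below are made up for illustration):

loaders = SiameseDataSource.prepare_loaders(
    mode="train",
    stage="stage1",
    n_workers=4,
    batch_size=64,
    train_folder="data/train",      # assumed layout
    train_csv="data/train.csv",     # assumed path
    infer_folder="data/test",
    train_folds=[0, 1, 2, 3],
    valid_folds=[4],
)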