Example #1
    def _get_experiment_components(
        self,
        stage: str = None
    ) -> Tuple[_Model, _Criterion, _Optimizer, _Scheduler, torch.device]:
        """
        Inner method for children's classes for model specific initialization.
        As baseline, checks device support and puts model on it.
        :return:
        """

        utils.set_global_seed(self.experiment.initial_seed)
        model = self.experiment.get_model(stage)
        criterion, optimizer, scheduler = \
            self.experiment.get_experiment_components(model, stage)

        model, criterion, optimizer, scheduler, device = \
            utils.process_components(
                model=model,
                criterion=criterion,
                optimizer=optimizer,
                scheduler=scheduler,
                distributed_params=self.experiment.distributed_params
            )

        return model, criterion, optimizer, scheduler, device
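For reference, `utils.process_components` is the common thread through all of these examples: it takes a model (and optionally criterion, optimizer, scheduler, and distributed settings) and returns a five-tuple with the model placed on the detected device. A minimal sketch, assuming Catalyst's `utils` is importable as in the snippets here and using a toy `torch.nn.Linear` as a hypothetical stand-in for a real model:

import torch
from catalyst import utils

model = torch.nn.Linear(10, 2)  # hypothetical toy model

# Criterion, optimizer, and scheduler may be omitted; the five-tuple is
# still returned, with the model moved to the available device.
model, criterion, optimizer, scheduler, device = utils.process_components(
    model=model,
)
print(device)  # e.g. "cpu", or "cuda:0" when a GPU is available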
Example #2
def main(args, _=None):
    global IMG_SIZE

    IMG_SIZE = (args.img_size, args.img_size)

    model = ResnetEncoder(arch=args.arch, pooling=args.pooling)
    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    images_df = pd.read_csv(args.in_csv)
    images_df = images_df.reset_index().drop("index", axis=1)
    images_df = list(images_df.to_dict("index").values())

    open_fn = ImageReader(input_key=args.img_col,
                          output_key="image",
                          datapath=args.datapath)

    dataloader = utils.get_loader(images_df,
                                  open_fn,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers,
                                  dict_transform=dict_transformer)

    features = []
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for batch in dataloader:
            features_ = model(batch["image"].to(device))
            features_ = features_.cpu().detach().numpy()
            features.append(features_)

    features = np.concatenate(features, axis=0)
    np.save(args.out_npy, features)
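`dict_transformer` is referenced above but defined elsewhere in the script. A hypothetical stand-in, consistent with the consumer side (`model(batch["image"].to(device))` expects a float tensor), could look like this:

import cv2
import numpy as np
import torch

def dict_transformer(sample: dict) -> dict:
    # Hypothetical sketch only; the real transform lives in the script.
    # Resize to the global IMG_SIZE, then convert HWC uint8 -> CHW float.
    image = cv2.resize(sample["image"], IMG_SIZE)
    sample["image"] = torch.from_numpy(
        np.moveaxis(image, -1, 0).astype(np.float32)
    )
    return sample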
Example #3
    def predict_loader(
        self,
        *,
        loader: DataLoader,
        model: Model = None,
        resume: str = None,
        fp16: Union[Dict, bool] = None,
        initial_seed: int = 42,
    ) -> Generator:
        """
        Runs model inference on a PyTorch DataLoader and returns a
        Python generator with model predictions from `runner.predict_batch`.
        Cleans up the experiment info to avoid possible collisions.
        Sets `is_train_loader` and `is_valid_loader` to `False` while
        keeping `is_infer_loader` as True. Moves model to evaluation mode.

        Args:
            loader: loader to predict
            model: model to use for prediction
            resume: path to checkpoint to resume
            fp16 (Union[Dict, bool]): fp16 usage flag
            initial_seed: seed to use before prediction

        Yields:
            batches with model predictions
        """
        if isinstance(fp16, bool) and fp16:
            fp16 = {"opt_level": "O1"}

        if model is not None:
            self.model = model
        assert self.model is not None

        if resume is not None:
            checkpoint = utils.load_checkpoint(resume)
            utils.unpack_checkpoint(checkpoint, model=self.model)

        self.experiment = None
        utils.set_global_seed(initial_seed)
        (model, _, _, _, device) = utils.process_components(  # noqa: WPS122
            model=self.model,
            distributed_params=fp16,
            device=self.device,
        )
        self._prepare_inner_state(
            stage="infer",
            model=model,
            device=device,
            is_train_loader=False,
            is_valid_loader=False,
            is_infer_loader=True,
        )
        utils.maybe_recursive_call(self.model, "train", mode=False)

        utils.set_global_seed(initial_seed)
        for batch in loader:
            yield self.predict_batch(batch)
Example #4
    def predict_loader(
        self,
        *,
        loader: DataLoader,
        model: Model = None,
        resume: str = None,
        fp16: Union[Dict, bool] = None,
        initial_seed: int = 42,
    ) -> Generator:
        """
        Runs model inference on a PyTorch DataLoader and returns a
        Python generator with model predictions from `runner.predict_batch`.

        Args:
            loader (DataLoader): loader to predict
            model (Model): model to use for prediction
            resume (str): path to checkpoint to resume
            fp16 (Union[Dict, bool]): fp16 usage flag
            initial_seed (int): seed to use before prediction

        Yields:
            batches with model predictions
        """
        if isinstance(fp16, bool) and fp16:
            fp16 = {"opt_level": "O1"}

        if model is not None:
            self.model = model
        assert self.model is not None

        if resume is not None:
            checkpoint = utils.load_checkpoint(resume)
            utils.unpack_checkpoint(checkpoint, model=self.model)

        (  # noqa: WPS122
            self.model,
            _,
            _,
            _,
            self.device,
        ) = utils.process_components(
            model=self.model,
            distributed_params=fp16,
            device=self.device,
        )

        utils.set_global_seed(initial_seed)
        for batch in loader:
            yield self.predict_batch(batch)
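A hypothetical caller for either `predict_loader` variant; `runner`, `model`, and `loader` are assumed to be created elsewhere (e.g. a Catalyst `SupervisedRunner` and a `torch.utils.data.DataLoader` over the inference dataset):

# Hypothetical usage sketch; each yielded item is the result of
# `runner.predict_batch` for one batch, produced lazily.
predictions = runner.predict_loader(
    loader=loader,
    model=model,
    resume="logs/checkpoints/best.pth",  # illustrative checkpoint path
    initial_seed=42,
)
for prediction in predictions:
    ...  # post-process each batch of predictions here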
Example #5
def main(args, _=None):
    """Run the ``catalyst-data image2embeddings`` script."""
    global IMG_SIZE

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    IMG_SIZE = (args.img_size, args.img_size)  # noqa: WPS442

    if args.traced_model is not None:
        device = utils.get_device()
        model = torch.jit.load(str(args.traced_model), map_location=device)
    else:
        model = ResnetEncoder(arch=args.arch, pooling=args.pooling)
        model = model.eval()
        model, _, _, _, device = utils.process_components(model=model)

    df = pd.read_csv(args.in_csv)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())

    open_fn = ImageReader(input_key=args.img_col,
                          output_key="image",
                          rootpath=args.rootpath)

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dict_transform=dict_transformer,
    )

    features = []
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for batch in dataloader:
            batch_features = model(batch["image"].to(device))
            batch_features = batch_features.cpu().detach().numpy()
            features.append(batch_features)

    features = np.concatenate(features, axis=0)
    np.save(args.out_npy, features)
Example #6
def main():
    args = get_args()
    image_paths = sorted(args.input_path.glob("*.jpg"))

    config = py2cfg(args.config_path)
    args.output_path.mkdir(exist_ok=True, parents=True)

    if args.visualize:
        vis_output_path = Path(str(args.output_path) + "_vis")
        vis_output_path.mkdir(exist_ok=True, parents=True)

    test_aug = config.test_augmentations

    model = config.model

    checkpoint = load_checkpoint(args.checkpoint, {
        "model.0.": "",
        "model.": ""
    })
    utils.unpack_checkpoint(checkpoint, model=model)

    model = nn.Sequential(model, ApplySoftmaxToLogits())

    model, _, _, _, device = utils.process_components(model=model)

    if args.tta == "lr":
        model = TTAWrapper(model, fliplr_image2mask)
    elif args.tta == "d4":
        model = TTAWrapper(model, d4_image2mask)

    runner = SupervisedRunner(model=model, device=device)

    with torch.no_grad():
        test_loader = DataLoader(
            TestSegmentationDataset(image_paths,
                                    test_aug,
                                    factor=config.pad_factor,
                                    imread_lib=config.imread_library),
            batch_size=args.batch_size,
            num_workers=args.num_workers,
            pin_memory=True,
            drop_last=False,
        )

        for input_images in tqdm(test_loader):
            raw_predictions = runner.predict_batch(
                {"features": input_images["features"].cuda()})["logits"]

            image_height, image_width = input_images["features"].shape[2:]

            pads = input_images["pads"].cpu().numpy()

            image_ids = input_images["image_id"]

            _, predictions = raw_predictions.max(1)

            for i in range(raw_predictions.shape[0]):
                unpadded_mask = predictions[i].cpu().numpy()

                if unpadded_mask.shape != (image_height, image_width):
                    unpadded_mask = cv2.resize(unpadded_mask,
                                               (image_width, image_height),
                                               interpolation=cv2.INTER_NEAREST)

                mask = unpad(unpadded_mask, pads[i]).astype(np.uint8)

                mask_name = image_ids[i] + ".png"
                cv2.imwrite(str(args.output_path / mask_name), mask)
                if args.visualize:
                    factor = 255 // config.num_classes
                    cv2.imwrite(str(vis_output_path / mask_name),
                                mask * factor)
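The `unpad` helper above is imported from elsewhere; its implementation is not shown. A hypothetical version consistent with the call site, assuming `pads` holds per-side pixel counts in (top, bottom, left, right) order (an unverified assumption):

import numpy as np

def unpad(mask: np.ndarray, pads: np.ndarray) -> np.ndarray:
    # Hypothetical sketch; the (top, bottom, left, right) layout of
    # `pads` is assumed, not confirmed by this snippet.
    top, bottom, left, right = pads
    height, width = mask.shape[:2]
    return mask[top:height - bottom, left:width - right]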
Example #7
def main(args, _=None):
    """Run the ``catalyst-data text2embeddings`` script."""
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    if hasattr(args, "in_huggingface"):
        model_config = BertConfig.from_pretrained(args.in_huggingface)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel.from_pretrained(args.in_huggingface,
                                          config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_huggingface)
    else:
        model_config = BertConfig.from_pretrained(args.in_config)
        model_config.output_hidden_states = args.output_hidden_states
        model = BertModel(config=model_config)
        tokenizer = BertTokenizer.from_pretrained(args.in_vocab)
    if hasattr(args, "in_model"):
        checkpoint = utils.load_checkpoint(args.in_model)
        checkpoint = {"model_state_dict": checkpoint}
        utils.unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=partial(
            tokenize_text,
            strip=args.strip,
            lowercase=args.lowercase,
            remove_punctuation=args.remove_punctuation,
        ),
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    features = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            bert_output = model(**batch)
            mask = (batch["attention_mask"].unsqueeze(-1)
                    if args.mask_for_max_length else None)

            if utils.check_ddp_wrapped(model):
                # using several GPUs
                hidden_size = model.module.config.hidden_size
                hidden_states = model.module.config.output_hidden_states
            else:
                # using CPU or a single GPU
                hidden_size = model.config.hidden_size
                hidden_states = model.config.output_hidden_states

            features_ = process_bert_output(
                bert_output=bert_output,
                hidden_size=hidden_size,
                output_hidden_states=hidden_states,
                pooling_groups=pooling_groups,
                mask=mask,
            )

            # create storage based on network output
            if idx == 0:
                for key, value in features_.items():
                    name_ = key if isinstance(key, str) else f"{key:02d}"
                    _, embedding_size = value.shape
                    features[name_] = np.memmap(
                        f"{args.out_prefix}.{name_}.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            indices = np.arange(idx * batch_size,
                                min((idx + 1) * batch_size, num_samples))
            for key, value in features_.items():
                name_ = key if isinstance(key, str) else f"{key:02d}"
                features[name_][indices] = _detach(value)
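Both this script and the next write batches into `np.memmap` arrays through a `_detach` helper that is not shown. Given that pattern (and the explicit `.cpu().detach().numpy()` chain in Example #2), a plausible, hypothetical definition is:

def _detach(tensor):
    # Hypothetical helper: detach from the autograd graph, move to CPU,
    # and convert to NumPy so the result can be assigned into a memmap.
    return tensor.cpu().detach().numpy()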
Example #8
def main(args, _=None):
    batch_size = args.batch_size
    num_workers = args.num_workers
    max_length = args.max_length
    pooling_groups = args.pooling.split(",")

    utils.set_global_seed(args.seed)
    utils.prepare_cudnn(args.deterministic, args.benchmark)

    model_config = BertConfig.from_pretrained(args.in_config)
    model_config.output_hidden_states = args.output_hidden_states
    model = BertModel(config=model_config)

    checkpoint = utils.load_checkpoint(args.in_model)
    checkpoint = {"model_state_dict": checkpoint}
    utils.unpack_checkpoint(checkpoint=checkpoint, model=model)

    model = model.eval()
    model, _, _, _, device = utils.process_components(model=model)

    tokenizer = BertTokenizer.from_pretrained(args.in_vocab)

    df = pd.read_csv(args.in_csv)
    df = df.dropna(subset=[args.txt_col])
    df.to_csv(f"{args.out_prefix}.df.csv", index=False)
    df = df.reset_index().drop("index", axis=1)
    df = list(df.to_dict("index").values())
    num_samples = len(df)

    open_fn = LambdaReader(
        input_key=args.txt_col,
        output_key=None,
        lambda_fn=get_features,
        tokenizer=tokenizer,
        max_length=max_length,
    )

    dataloader = utils.get_loader(
        df,
        open_fn,
        batch_size=batch_size,
        num_workers=num_workers,
    )

    features = {}
    poolings = {}
    dataloader = tqdm(dataloader) if args.verbose else dataloader
    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            batch = utils.any2device(batch, device)
            features_ = model(**batch)

            # create storage based on network output
            if idx == 0:
                # class
                _, embedding_size = features_[1].shape
                features["class"] = np.memmap(
                    f"{args.out_prefix}.class.npy",
                    dtype=np.float32,
                    mode="w+",
                    shape=(num_samples, embedding_size),
                )
                if args.output_hidden_states:
                    # all embeddings
                    for i, feature_ in enumerate(features_[2]):
                        name_ = f"embeddings_{i + 1:02d}"
                        _, _, embedding_size = feature_.shape
                        poolings[name_] = LamaPooling(
                            features_in=embedding_size,
                            groups=pooling_groups,
                        )
                        features[name_] = np.memmap(
                            f"{args.out_prefix}.{name_}.npy",
                            dtype=np.float32,
                            mode="w+",
                            shape=(num_samples, embedding_size),
                        )
                else:
                    # last
                    _, _, embedding_size = features_[0].shape
                    poolings["last"] = LamaPooling(
                        features_in=embedding_size,
                        groups=pooling_groups,
                    )
                    features["last"] = np.memmap(
                        f"{args.out_prefix}.last.npy",
                        dtype=np.float32,
                        mode="w+",
                        shape=(num_samples, embedding_size),
                    )

            indices = np.arange(idx * batch_size,
                                min((idx + 1) * batch_size, num_samples))
            features["class"][indices] = _detach(features_[1])
            if args.output_hidden_states:
                # all embeddings
                for i, feature_ in enumerate(features_[2]):
                    name_ = f"embeddings_{i + 1:02d}"
                    feature_ = poolings[name_](feature_)
                    features[name_][indices] = _detach(feature_)
            else:
                feature_ = poolings["last"](features_[0])
                features["last"][indices] = _detach(feature_)