Example #1
def my_app(cfg: Config) -> None:
    print(OmegaConf.to_yaml(cfg))
    working_dir = get_original_cwd()
    print(f"Orig working directory    : {working_dir}")
    print(f"Current working directory : {os.getcwd()}")
    if EProgramMode.PrepareData in cfg.mode:
        logger.info("STAGE: PrepareData")
        prepare_data(
            cfg.prepare_data,
            to_absolute_path(cfg.train_dataset),
            to_absolute_path(cfg.test_dataset),
            to_absolute_path(cfg.vocab_path),
        )
    if EProgramMode.Train in cfg.mode:
        logger.info("STAGE: Train")
        main_train_model(cfg)
    if EProgramMode.Predict in cfg.mode:
        logger.info("STAGE: Predict")
        predict_model(cfg)
    if EProgramMode.Visualize in cfg.mode:
        run_visualization(
            to_absolute_path(cfg.prepare_data.train_file),
            cfg.train.report_path,
            cfg.train.dump_model,
        )
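All of these snippets run under Hydra, which changes the working directory to a per-run output directory at startup; to_absolute_path resolves config-relative paths against the directory the app was originally launched from. A minimal sketch of the entry-point wiring the examples assume (config_path, config_name, and the config field are illustrative):

import hydra
from hydra.utils import to_absolute_path
from omegaconf import DictConfig

@hydra.main(config_path="conf", config_name="config")
def my_app(cfg: DictConfig) -> None:
    # cwd is now the Hydra run directory (e.g. outputs/<date>/<time>),
    # so resolve config-relative paths against the launch directory
    train_dataset = to_absolute_path(cfg.train_dataset)

if __name__ == "__main__":
    my_app()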
Example #2
def prepare_data(args: PrepareDataArgs, train_output, test_output, vocab_output):
    """Runs data processing scripts to turn raw data from (../raw) into
    cleaned data ready to be analyzed (saved in ../processed).
    """
    logger.info("Processing raw data to final data set")
    train_df = load_raw_csv(to_absolute_path(args.train_file))
    test_df = load_raw_csv(to_absolute_path(args.test_file))

    label_encoder = LabelEncoder()
    label_encoder.fit(train_df.label.values)

    tokenizer = get_tokenizer(args.tokenizer_name)
    vocab = build_vocab(
        train_df.text.values,
        tokenizer,
        args.pretrained_vectors,
        to_absolute_path(args.vectors_cache_directory),
    )
    logger.info(f"Save vocab into {vocab_output}")
    torch.save(vocab, vocab_output)

    train_ds = make_dataset(
        train_df, label_encoder, transforms=None, tokenizer=tokenizer, vocab=vocab
    )
    save_dataset(train_ds, train_output)

    test_ds = make_dataset(
        test_df, label_encoder, transforms=None, tokenizer=tokenizer, vocab=vocab
    )
    save_dataset(test_ds, test_output)
Example #3
    def setup(self, stage=None):
        super().setup(stage=stage)
        cfg = self.cfg

        if HydraConfig.initialized():
            # Current directory changed!
            # Update paths with original_cwd
            csv_path = to_absolute_path(cfg.data.csv_path)
            root_dir = to_absolute_path(cfg.data.root_dir)
        else:
            # Keep relative paths
            csv_path = cfg.data.csv_path
            root_dir = cfg.data.root_dir

        df = pd.read_csv(csv_path)
        df[["x", "y", "x1", "y1"]] = pd.DataFrame(
            np.stack(df["box"].apply(ast.literal_eval)).astype(np.float32)
        )
        train_df = df.loc[df["fold"] != cfg.data.valid_fold].copy()
        valid_df = df.loc[df["fold"] == cfg.data.valid_fold].copy()
        self.train_dataset = PennFudanDataset(
            train_df,
            root_dir=root_dir,
            transforms=self.train_transforms,
            mode="train",
        )
        self.val_dataset = PennFudanDataset(
            valid_df, root_dir=root_dir, transforms=self.val_transforms, mode="val"
        )
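Example #3 guards the conversion with HydraConfig.initialized(), so the same setup() works both under a Hydra run, where the working directory has been changed, and when the DataModule is constructed directly (e.g. in tests), where relative paths should be left untouched.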
Example #4
def main(cfg: DictConfig) -> None:
    # set up mlflow experiment id
    mlflow.set_tracking_uri(f"file://{to_absolute_path(cfg.path_to_mlflow)}")
    experiment = mlflow.get_experiment_by_name(cfg.experiment_name)
    if experiment is not None: # fetch existing experiment id
        run_kwargs = {'experiment_id': experiment.experiment_id} 
    else: # create new experiment
        experiment_id = mlflow.create_experiment(cfg.experiment_name)
        run_kwargs = {'experiment_id': experiment_id} 
    
    # run the training with mlflow tracking
    with mlflow.start_run(**run_kwargs) as active_run:
        setup_gpu(cfg.gpu_cfg)
        training_cfg = OmegaConf.to_object(cfg.training_cfg) # convert to python dictionary
        scaling_cfg = to_absolute_path(cfg.scaling_cfg)
        dataloader = DataLoaderReco.DataLoader(training_cfg, scaling_cfg)

        dl_config = dataloader.config
        model = MyGNN(dl_config)
        input_shape, _ = dataloader.get_shape()
        # print(input_shape[0])
        # compile_build = tf.ones(input_shape[0], dtype=tf.float32, name=None)
        model.build(list(input_shape[0]))
        compile_model(model, dl_config["SetupNN"]["mode"], dl_config["SetupNN"]["learning_rate"])
        fit_hist = run_training(model, dataloader, False, cfg.log_suffix)

        mlflow.log_dict(training_cfg, 'input_cfg/training_cfg.yaml')
        mlflow.log_artifact(scaling_cfg, 'input_cfg')
        mlflow.log_artifact(to_absolute_path("Training_SNNv0.py"), 'input_cfg')
        mlflow.log_artifact(to_absolute_path("../commonReco.py"), 'input_cfg')
        mlflow.log_artifacts('.hydra', 'input_cfg/hydra')
        mlflow.log_artifact('Training_SNNv0.log', 'input_cfg/hydra')
        mlflow.log_param('run_id', active_run.info.run_id)
        print(f'\nTraining has finished! Corresponding MLflow experiment name (ID): {cfg.experiment_name}({run_kwargs["experiment_id"]}), and run ID: {active_run.info.run_id}\n')
Example #5
def get_data_loaders(config):
    data_loaders = {}
    for phase in ["train_no_dev", "dev"]:
        in_dir = to_absolute_path(config.data[phase].in_dir)
        out_dir = to_absolute_path(config.data[phase].out_dir)
        train = phase.startswith("train")
        in_feats = FileSourceDataset(NpyFileSource(in_dir))
        out_feats = FileSourceDataset(NpyFileSource(out_dir))

        in_feats = MemoryCacheDataset(in_feats, cache_size=10000)
        out_feats = MemoryCacheDataset(out_feats, cache_size=10000)

        dataset = Dataset(in_feats, out_feats)
        data_loaders[phase] = data_utils.DataLoader(
            dataset,
            batch_size=config.data.batch_size,
            collate_fn=collate_fn,
            pin_memory=config.data.pin_memory,
            num_workers=config.data.num_workers,
            shuffle=train)

        for x, y, l in data_loaders[phase]:
            logger.info(f"{x.shape}, {y.shape}, {l.shape}")

    return data_loaders
Example #6
def preprocess_dataset(cfg):
    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(parents=True, exist_ok=True)

    executor = ProcessPoolExecutor(max_workers=cpu_count())
    print("Extracting features for train set")
    futures = []
    split_path = out_dir / "train"
    with open(split_path.with_suffix(".json")) as file:
        metadata = json.load(file)
        for in_path, out_path in metadata:
            wav_path = in_dir / in_path
            out_path = out_dir / out_path
            out_path.parent.mkdir(parents=True, exist_ok=True)
            futures.append(
                executor.submit(process_wav, wav_path, out_path,
                                cfg.preprocess))

    results = [future.result() for future in tqdm(futures)]

    lengths = {result[0].stem: result[1] for result in results}
    with open(out_dir / "lengths.json", "w") as file:
        json.dump(lengths, file, indent=4)

    frames = sum(lengths.values())
    frame_shift_ms = cfg.preprocess.hop_length / cfg.preprocess.sr
    hours = frames * frame_shift_ms / 3600
    print(
        f"Wrote {len(lengths)} utterances, {frames} frames ({hours:.2f} hours)"
    )
Example #7
def my_app(config: DictConfig) -> None:
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    utt_list = to_absolute_path(config.utt_list)
    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)

    utt_ids = load_utt_list(utt_list)

    stream_sizes, has_dynamic_features = get_world_stream_info(
        config.acoustic.sample_rate,
        config.acoustic.mgc_order,
        config.acoustic.num_windows,
        config.acoustic.vibrato_mode,
    )

    os.makedirs(out_dir, exist_ok=True)
    with ProcessPoolExecutor(max_workers=config.max_workers) as executor:
        futures = [
            executor.submit(
                _extract_static_features,
                in_dir,
                out_dir,
                utt_id,
                config.acoustic.num_windows,
                stream_sizes,
                has_dynamic_features,
            ) for utt_id in utt_ids
        ]
        for future in tqdm(futures):
            future.result()
Example #8
    def from_hydra(cfg):
        """Factory method.

        Instantiates :class:`TetrisScheduling` from a hydra configuration object.

        Args:
            cfg: a hydra configuration object
        """
        # Set the platform
        platform = hydra.utils.instantiate(cfg["platform"])

        # Read applications and mappings
        base_apps_dir = to_absolute_path(cfg["tetris_apps_dir"])
        apps = read_applications(base_apps_dir, platform)

        # Read jobs file
        reqs = read_requests(to_absolute_path(cfg["input_jobs"]), apps)

        # Initialize tetris scheduler
        scheduler = hydra.utils.instantiate(cfg["resource_manager"], platform)

        manager = ResourceManager(platform, scheduler)

        tracer = TracePlayer(manager, reqs)

        management = TetrisManagement(manager, tracer, reqs)
        return management
Example #9
def my_app(_cfg):
    print("Current working directory  : {}".format(os.getcwd()))
    print("Original working directory : {}".format(utils.get_original_cwd()))
    print("to_absolute_path('foo')    : {}".format(
        utils.to_absolute_path("foo")))
    print("to_absolute_path('/foo')   : {}".format(
        utils.to_absolute_path("/foo")))
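Example #9 shows the resolution rules directly: relative paths are joined onto the original working directory, while absolute paths are returned unchanged. Assuming the app was launched from /home/user/app, the output would look roughly like this (paths are illustrative):

Current working directory  : /home/user/app/outputs/2024-01-01/12-00-00
Original working directory : /home/user/app
to_absolute_path('foo')    : /home/user/app/foo
to_absolute_path('/foo')   : /foo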
Example #10
    def __init__(
        self,
        train_path: str,
        train_batch_size: int,
        train_num_workers: int,
        val_path: str,
        val_batch_size: int,
        val_num_workers: int,
        test_path: str,
        test_batch_size: int,
        test_num_workers: int,
        word_vocab_file: str,
    ) -> None:
        super().__init__()
        self.train_path = to_absolute_path(train_path)
        self.train_batch_size = train_batch_size
        self.train_num_workers = train_num_workers
        self.val_path = to_absolute_path(val_path)
        self.val_batch_size = val_batch_size
        self.val_num_workers = val_num_workers
        self.test_path = to_absolute_path(test_path)
        self.test_batch_size = test_batch_size
        self.test_num_workers = test_num_workers

        with open(to_absolute_path(word_vocab_file), "r") as f:
            word_vocab = [word.strip() for word in f.readlines()]
        self.preprocessor = SpacyPreprocessor(word_vocab)
Example #11
def my_app(cfg: DictConfig) -> None:
    batch_size = 400
    train_data = get_mp4_data(to_absolute_path(cfg.dataset.train_mp4))
    train_labels = get_txt_data(to_absolute_path(cfg.dataset.train_txt))
    train_data = train_data[:batch_size]
    train_labels = train_labels[:batch_size]
    train(train_data, train_labels, batch_size)
Example #12
def preprocess_dataset(cfg):
    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path("datasets")) / str(
        cfg.dataset.dataset)
    out_dir.mkdir(parents=True, exist_ok=True)

    executor = ProcessPoolExecutor(max_workers=cpu_count())
    for split in ["train", "test"]:
        print("Extracting features for {} set".format(split))
        futures = []
        split_path = out_dir / cfg.dataset.language / split
        with open(split_path.with_suffix(".json")) as file:
            metadata = json.load(file)
            for in_path, start, duration, out_path in metadata:
                wav_path = in_dir / in_path
                out_path = out_dir / out_path
                out_path.parent.mkdir(parents=True, exist_ok=True)
                futures.append(
                    executor.submit(
                        partial(process_wav,
                                wav_path,
                                out_path,
                                **cfg.preprocessing,
                                offset=start,
                                duration=duration)))

        results = [future.result() for future in tqdm(futures)]

        lengths = [x[-1] for x in results]
        frames = sum(lengths)
        frame_shift_ms = cfg.preprocessing.hop_length / cfg.preprocessing.sr
        hours = frames * frame_shift_ms / 3600
        print("Wrote {} utterances, {} frames ({:.2f} hours)".format(
            len(lengths), frames, hours))
Example #13
def generate_report(config: ReportConfig):
    out_path = to_absolute_path(config.output_path)
    in_path = to_absolute_path(config.input_path)
    data = pd.read_csv(in_path)
    profile = ProfileReport(data, title="Profiling Report", explorative=True)

    logger.info("Save report to %s", out_path)
    check_dir(os.path.split(out_path)[0])
    profile.to_file(str(out_path))
Example #14
def main(cfg):
    if cfg.wandb.project:
        import wandb
        from wandb.keras import WandbCallback
        wandb.init(project=cfg.wandb.project)
        callbacks = [WandbCallback()]
    else:
        callbacks = []

    csv_path = Path(to_absolute_path(__file__)).parent.joinpath(
        "meta", f"{cfg.data.db}.csv")
    df = pd.read_csv(str(csv_path))
    train, val = train_test_split(df, random_state=42, test_size=0.1)
    train_gen = ImageSequence(cfg, train, "train")
    val_gen = ImageSequence(cfg, val, "val")

    strategy = tf.distribute.MirroredStrategy()

    with strategy.scope():
        model = get_model(cfg)
        opt = get_optimizer(cfg)
        scheduler = get_scheduler(cfg)
        model.compile(optimizer=opt,
                      loss=[
                          "sparse_categorical_crossentropy",
                          "sparse_categorical_crossentropy"
                      ],
                      metrics=['accuracy'])

    dir_parent_checkpoint = "/content/drive/My Drive/deep_learning/age-gender-estimation"
    if os.path.exists(dir_parent_checkpoint):
        checkpoint_dir = Path(dir_parent_checkpoint).joinpath("checkpoint")
    else:
        checkpoint_dir = Path(
            to_absolute_path(__file__)).parent.joinpath("checkpoint")

    checkpoint_dir.mkdir(exist_ok=True)
    print(f"checkpoint_dir: {checkpoint_dir}")

    filename = "_".join([
        cfg.model.model_name,
        str(cfg.model.img_size), "weights.{epoch:02d}-{val_loss:.2f}.hdf5"
    ])
    callbacks.extend([
        LearningRateScheduler(schedule=scheduler),
        ModelCheckpoint(str(checkpoint_dir) + "/" + filename,
                        monitor="val_loss",
                        verbose=1,
                        save_best_only=True,
                        mode="auto")
    ])

    model.fit(train_gen,
              epochs=cfg.train.epochs,
              callbacks=callbacks,
              validation_data=val_gen,
              workers=multiprocessing.cpu_count())
Example #15
def my_app(config: Configuration) -> None:

    # create the model
    model = nn.Sequential(
        K.contrib.VisionTransformer(image_size=32, patch_size=16, embed_dim=128, num_heads=3),
        K.contrib.ClassificationHead(embed_size=128, num_classes=10),
    )

    # create the dataset
    train_dataset = torchvision.datasets.CIFAR10(
        root=to_absolute_path(config.data_path), train=True, download=True, transform=T.ToTensor())

    valid_dataset = torchvision.datasets.CIFAR10(
        root=to_absolute_path(config.data_path), train=False, download=True, transform=T.ToTensor())

    # create the dataloaders
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=8, pin_memory=True)

    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.batch_size, shuffle=True, num_workers=8, pin_memory=True)

    # create the loss function
    criterion = nn.CrossEntropyLoss()

    # instantiate the optimizer and scheduler
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, config.num_epochs * len(train_dataloader))

    # define some augmentations
    _augmentations = nn.Sequential(
        K.augmentation.RandomHorizontalFlip(p=0.75),
        K.augmentation.RandomVerticalFlip(p=0.75),
        K.augmentation.RandomAffine(degrees=10.),
        K.augmentation.PatchSequential(
            K.augmentation.ColorJitter(0.1, 0.1, 0.1, 0.1, p=0.8),
            grid_size=(2, 2),  # cifar-10 is 32x32 and vit is patch 16
            patchwise_apply=False,
        ),
    )

    def augmentations(self, sample: dict) -> dict:
        out = _augmentations(sample["input"])
        return {"input": out, "target": sample["target"]}

    model_checkpoint = ModelCheckpoint(
        filepath="./outputs", monitor="top5",
    )

    trainer = ImageClassifierTrainer(
        model, train_dataloader, valid_dataloader, criterion, optimizer, scheduler, config,
        callbacks={
            "augmentations": augmentations, "on_checkpoint": model_checkpoint,
        }
    )
    trainer.fit()
Example #16
def segment(cfg):

    # Algorithm
    segment_func = getattr(wordseg_algorithms, cfg.wordseg_algorithm.function)

    # Directories
    phoneseg_dir = (
        Path(utils.to_absolute_path("exp"))/cfg.model/cfg.dataset/cfg.split/
        cfg.phoneseg_tag/"intervals"
        )

    # Read phone intervals
    phoneseg_interval_dict = {}
    print("Reading: {}".format(phoneseg_dir))
    phoneseg_interval_dict = get_intervals_from_dir(phoneseg_dir)
    utterances = phoneseg_interval_dict.keys()

    # Segmentation
    print("Segmenting:")
    prepared_text = []
    for utt_key in utterances:
        prepared_text.append(
            " ".join([i[2] + "_" for i in phoneseg_interval_dict[utt_key]])
            )
    kwargs = dict(cfg.wordseg_algorithm)
    kwargs.pop("function")
    word_segmentation = segment_func(prepared_text, **kwargs)
    # print(prepared_text[:10])
    wordseg_interval_dict = {}
    for i_utt, utt_key in tqdm(enumerate(utterances)):
        words_segmented = word_segmentation[i_utt].split(" ")
        word_start = 0
        word_label = ""
        i_word = 0
        wordseg_interval_dict[utt_key] = []
        for (phone_start,
                phone_end, phone_label) in phoneseg_interval_dict[utt_key]:
            word_label += phone_label + "_"
            if words_segmented[i_word] == word_label:
                wordseg_interval_dict[utt_key].append((
                    word_start, phone_end, word_label
                    ))
                word_label = ""
                word_start = phone_end
                i_word += 1

    # Write intervals
    output_dir = (
        Path(utils.to_absolute_path("exp"))/cfg.model/cfg.dataset/cfg.split/
        cfg.output_tag/"intervals"
        )
    output_dir.mkdir(exist_ok=True, parents=True)
    print("Writing to: {}".format(output_dir))
    for utt_key in tqdm(wordseg_interval_dict):
        with open((output_dir/utt_key).with_suffix(".txt"), "w") as f:
            for start, end, label in wordseg_interval_dict[utt_key]:
                f.write("{:d} {:d} {}\n".format(start, end, label))
Example #17
def get_training_dataset(cfg: DictConfig = None) -> Dict[str, Dataset]:
    """
    Get training and validation datasets.

    Parameters
    ----------
    cfg : DictConfig, optional
        Project configuration, by default None

    Returns
    -------
    Dict[str, Dataset]
        {"train": train_dataset, "valid": valid_dataset}
    """
    images_dir = to_absolute_path(cfg.data.images_folder_path)

    data = pd.read_csv(to_absolute_path(cfg.data.dataset_path))
    data["x1"] = data["x"] + data["w"]
    data["y1"] = data["y"] + data["h"]
    data["area"] = data["w"] * data["h"]

    train_ids, valid_ids = train_test_split(
        data["image_id"].unique(),
        test_size=cfg.data.validation_split,
        random_state=cfg.training.seed,
    )

    # for fast training
    if cfg.training.debug:
        train_ids = train_ids[:10]
        valid_ids = valid_ids[:10]

    train_df = data.loc[data["image_id"].isin(train_ids)]
    valid_df = data.loc[data["image_id"].isin(valid_ids)]

    train_augs_list = [
        load_obj(i["class_name"])(**i["params"])
        for i in cfg["augmentation"]["train"]["augs"]
    ]
    train_bbox_params = OmegaConf.to_container(
        (cfg["augmentation"]["train"]["bbox_params"])
    )
    train_augs = Compose(train_augs_list, bbox_params=train_bbox_params)

    valid_augs_list = [
        load_obj(i["class_name"])(**i["params"])
        for i in cfg["augmentation"]["valid"]["augs"]
    ]
    valid_bbox_params = OmegaConf.to_container(
        (cfg["augmentation"]["valid"]["bbox_params"])
    )
    valid_augs = Compose(valid_augs_list, bbox_params=valid_bbox_params)

    train_dataset = XrayDataset(train_df, "train", images_dir, cfg, train_augs)
    valid_dataset = XrayDataset(valid_df, "valid", images_dir, cfg, valid_augs)

    return {"train": train_dataset, "valid": valid_dataset}
Example #18
def convert(cfg):
    dataset_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    with open(dataset_path / "speakers.json") as file:
        speakers = sorted(json.load(file))

    synthesis_list_path = Path(utils.to_absolute_path(cfg.synthesis_list))
    with open(synthesis_list_path) as file:
        synthesis_list = json.load(file)

    in_dir = Path(utils.to_absolute_path(cfg.in_dir))
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    decoder = Decoder(**cfg.model.decoder)
    encoder.to(device)
    decoder.to(device)

    print("Load checkpoint from: {}".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])

    encoder.eval()
    decoder.eval()

    for wav_path, speaker_id, out_filename in tqdm(synthesis_list):
        wav_path = in_dir / wav_path
        wav, _ = librosa.load(
            wav_path.with_suffix(".wav"),
            sr=cfg.preprocessing.sr)
        wav = wav / np.abs(wav).max() * 0.999

        mel = librosa.feature.melspectrogram(
            preemphasis(wav, cfg.preprocessing.preemph),
            sr=cfg.preprocessing.sr,
            n_fft=cfg.preprocessing.n_fft,
            n_mels=cfg.preprocessing.n_mels,
            hop_length=cfg.preprocessing.hop_length,
            win_length=cfg.preprocessing.win_length,
            fmin=cfg.preprocessing.fmin,
            power=1)
        logmel = librosa.amplitude_to_db(mel, top_db=cfg.preprocessing.top_db)
        logmel = logmel / cfg.preprocessing.top_db + 1

        mel = torch.FloatTensor(logmel).unsqueeze(0).to(device)
        speaker = torch.LongTensor([speakers.index(speaker_id)]).to(device)
        with torch.no_grad():
            z, _ = encoder.encode(mel)
            output = decoder.generate(z, speaker)

        path = out_dir / out_filename
        librosa.output.write_wav(path.with_suffix(".wav"), output.astype(np.float32), sr=cfg.preprocessing.sr)
Example #19
def run(cfg: DictConfig) -> None:
    """
    Run model inference on the entire test dataset.

    Parameters
    ----------
    cfg : DictConfig
        Project configuration object
    """
    device = torch.device("cuda")
    model = torch.load(
        to_absolute_path(cfg.logging.best_model_path), map_location=device
    )
    model.eval()

    detection_threshold = 0.5
    results = []

    test_dataset = get_test_dataset(cfg)
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=cfg.data.batch_size,
        num_workers=cfg.data.num_workers,
        shuffle=False,
        collate_fn=collate_fn,
    )

    for images, _, image_ids in tqdm(test_loader):

        images = list(image.to(device) for image in images)
        outputs = model(images)

        for i, image in enumerate(images):

            boxes = outputs[i]["boxes"].data.cpu().numpy()
            scores = outputs[i]["scores"].data.cpu().numpy()
            labels = outputs[i]["labels"].data.cpu().numpy()

            boxes = boxes[scores >= detection_threshold].astype(np.int32)
            labels = labels[scores >= detection_threshold]
            scores = scores[scores >= detection_threshold]

            image_id = image_ids[i]

            result = {
                "image_id": image_id,
                "predictions": format_prediction_string(boxes, scores),
                "labels": ",".join([str(x) for x in labels]),
            }

            results.append(result)

    test_df = pd.DataFrame(
        results, columns=["image_id", "predictions", "labels"]
    )
    test_df.to_csv(to_absolute_path("predictions.csv"), index=False)
Example #20
def encode_dataset(cfg):
    out_dir = Path(utils.to_absolute_path(cfg.out_dir))
    out_dir.mkdir(exist_ok=True, parents=True)

    root_path = Path(utils.to_absolute_path("datasets")) / cfg.dataset.path
    with open(root_path / "test.json") as file:
        metadata = json.load(file)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(**cfg.model.encoder)
    encoder.to(device)

    print("Load checkpoint from: {}".format(cfg.checkpoint))
    checkpoint_path = utils.to_absolute_path(cfg.checkpoint)
    checkpoint = torch.load(checkpoint_path,
                            map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint["encoder"])

    encoder.eval()

    if cfg.save_auxiliary:
        auxiliary = []

        def hook(module, input, output):
            auxiliary.append(output.clone())

        encoder.encoder[-1].register_forward_hook(hook)

    for _, _, _, path in tqdm(metadata):
        path = root_path.parent / path
        mel = torch.from_numpy(np.load(
            path.with_suffix(".mel.npy"))).unsqueeze(0).to(device)
        with torch.no_grad():
            z, c, indices = encoder.encode(mel)

        z = z.squeeze().cpu().numpy()

        out_path = out_dir / path.stem
        with open(out_path.with_suffix(".txt"), "w") as file:
            np.savetxt(file, z, fmt="%.16f")

        if cfg.save_auxiliary:
            aux_path = out_dir.parent / "auxiliary_embedding1"
            aux_path.mkdir(exist_ok=True, parents=True)
            out_path = aux_path / path.stem
            c = c.squeeze().cpu().numpy()
            with open(out_path.with_suffix(".txt"), "w") as file:
                np.savetxt(file, c, fmt="%.16f")

            aux_path = out_dir.parent / "auxiliary_embedding2"
            aux_path.mkdir(exist_ok=True, parents=True)
            out_path = aux_path / path.stem
            aux = auxiliary.pop().squeeze().cpu().numpy()
            with open(out_path.with_suffix(".txt"), "w") as file:
                np.savetxt(file, aux, fmt="%.16f")
Example #21
def main_train_model(cfg: Config):
    ds = load_processed_dataset(to_absolute_path(cfg.train_dataset))
    vocab = torch.load(to_absolute_path(cfg.vocab_path))
    args = cfg.train
    train_dataset, val_dataset = ds.train_test_split(args.test_size)
    train_dataloader = make_text_dataloader(train_dataset, args.batch_size,
                                            vocab[PAD])
    val_dataloader = make_text_dataloader(val_dataset, args.batch_size,
                                          vocab[PAD])

    model = make_model(args.model, vocab)
    loss_fn = make_loss_fn(args.loss_fn)

    optimizer = make_optimizer(args.optimizer, model, args.learning_rate)
    device = get_device(args.gpu)

    score_functions = dict(
        accuracy=accuracy_from_predictions,
        recall=recall_from_predictions,
        precision=precision_from_predictions,
    )
    train_one_epoch = make_one_epoch_runner(
        args.model,
        model,
        loss_fn,
        train_dataloader,
        optimizer,
        device,
        is_train=True,
        score_functions=score_functions,
    )
    val_one_epoch = make_one_epoch_runner(
        args.model,
        model,
        loss_fn,
        val_dataloader,
        None,
        device,
        is_train=False,
        score_functions=score_functions,
    )

    lr_scheduler = make_lr_scheduler(optimizer, args.lr_scheduler)
    plot_fn = plot_train_val_loss if args.interactive else None
    results = train_cycle(
        model,
        train_one_epoch,
        val_one_epoch,
        args.epochs,
        ensure_path(args.dump_model),
        lr_scheduler,
        plot_fn,
    )
    save_train_report(results, ensure_path(args.report_path))
Example #22
    def __init__(self, labels: list, data_cfg: DataConfig, normalize: bool,
                 is_distributed: bool):
        super().__init__()
        self.train_path = to_absolute_path(data_cfg.train_path)
        self.val_path = to_absolute_path(data_cfg.val_path)
        self.labels = labels
        self.data_cfg = data_cfg
        self.spect_cfg = data_cfg.spect
        self.aug_cfg = data_cfg.augmentation
        self.normalize = normalize
        self.is_distributed = is_distributed
Example #23
def compare_input(cfg, print_grid=False):
    assert (cfg.compare_input)
    path_to_artifacts = to_absolute_path(
        f'{cfg.path_to_mlflow}/{cfg.experiment_id}/{cfg.run_id}/artifacts/')
    file_cmssw_path = to_absolute_path(
        f'{cfg.path_to_input_dir}/{cfg.compare_input.input_cmssw}')
    files_cmssw_python = f'{path_to_artifacts}/predictions/{cfg.sample_alias}/{cfg.input_filename}_pred_input'

    with open(file_cmssw_path) as file:
        json_input = json.load(file)
        data_cmssw = {}
        for tensor_name in json_input.keys():
            data_cmssw[tensor_name] = np.array(json_input[tensor_name])

    event = cfg.compare_input.input_python.event
    index = cfg.compare_input.input_python.tau_index
    data_python = np.load(f'{files_cmssw_python}/tensor_{event}_{index}.npy',
                          allow_pickle=True)[()]

    for key in data_cmssw.keys():
        print("Shape consistency check:", data_cmssw[key].shape,
              data_python[key].shape)
        assert (data_cmssw[key].shape == data_python[key].shape)

    map_f = FeatureDecoder(cfg)
    for key in list(json_input.keys()):
        print("\n--------->", key, "<---------")
        delta = np.abs(data_cmssw[key] - data_python[key])
        if key == list(json_input.keys())[0]:
            print("cmssw tau:", data_cmssw[key])
            print("python tau:", data_python[key])
            f_idx = np.where(delta > epsilon)
            print("Inconsistent features:\n")
            for f in np.unique(f_idx):
                print(map_f.get(key, f))
        else:
            row_idx, col_idx, f_idx = np.where(delta > epsilon)
            if row_idx.size != 0:
                print("cmssw grid:",
                      data_cmssw[key][row_idx[0]][col_idx[0]][f_idx])
                print("python grid:",
                      data_python[key][row_idx[0]][col_idx[0]][f_idx])
            print("Inconsistent features:\n")
            for f in np.unique(f_idx):
                print(map_f.get(key, f))

        # if print_grid and not the first tensor (plane features)
        if print_grid and key != list(json_input.keys())[0]:
            grid_idx = np.stack([row_idx, col_idx], axis=1).tolist()
            print("\nInconsistent cells:")
            print(
                grid(data_cmssw[key].shape[0], data_cmssw[key].shape[1],
                     grid_idx))
Example #24
def predict_model(cfg: Config):
    test_dataset = load_processed_dataset(to_absolute_path(cfg.test_dataset))
    vocab = torch.load(to_absolute_path(cfg.vocab_path))
    args = cfg.train

    device = get_device(args.gpu)
    model = load_model(args.model, vocab, args.dump_model)
    model.eval()
    model.to(device)

    predictions = make_predictions(model, test_dataset, device)
    save_predictions(predictions, cfg.predict.result_file)
Example #25
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
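    # Note: config.pretty() below is the OmegaConf 1.x API; on OmegaConf 2.x
    # the equivalent is OmegaConf.to_yaml(config), as in Examples #1 and #30.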
    logger.info(config.pretty())

    if use_cuda:
        from torch.backends import cudnn
        cudnn.benchmark = config.train.cudnn.benchmark
        cudnn.deterministic = config.train.cudnn.deterministic
        logger.info(f"cudnn.deterministic: {cudnn.deterministic}")
        logger.info(f"cudnn.benchmark: {cudnn.benchmark}")

    logger.info(f"Random seed: {config.seed}")
    init_seed(config.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    if config.train.use_detect_anomaly:
        torch.autograd.set_detect_anomaly(True)
        logger.info("Set to use torch.autograd.detect_anomaly")

    # Model
    model = hydra.utils.instantiate(config.model.netG).to(device)

    # Optimizer
    optimizer_class = getattr(optim, config.train.optim.optimizer.name)
    optimizer = optimizer_class(model.parameters(), **config.train.optim.optimizer.params)

    # Scheduler
    lr_scheduler_class = getattr(optim.lr_scheduler, config.train.optim.lr_scheduler.name)
    lr_scheduler = lr_scheduler_class(optimizer, **config.train.optim.lr_scheduler.params)

    data_loaders = get_data_loaders(config)

    # Resume
    if config.train.resume.checkpoint is not None and len(config.train.resume.checkpoint) > 0:
        logger.info("Load weights from {}".format(config.train.resume.checkpoint))
        checkpoint = torch.load(to_absolute_path(config.train.resume.checkpoint))
        model.load_state_dict(checkpoint["state_dict"])
        if config.train.resume.load_optimizer:
            logger.info("Load optimizer state")
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler_state"])

    # Save model definition
    out_dir = to_absolute_path(config.train.out_dir)
    os.makedirs(out_dir, exist_ok=True)
    with open(join(out_dir, "model.yaml"), "w") as f:
        OmegaConf.save(config.model, f)

    # Run training loop
    train_loop(config, device, model, optimizer, lr_scheduler, data_loaders)
Example #26
def main(cfg):
    if cfg.wandb.project:
        import wandb
        from wandb.keras import WandbCallback
        wandb.init(project=cfg.wandb.project)
        callbacks = [WandbCallback()]
    else:
        callbacks = []
    weight_file = cfg.train.weight_file

    csv_path = Path(to_absolute_path(__file__)).parent.joinpath("meta", f"{cfg.data.db}.csv")
    df = pd.read_csv(str(csv_path))
    train, val = train_test_split(df, random_state=42, test_size=0.2)
    train_gen = ImageSequence(cfg, train, "train")
    val_gen = ImageSequence(cfg, val, "val")

    strategy = tf.distribute.MirroredStrategy()
    initial_epoch = 0
    if weight_file:
        _, file_meta, *_ = weight_file.split('.')
        prev_epoch, new_epoch, _ = file_meta.split('-')
        initial_epoch = int(prev_epoch) + int(new_epoch)
    with strategy.scope():
        model = get_model(cfg)
        opt = get_optimizer(cfg)
        scheduler = get_scheduler(cfg, initial_epoch)
        model.compile(optimizer=opt,
                      loss=["sparse_categorical_crossentropy", "sparse_categorical_crossentropy"],
                      metrics=['accuracy'])
    if cfg.train.is_collab:
        checkpoint_dir = Path(to_absolute_path(__file__)).parent.parent.joinpath('drive', 'MyDrive', 'AgeGenderCheckpoint')
    else:
        checkpoint_dir = Path(to_absolute_path(__file__)).parent.joinpath('checkpoints')
    checkpoint_dir.mkdir(exist_ok=True)

    filename = "_".join([cfg.model.model_name,
                         str(cfg.model.img_size),
                         f"weights.{initial_epoch:02d}-" + "{epoch:02d}-{val_loss:.2f}.hdf5"])
    callbacks.extend([
        LearningRateScheduler(schedule=scheduler),
        get_logger(checkpoint_dir, initial_epoch, cfg.train.lr),
        ModelCheckpoint(str(checkpoint_dir) + "/" + filename,
                        monitor="val_loss",
                        verbose=1,
                        save_best_only=True,
                        mode="auto")
    ])

    if weight_file:
        model.load_weights(str(checkpoint_dir) + "/" + weight_file)
    model.fit(train_gen, epochs=cfg.train.epochs, callbacks=callbacks, validation_data=val_gen,
              workers=multiprocessing.cpu_count())
Example #27
def fit_scalenet(cfg):
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    writer = get_tensorboard_writer(cfg)

    # train and validation data
    train_data, eval_data = get_data_generators(cfg)

    # model
    model, optimizer = get_model(cfg)
    if cfg.load_from_checkpoint:
        load_dict = model.load(to_absolute_path(cfg.checkpoint_path))
        # optimizer.load_state_dict(load_dict['optimizer'])
        print(
            f'Loaded checkpoint from {to_absolute_path(cfg.checkpoint_path)}')
    model.to(device)
    model.summary()

    # fit
    train_loss = 0.
    train_accuracy = 0.
    for step in range(cfg.training_steps):
        x_train, y_train = get_data(train_data, device)
        pred = model(x_train)
        loss = binary_cross_entropy(pred, y_train)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item()
        train_accuracy += accuracy(pred, y_train).item()
        if step % cfg.log_every_steps == 0:
            log(writer=writer,
                step=step,
                loss=train_loss,
                accuracy=train_accuracy,
                averaging=cfg.log_every_steps if step != 0 else 1,
                mode='train')
            train_loss = 0.
            train_accuracy = 0.

        if step % cfg.eval_every_steps == 0:
            eval_scalenet(cfg=cfg,
                          step=step,
                          model=model,
                          eval_data=eval_data,
                          device=device,
                          writer=writer)

        if step % cfg.save_every_steps == 0:
            model.save(to_absolute_path(cfg.checkpoint_path), optimizer)
Example #28
def predict(predict_config):
    test_df = read_data(to_absolute_path(predict_config.test_data_path))
    test_df = test_df.drop(predict_config.feature_params.target_col, axis=1)

    model_path = to_absolute_path(predict_config.output_model_path)
    model = load_model(model_path)

    transformer = load_transformer(
        to_absolute_path(predict_config.feature_transformer_path))
    test_features = make_features(transformer, test_df)
    y_pred = pd.DataFrame(model.predict_proba(test_features)[:, 1],
                          columns=["target"])

    y_pred.to_csv(to_absolute_path(predict_config.predict_path), index=False)
Example #29
def CBIR_test(cfg):
    database = DataBase(cfg)
    # database.load_svs(path=to_absolute_path('/home/artem/data/PATH-DT-MSU-WSI/WSS1/04.svs'), dataset_name='PATH-DT',
    #                   scales=[5, 10, 15, 20, 25, 30, 35, 40])
    # database.extract_features('PATH-DT')
    # database.serialize(to_absolute_path('CBIR_serialized'))

    database.deserialize(to_absolute_path(cfg.features_serialization.path))
    query = np.array(Image.open(to_absolute_path(cfg.query)))
    search_result = database.search(query,
                                    top_n=10,
                                    detect_scale=True,
                                    log=True)
    print(search_result)
Example #30
def my_app(config: DictConfig) -> None:
    global logger
    logger = getLogger(config.verbose)
    logger.info(OmegaConf.to_yaml(config))

    in_dir = to_absolute_path(config.in_dir)
    out_dir = to_absolute_path(config.out_dir)
    scaler_path = to_absolute_path(config.scaler_path)
    scaler = joblib.load(scaler_path)
    inverse = config.inverse
    num_workers = config.num_workers

    os.makedirs(out_dir, exist_ok=True)
    apply_normalization_dir2dir(in_dir, out_dir, scaler, inverse, num_workers)