 def download_dataset_artifact(self, path, alias):
     if path.startswith(WANDB_ARTIFACT_PREFIX):
         dataset_artifact = wandb.use_artifact(remove_prefix(path, WANDB_ARTIFACT_PREFIX) + ":" + alias)
         assert dataset_artifact is not None, "Error: W&B dataset artifact doesn't exist"
         datadir = dataset_artifact.download()
         return datadir, dataset_artifact
     return None, None
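These snippets assume two module-level helpers from the surrounding file; a minimal sketch of what they usually look like (the prefix value follows the YOLOv5 convention and is an assumption here):

WANDB_ARTIFACT_PREFIX = 'wandb-artifact://'  # assumed value

def remove_prefix(from_string, prefix=WANDB_ARTIFACT_PREFIX):
    # str.removeprefix exists only on Python >= 3.9, hence the manual slice
    return from_string[len(prefix):]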
Example 2
def load_models_from_artifact(cfg,
                              workers,
                              stage,
                              version="latest",
                              filename=None,
                              project="maschm/master-fed"):
    if filename is None: filename = stage
    model_artifact = wandb.use_artifact(
        f"{project}/{stage}-{cfg['model_variant']}:{version}", type='model')
    artifact_path = Path(model_artifact.download())
    print(f'Model: Use artifact "{model_artifact.name}"')

    for worker in workers:
        # Resolve this worker's checkpoint inside the downloaded artifact directory
        p = (Path.cwd() / artifact_path /
             f"{worker.cfg['rank']}-v{worker.cfg['model_variant']}-m{worker.cfg['model_mapping']}-{stage}.pth")
        (worker.cfg['tmp'] / f"{filename}.pth").symlink_to(p)
        worker.model.load_state_dict(torch.load(p))

        # Link the matching optimizer state next to the model checkpoint
        p = (Path.cwd() / artifact_path /
             f"{worker.cfg['rank']}-v{worker.cfg['model_variant']}-m{worker.cfg['model_mapping']}-{stage}_optim.pth")
        (worker.cfg['tmp'] / f"{filename}_optim.pth").symlink_to(p)

    wandb.run.summary[f"{stage}/acc"] = model_artifact.metadata['acc']
    wandb.run.summary[f"{stage}/loss"] = model_artifact.metadata['loss']
    return model_artifact, model_artifact.metadata
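Note the direction of Path.symlink_to: the path it is called on becomes the link and the argument is the target, so the loop above drops a stable "{filename}.pth" link in each worker's tmp directory. A minimal sketch, with all paths hypothetical:

from pathlib import Path

target = Path("artifacts/pretrain-resnet:v3/0-vresnet-m0-pretrain.pth")  # hypothetical checkpoint
link = Path("/tmp/worker0/pretrain.pth")                                 # hypothetical link location
link.parent.mkdir(parents=True, exist_ok=True)
link.symlink_to(target)  # creates the link at `link`, pointing at `target`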
Example 3
 def download_dataset_artifact(self, path, alias):
     if isinstance(path, str) and path.startswith(WANDB_ARTIFACT_PREFIX):
         artifact_path = Path(remove_prefix(path, WANDB_ARTIFACT_PREFIX) + ":" + alias)
         dataset_artifact = wandb.use_artifact(artifact_path.as_posix())
         assert dataset_artifact is not None, "Error: W&B dataset artifact doesn't exist"
         datadir = dataset_artifact.download()
         return datadir, dataset_artifact
     return None, None
Example 4
 def download_dataset_artifact(self, path, alias):
     if path.startswith(WANDB_ARTIFACT_PREFIX):
         dataset_artifact = wandb.use_artifact(remove_prefix(path, WANDB_ARTIFACT_PREFIX) + ":" + alias)
         assert dataset_artifact is not None, "Error: W&B dataset artifact doesn't exist"
         datadir = dataset_artifact.download()
         labels_zip = Path(datadir) / "data/labels.zip"
         shutil.unpack_archive(labels_zip, Path(datadir) / 'data/labels', 'zip')
         print("Downloaded dataset to : ", datadir)
         return datadir, dataset_artifact
     return None, None
Example 5
 def download_model_artifact(self, opt):
     if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
         model_artifact = wandb.use_artifact(remove_prefix(opt.resume, WANDB_ARTIFACT_PREFIX) + ":latest")
         assert model_artifact is not None, 'Error: W&B model artifact doesn\'t exist'
         modeldir = model_artifact.download()
         epochs_trained = model_artifact.metadata.get('epochs_trained')
         total_epochs = model_artifact.metadata.get('total_epochs')
         assert epochs_trained < total_epochs, 'training to %g epochs is finished, nothing to resume.' % (
             total_epochs)
         return modeldir, model_artifact
     return None, None
Example 6
 def download_model_artifact(self, opt):
     if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
         model_artifact = wandb.use_artifact(remove_prefix(opt.resume, WANDB_ARTIFACT_PREFIX) + ":latest")
         assert model_artifact is not None, 'Error: W&B model artifact doesn\'t exist'
         modeldir = model_artifact.download()
         epochs_trained = model_artifact.metadata.get('epochs_trained')
         total_epochs = model_artifact.metadata.get('total_epochs')
         is_finished = total_epochs is None  # a missing total_epochs marks the run as finished
         assert not is_finished, 'training is finished, can only resume incomplete runs.'
         return modeldir, model_artifact
     return None, None
Example 7
    def load_state_dict(self):
        weight_artifact = wandb.use_artifact(
            self.config.pretrained_weight_path, type="pretrained_weight")
        weight_artifact_dir = weight_artifact.download()
        for agent_id, agent in enumerate(self.agents):
            new_state_dict = OrderedDict()
            state_dict = torch.load(weight_artifact_dir +
                                    f"/agent{agent_id}.pth")
            # Keep only the weights whose names exist in the target network
            for key, value in state_dict.items():
                if key in agent.brain.network.state_dict():
                    new_state_dict[key] = value

            agent.brain.network.load_state_dict(new_state_dict)
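When the checkpoint is a superset of the network's parameters (extra keys, nothing missing), strict=False achieves the same filtering; a sketch under that assumption:

import torch

def load_filtered(network, checkpoint_path):
    # strict=False ignores checkpoint entries the network does not define,
    # matching the manual key filtering above when no required key is absent
    state_dict = torch.load(checkpoint_path)
    network.load_state_dict(state_dict, strict=False)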
Example 8
 def download_artifact(
     self,
     art_name: str,
     art_type: str,
     art_alias: str,
     dst_folder: Path | None = None,
 ) -> Path:
     self._wandb_init_if_needed()
     artifact: wandb.Artifact = wandb.use_artifact(
         artifact_or_name=f"{art_name}:{art_alias}", type=art_type)
     if dst_folder is None:
         dst_folder = Path(tempfile.mkdtemp())
     with switched_aws_cfg(self._s3_credentials_file):
         art_path: str = artifact.download(root=str(dst_folder))
     return Path(art_path)
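A hypothetical call site; ArtifactStore stands in for whatever class actually owns download_artifact along with _wandb_init_if_needed and _s3_credentials_file:

store = ArtifactStore(s3_credentials_file=Path("~/.aws/credentials").expanduser())  # hypothetical class
weights_dir = store.download_artifact(
    art_name="resnet50-weights",  # assumed artifact name
    art_type="model",
    art_alias="latest",
)
print(sorted(weights_dir.iterdir()))  # downloaded files land under the returned Path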
Example 9
 def download_model_artifact(self, opt):
     """
     download the model checkpoint artifact if the resume path starts with WANDB_ARTIFACT_PREFIX
     
     arguments:
      opt (namespace) -- Command-line arguments for this run
     """
     if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
         model_artifact = wandb.use_artifact(remove_prefix(opt.resume, WANDB_ARTIFACT_PREFIX) + ":latest")
         assert model_artifact is not None, 'Error: W&B model artifact doesn\'t exist'
         modeldir = model_artifact.download()
         epochs_trained = model_artifact.metadata.get('epochs_trained')
         total_epochs = model_artifact.metadata.get('total_epochs')
         is_finished = total_epochs is None
         assert not is_finished, 'training is finished, can only resume incomplete runs.'
         return modeldir, model_artifact
     return None, None
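For these resume guards to work, the checkpoint artifact must have been logged with matching metadata; a sketch of the producing side, with every name assumed:

import wandb

run = wandb.init(project="my-project")  # assumed project
artifact = wandb.Artifact(
    f"run_{run.id}_model", type='model',
    metadata={'epochs_trained': 3, 'total_epochs': 300})  # keys read back on resume
artifact.add_file('weights/last.pt')  # assumed checkpoint path
run.log_artifact(artifact)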
Example 10
def load_idx_from_artifact(cfg, targets, test_targets):
    idx_artifact_name = get_idx_artifact_name(cfg)
    try:
        idx_artifact = wandb.use_artifact(f"{idx_artifact_name}:latest",
                                          type='private_indices')
        # artifact_dir = idx_artifact.download()
        idx_file = idx_artifact.get_path('idxs.npy').download()
        idxs = np.load(idx_file, allow_pickle=True)
        test_idx_file = idx_artifact.get_path('test_idxs.npy').download()
        test_idxs = np.load(test_idx_file, allow_pickle=True)
        print(f'Private Idx: Use "{idx_artifact_name}" artifact with saved private indices')
        
    except (wandb.CommError, AttributeError) as e:
        print(e)
        print(f'Private Idx: Create "{idx_artifact_name}" artifact with new random private indices')

        idxs, counts, dists = partition_data(
            targets, cfg['parties'], cfg['classes'],
            cfg['partition_normalize'], cfg['samples'],
            cfg['concentration'], cfg['min_per_class'],
            cfg['partition_overlap'])

        test_idxs = partition_by_dist(test_targets, cfg['classes'], dists)
        idx_artifact = save_idx_to_artifact(cfg, idxs, counts, test_idxs)

    try:
        idx_artifact.wait()  # raises an exception in offline mode
    except Exception:
        pass

    try:
        dists = idx_artifact.metadata['distributions']
        print("party distributions:\n", dists)
        total_party = idx_artifact.metadata['party_total']
        print("party total:\n", total_party)
        total_class = idx_artifact.metadata['class_total']
        print("class total:\n", total_class)
    except Exception:
        # metadata may be absent (e.g. for a freshly created or offline artifact)
        pass

    return idxs, test_idxs
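For reference, Artifact.get_path(name).download() as used above fetches a single entry, while Artifact.download() materializes the whole artifact; a minimal sketch with assumed names:

import wandb

run = wandb.init(project="my-project")  # assumed project
idx_artifact = run.use_artifact("private_indices:latest")  # assumed artifact name
idx_file = idx_artifact.get_path("idxs.npy").download()  # downloads just this file
artifact_dir = idx_artifact.download()                   # downloads everything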
Example 11
 def download_dataset_artifact(self, path, alias):
     """
      download the dataset artifact if the path starts with WANDB_ARTIFACT_PREFIX

      arguments:
      path -- path of the dataset to be used for training
      alias (str) -- alias of the artifact to be downloaded/used for training

      returns:
      (str, wandb.Artifact) -- path of the downloaded dataset and its corresponding artifact object if the dataset
      is found, otherwise returns (None, None)
     """
     if isinstance(path, str) and path.startswith(WANDB_ARTIFACT_PREFIX):
         artifact_path = Path(remove_prefix(path, WANDB_ARTIFACT_PREFIX) + ":" + alias)
         dataset_artifact = wandb.use_artifact(artifact_path.as_posix().replace("\\", "/"))
         assert dataset_artifact is not None, "Error: W&B dataset artifact doesn't exist"
         datadir = dataset_artifact.download()
         return datadir, dataset_artifact
     return None, None
Example 12
def test_use_artifact_simple(runner, wandb_init_run):
    art = wandb.use_artifact("mnist:v0", type="dataset")
    assert art.name == "mnist:v0"
    path = art.download()
    assert os.path.exists(path)
Example 13
        args_dict["command"] = ' '.join(sys.argv)
        popvision.save_app_info(args_dict)
        logging_handler = popvision.get_profile_logging_handler()
    else:
        logging_handler = None

    setup_logger(logging.getLevelName(args.log_level), logging_handler)

    if args.wandb and popdist_root(args):
        import wandb
        wandb.init(project="popart-bert",
                   config=args,
                   sync_tensorboard=True,
                   settings=wandb.Settings(console="wrap"))
        if args.wandb_checkpoint:
            artifact = wandb.use_artifact(args.wandb_checkpoint, type='model')
            artifact_dir = artifact.download()
            args.onnx_checkpoint = os.path.join(artifact_dir, "model.onnx")

    logger.info("Program Start")
    logger.info("Hostname: " + socket.gethostname())
    logger.info("Command Executed: " + str(sys.argv))

    # Run the main inference/training session by default
    if args.inference or not args.no_training:
        main(args)

    # If this was a training session and validation isn't disabled; validate.
    if not args.inference and not args.no_validation and not args.no_model_save and popdist_root(
            args):
        logger.info("Doing Validation")
Example 14
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # ✍️ Create a new run in Weights & Biases and set the project name ✍️
    project_name = "hf-sagemaker"
    job_type = 'Training'
    if training_args.run_name == 'tmp':
        name = f"{model_args.model_name_or_path}_{training_args.learning_rate}_{training_args.warmup_steps}"
    elif "hpt" in training_args.run_name:
        name = f"HypTn_{model_args.model_name_or_path}_{training_args.learning_rate}_{training_args.warmup_steps}"
        job_type = 'HyperparameterTuning'
    else:
        name = training_args.run_name

    wandb.init(name=name, project=project_name, job_type=job_type)
    wandb.run._label('sagemaker-hf')
    os.environ["WANDB_LOG_MODEL"] = "TRUE"  # Hugging Face Trainer will use this to log model weights to W&B

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    if data_args.dataset_name == 'banking77_artifacts':

        # Download the tokenized Datasets from W&B Artifacts and load to HF Datasets object
        for split in ['train', 'eval']:
            pth = f'./{split}'
            nm = f"{split}_{model_args.model_name_or_path.split('/')[-1]}_tokenized"
            artifact = wandb.use_artifact(f'morgan/hf-sagemaker/{nm}:v0',
                                          type=f'{split}_tokenized_dataset')
            artifact_dir = artifact.download(pth)
            if split == 'train':
                train_dataset = load_from_disk(pth)
            else:
                eval_dataset = load_from_disk(pth)

    elif data_args.dataset_name is not None:
        raw_datasets = load_dataset(data_args.dataset_name,
                                    data_args.dataset_config_name,
                                    cache_dir=model_args.cache_dir)
    else:
        raise ValueError(f"dataset_name must be passed")

    # Labels
    is_regression = False
    if data_args.dataset_name == 'banking77_artifacts':
        label_list = train_dataset.features["label"].names
    else:
        label_list = raw_datasets["train"].features["label"].names

    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Map labels to ids
    label_to_id = {v: i for i, v in enumerate(label_list)}

    if label_to_id is not None:
        model.config.label2id = label_to_id
        model.config.id2label = {
            id: label
            for label, id in config.label2id.items()
        }

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        result = tokenizer(examples['text'],
                           padding=padding,
                           max_length=max_seq_length,
                           truncation=True)

        # Map labels to IDs (not necessary for GLUE tasks)
        if "label" in examples:
            result["label"] = examples["label"]
        return result

    if data_args.dataset_name != 'banking77_artifacts':
        with training_args.main_process_first(
                desc="dataset map pre-processing"):
            raw_datasets = raw_datasets.map(
                preprocess_function,
                batched=True,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on dataset",
            )
        if training_args.do_train:
            if "train" not in raw_datasets:
                raise ValueError("--do_train requires a train dataset")
            train_dataset = raw_datasets["train"]
            if data_args.max_train_samples is not None:
                train_dataset = train_dataset.select(
                    range(data_args.max_train_samples))

        if training_args.do_eval:
            if "validation" not in raw_datasets and "validation_matched" not in raw_datasets and "test" in raw_datasets:
                raw_datasets['validation'] = raw_datasets['test']
            elif "validation" not in raw_datasets and "validation_matched" not in raw_datasets and "test" not in raw_datasets:
                raise ValueError("--do_eval requires a validation dataset")
            eval_dataset = raw_datasets["validation_matched" if data_args.
                                        task_name == "mnli" else "validation"]
            if data_args.max_eval_samples is not None:
                eval_dataset = eval_dataset.select(
                    range(data_args.max_eval_samples))

    # ✍️ Log the training and eval datasets as Weights & Biases Tables to Artifacts ✍️
    # Log only if we are not doing a hyperparameter sweep
    if "hpt" not in training_args.run_name:
        for d_idx, ds in enumerate([train_dataset, eval_dataset]):

            # Create W&B Table
            dataset_table = wandb.Table(
                columns=['id', 'label_id', 'label', 'text'])

            # Ensure different row ids when logging train and eval data
            if d_idx == 1:
                idx_step = len(train_dataset)
                nm = 'eval'
            else:
                idx_step = 0
                nm = 'train'

            # Add each row of data to the table
            for index in range(len(ds)):
                idx = index + idx_step

                lbl = ds[index]['label']
                row = [idx, lbl, model.config.id2label[lbl], ds[index]['text']]
                dataset_table.add_data(*row)

            # Log the table to Weights & Biases
            dataset_artifact = wandb.Artifact(
                f"{data_args.dataset_name}_{nm}_dataset", type=f"{nm}_dataset")
            dataset_artifact.add(dataset_table,
                                 f"{data_args.dataset_name}_{nm}")
            wandb.log_artifact(dataset_artifact)

    # Get the metric function
    metric = load_metric("accuracy")

    class ComputeMetrics:
        def __init__(self, train_len, eval_steps, log_predictions=False):
            self.train_len = train_len
            self.eval_steps = eval_steps
            self.log_predictions = log_predictions
            self.eval_step_count = eval_steps

        def __call__(self, p: EvalPrediction):
            preds = p.predictions[0] if isinstance(p.predictions,
                                                   tuple) else p.predictions
            preds_idxs = np.squeeze(preds) if is_regression else np.argmax(
                preds, axis=1)
            preds_vals = np.max(preds, axis=1)

            # Create W&B Table
            validation_table = wandb.Table(
                columns=['id', 'step', 'pred_label_id', 'pred_score'])

            if self.log_predictions:
                # Add predictions to your table
                for i, val_pred in enumerate(preds_idxs):
                    idx = i + len(train_dataset)
                    row = [idx, self.eval_step_count, val_pred, preds_vals[i]]
                    validation_table.add_data(*row)

                wandb.log(
                    {
                        f'eval_predictions_table/{data_args.dataset_name}_preds_step_{self.eval_step_count}':
                        validation_table
                    },
                    commit=False)
                # increment step count
                self.eval_step_count += self.eval_steps

            return {
                "accuracy":
                (preds_idxs == p.label_ids).astype(np.float32).mean().item()
            }

    # ✍️ Log the evaluation predictions at each evaluation to W&B Tables for model evaluation ✍️
    log_preds_to_wandb = "hpt" not in training_args.run_name
    compute_metrics = ComputeMetrics(len(train_dataset),
                                     training_args.eval_steps,
                                     log_preds_to_wandb)

    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        train_result = trainer.train()

    # ✍️ Finish the W&B run to tidy up the process ✍️
    wandb.finish()

    # Delete tmp folder to free up space on disk
    os.system(f"rm -rf {training_args.output_dir}")
Example 15
 def download_model_artifact(self, name):
     model_artifact = wandb.use_artifact(name + ":latest")
     assert model_artifact is not None, 'Error: W&B model artifact doesn\'t exist'
     modeldir = model_artifact.download()
     print("Downloaded model to : ", modeldir)
     return modeldir, model_artifact
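All of the examples above rely on wandb.use_artifact being called inside an active run; the smallest complete session looks roughly like this, with project and artifact names assumed:

import wandb

run = wandb.init(project="my-project")            # use_artifact requires an active run
artifact = run.use_artifact("my-dataset:latest")  # assumed artifact name
local_dir = artifact.download()                   # returns the local download directory
run.finish()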