def copy(source, destination, lazy=True, src_hash=None, dst_hash=None, checksum=md5_checksum):
    """Copy the file at *source* to *destination*.

    When ``lazy`` is truthy and *destination* already exists as a file, the
    copy is skipped if :func:`is_duplicate` reports both files as identical
    content-wise. When *destination* does not exist, the copy always happens.

    Args:
        source (PathLike): The file to copy.
        destination (PathLike): Where to copy the file to.
        lazy (bool): Optional. Default to ``True``. If set to ``True``, an existing *destination* is only
            overwritten when its content differs from *source*.
        src_hash (str): Optional. Default to ``None``. Forwarded to :func:`is_duplicate` as the *source* checksum.
        dst_hash (str): Optional. Default to ``None``. Forwarded to :func:`is_duplicate` as the *destination*
            checksum.
        checksum (Callable): A function which computes a checksum from a |Path|, forwarded to
            :func:`is_duplicate`.

    Raises:
        OSError: If *source* is not a file, if *destination* exists but is not a file, or if something went wrong
            during copy.

    """
    src = Path(source)
    dst = Path(destination)

    # Copying a path onto itself is a no-op.
    if src == dst:
        return

    if not src.is_file():
        raise OSError('Invalid source: {} is not a file.'.format(src))

    if dst.exists():
        if not dst.is_file():
            raise OSError(
                'Invalid destination: {} exists but is not a file.'.format(dst))

        # Laziness only makes sense when there is an existing file to compare against.
        if lazy and is_duplicate(src, dst, src_hash, dst_hash, checksum=checksum):
            return

    shutil.copy(str(src), str(dst))
def initialisation(path, checkpoint_reference=None, name=None):
    """Construct a valid PMF model initialisation from a |Path|.

    The return type depends on the initialisation type:

    * If *path* is a file, the function returns a |Checkpoint| built from ``name`` and *path*.
    * If *path* is a directory containing a ``metadata.yaml`` file, the function loads it as a PMF model
      (without checkpoint data) and returns the |Model| instance with its checkpoint set to
      ``checkpoint_reference``.

    Args:
        path (PathLike): A valid |Path| to the model initialisation.
        checkpoint_reference (hashable): If initialising from a PMF model, one needs to provide a reference to the
            actual model |Checkpoint| used to construct a valid initialisation.
        name (str): If initialising from a file, one needs to provide a name used to identify the initial network
            to construct a valid initialisation.

    Returns:
        |Model|, |Checkpoint|: The retrieved model initialisation.

    Raises:
        OSError: If the path provided does not exist on the filesystem or points to neither a file nor
            a PMF model.
        ValueError: If the arguments provided are incompatible: a file without a ``name``, a PMF model without a
            ``checkpoint_reference``, or a ``checkpoint_reference`` absent from the model checkpoint collection.
        PlumsModelTreeValidationError: If the path provided points to an invalid PMF model.
        PlumsModelMetadataValidationError: If the path provided points to a PMF model with an invalid metadata.

    """
    path = Path(path)

    if not path.exists():
        raise OSError('Invalid path provided: {} does not exists'.format(path))

    # Single weight file: wrap it into a named checkpoint.
    if path.is_file():
        if name is None:
            raise ValueError(
                'Invalid arguments provided: {} points to a file but name is None.'
                .format(path))
        return Checkpoint(name, path)

    # Directory with a metadata file: handle it as a PMF model.
    if path.is_dir() and (path / 'metadata.yaml').is_file():
        if checkpoint_reference is None:
            raise ValueError(
                'Invalid arguments provided: '
                '{} points to a PMF model but no checkpoint reference was given.'
                .format(path))

        # Checkpoint data is not needed to reference the model as an initialisation.
        model = Model.load(path, checkpoints=False)

        if checkpoint_reference not in model.checkpoint_collection:
            raise ValueError('Invalid arguments provided: '
                             '{} points to a PMF model which does not '
                             'contains {} as a checkpoint.'.format(
                                 path, checkpoint_reference))

        model._checkpoint = checkpoint_reference
        return model

    # Anything else (directory without metadata, special file, ...) is rejected explicitly
    # instead of silently returning None.
    raise OSError(
        'Invalid path provided: {} must either be a PMF model or a weight file.'
        .format(path))
def save(self, path, force=False, **kwargs):
    """Save a |Model| to |Path|.

    The destination is sanity-checked, then the build parameters, the producer configuration, the
    initialisation, the checkpoints and finally the ``metadata.yaml`` file are written under *path*.
    File copies go through :func:`copy`, which may skip files already present with the same content.

    Args:
        path (PathLike): The |Path| where to save.
        force (bool): Optional. Default to ``False``. If ``True``, do not raise when *path* already exists
            without a ``metadata.yaml`` file, or when it holds a PMF model with the same :attr:`id` which
            fails to load; carry on saving instead.
        **kwargs: Only ``checkpoints`` (bool, default ``True``) is read. If ``False``, checkpoint data files
            are not copied and their metadata ``path`` entry is set to ``'.'``. The value is also forwarded
            to :meth:`Model.load` when inspecting an existing destination.

    Raises:
        ValueError: If ``path`` points to a file.
        OSError: If ``path`` exists and:

            * It does not contain a ``metadata.yaml`` file and ``force`` is ``False``.
            * It contains a ``metadata.yaml`` file which fails metadata validation.
            * It contains a PMF model with a different :attr:`id`.
            * It contains a PMF model with the same :attr:`id` which fails to load and ``force``
              is ``False``.

    """
    path = Path(path)
    # Placeholder used when no previously saved model gets loaded from the destination
    # (Mock is defined outside of this block).
    model_dst = Mock()
    # Loop-invariant: whether checkpoint data files must be copied along.
    with_checkpoints = kwargs.get('checkpoints', True)

    # sanity checks
    if path.exists():
        if path.is_file():
            raise ValueError('Invalid path: {} is a file.'.format(path))

        if (path / 'metadata.yaml').exists():
            with open(str(path / 'metadata.yaml'), 'r') as f:
                metadata = yaml.safe_load(f)

            try:
                metadata = Metadata().validate(metadata)
            except (SchemaError, PlumsValidationError):
                # If the metadata file happens to be invalid, we might enter uncharted territories we are not
                # prepared for. Abort !
                raise OSError(
                    'Invalid path: {} is not a valid PMF metadata file.'.
                    format(path / 'metadata.yaml'))

            if metadata['model']['id'] != self.id:
                # If the destination model id is different from ours, we might enter uncharted territories
                # we are not prepared for. Abort !
                raise OSError(
                    'Invalid path: {} has a different PMF model id '
                    '({} != {}).'.format(path / 'metadata.yaml', self.id,
                                         metadata['model']['id']))

            try:
                model_dst = Model.load(path, checkpoints=with_checkpoints)
            except (SchemaError, PlumsValidationError):
                if not force:
                    raise OSError(
                        'Invalid path: {} is an invalid PMF model '
                        'with the same model id ({}).'.format(
                            path / 'metadata.yaml', self.id))

                # Use the insider fail-agnostic back door to load what we can from the model anyway
                model_dst = Model._init_from_path(path, metadata)

                # We remove PMF related elements as the previously written model is not valid. Note that if
                # the deletion fails, we ignore it because a valid PMF model will be written anyway and we
                # never assume the save destination to be empty.
                rmtree(path, ignore_errors=True,
                       black_list=('metadata', model_dst.producer.configuration))
        else:
            if not force:
                raise OSError(
                    'Invalid path: {} already exists.'.format(path))

    # Initialize destination
    path.mkdir(parents=True, exist_ok=True)

    # Prepare metadata dictionary
    __metadata__ = {
        'format': {
            'version': self.__version__,
            'producer': {
                'name': self.producer.name,
                'version': {
                    'format': self.producer.version.format,
                    'value': self.producer.version.version
                }
            }
        },
        'model': {
            'name': self.name,
            'id': self.id,
            'training': {
                'status': self.training.status,
                'start_epoch': self.training.start_epoch,
                'start_time': self.training.start_timestamp,
                'latest_epoch': self.training.latest_epoch,
                'latest_time': self.training.latest_timestamp,
                'end_epoch': self.training.end_epoch,
                'end_time': self.training.end_timestamp,
                'latest': self.checkpoint_collection.latest,
                'checkpoints': {}
            },
            'initialisation': None,
            'configuration': {}
        }
    }

    # Initialize directory
    (path / 'data' / 'checkpoints').mkdir(parents=True, exist_ok=True)

    # Save build parameters
    # It should be a rather small file, so blindly overriding it
    # should be faster than write-in-temp and lazy-copy
    with open(str(path / 'data' / 'build_parameters.yaml'), 'w') as f:
        yaml.safe_dump(self.build_parameters, f)

    # Copy configuration
    # NOTE: model_dst is never reset to None in this method; copy() itself disables laziness
    # when the destination file does not exist yet.
    configuration_dst = path / self.producer.configuration[-1]
    copy(str(self.producer.configuration), str(configuration_dst),
         lazy=model_dst is not None)

    # Add configuration to metadata
    __metadata__['model']['configuration'].update({
        'path': str(configuration_dst.anchor_to_path(path)),
        'hash': md5_checksum(self.producer.configuration)
    })

    # Copy initialisation
    initialisation_dir = path / 'data' / 'initialisation'

    if self.initialisation is None:
        # No initialisation: still create the (empty) directory.
        initialisation_dir.mkdir(parents=True, exist_ok=True)

    if isinstance(self.initialisation, Checkpoint):
        initialisation_dir.mkdir(parents=True, exist_ok=True)
        checkpoint_dst = initialisation_dir / self.initialisation.path[-1]
        # The destination checksum must come from the previous initialisation *hash*:
        # comparing our hash against its name could never match.
        copy(str(self.initialisation.path), str(checkpoint_dst),
             lazy=model_dst is not None,
             src_hash=self.initialisation.hash,
             dst_hash=getattr(model_dst.initialisation, 'hash', None))

        # Add file initialisation to metadata
        __metadata__['model']['initialisation'] = {
            'file': {
                'name': str(self.initialisation.name),
                'path': str(checkpoint_dst.anchor_to_path(path)),
                'hash': self.initialisation.hash
            }
        }

    if isinstance(self.initialisation, Model):
        # The nested model creates its own destination directory; its checkpoint files are not copied.
        self.initialisation.save(initialisation_dir, force=force, checkpoints=False)

        # Add PMF initialisation to metadata
        __metadata__['model']['initialisation'] = {
            'pmf': {
                'name': self.initialisation.name,
                'id': self.initialisation.id,
                'path': str(initialisation_dir.anchor_to_path(path)),
                'checkpoint': self.initialisation.checkpoint
            }
        }

    # Copy checkpoint_collection
    for reference, checkpoint in self.checkpoint_collection.items():
        if with_checkpoints:
            checkpoint_dst = path / 'data' / 'checkpoints' / checkpoint.path[-1]
            checkpoint_path = str(checkpoint_dst.anchor_to_path(path))
        else:
            checkpoint_dst = None
            checkpoint_path = '.'

        # Add checkpoint to metadata
        __metadata__['model']['training']['checkpoints'][reference] = {
            'epoch': checkpoint.epoch,
            'path': checkpoint_path,
            'hash': checkpoint.hash
        }

        # If needed (usually), copy file to destination
        if with_checkpoints:
            # Hand copy() the previous checkpoint *hash* (if any), not the checkpoint object itself.
            previous = model_dst.checkpoint_collection.get(checkpoint.name)
            copy(str(checkpoint.path), str(checkpoint_dst),
                 lazy=model_dst is not None,
                 src_hash=checkpoint.hash,
                 dst_hash=getattr(previous, 'hash', None))

    # Save metadata
    with open(str(path / 'metadata.yaml'), 'w') as f:
        yaml.safe_dump(__metadata__, f)
class Checkpoint(object):
    """Python-side representation of a model checkpoint.

    A |Checkpoint| might be defined by the following parameters:

    * A :attr:`name`, a :attr:`path` and a :attr:`hash`.
    * A :attr:`name` and a :attr:`hash`.
    * A :attr:`name` and a :attr:`path`.

    Note that although the :attr:`epoch` is never needed to strictly define a |Checkpoint|, it is compulsory to
    inject it into a |CheckpointCollection|.

    Args:
        name (hashable): The |Checkpoint| unique identifier.
        path (Pathlike): Optional. default to ``None``. The path to the |Checkpoint| data file.
        epoch (int): Optional. default to ``None``. The |Checkpoint| epoch, if known.
        hash (str): Optional. default to ``None``. The |Checkpoint| data file checksum. When omitted, it is
            computed from :attr:`path`.

    Attributes:
        name (hashable): The |Checkpoint| unique identifier.
        path (Pathlike): The path to the |Checkpoint| data file.
        epoch (int): The |Checkpoint| epoch, if known.
        hash (str): The |Checkpoint| data file checksum.

    Raises:
        OSError: If a ``path`` is given but does not point to a file.

    """

    def __init__(self, name, path=None, epoch=None, hash=None):
        self.path = None if path is None else Path(path)

        if not (self.path is None or self.path.is_file()):
            raise OSError('Invalid checkpoint: {} is not a file.'.format(path))

        self.name = name
        self.epoch = epoch

        # A provided checksum is validated, a missing one is computed from the data file.
        if hash is None:
            self.hash = md5_checksum(self.path)
        else:
            self.hash = MD5Checksum().validate(hash)

    def __repr__(self):
        """Return a representation of a |Checkpoint|."""
        fields = (self.__class__.__name__, self.name, self.path, self.epoch, self.hash)
        return '{}(name={}, path={}, epoch={}, hash={})'.format(*fields)

    __str__ = __repr__

    def __eq__(self, other):
        """Return whether two |Checkpoint| share both :attr:`epoch` and :attr:`hash`.

        Args:
            other (|Checkpoint|): A |Checkpoint| to compare to.

        Returns:
            bool: ``True`` if both have the same :attr:`epoch` and the same :attr:`hash`. ``NotImplemented`` is
            returned when an attribute needed for the comparison is missing.

        """
        try:
            same_epoch = self.epoch == other.epoch
            return same_epoch and self.hash == other.hash
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        """Return the negation of the equality test between two |Checkpoint|.

        Args:
            other (|Checkpoint|): A |Checkpoint| to compare to.

        Returns:
            bool: ``True`` if the two differ in :attr:`epoch` or in :attr:`hash`.

        """
        if self == other:
            return False
        return True

    def __hash__(self):
        """Return a valid hash for the |Checkpoint|.

        Returns:
            int: The hash of the (:attr:`epoch`, :attr:`hash`) tuple.

        """
        identity = (self.epoch, self.hash)
        return hash(identity)