def __post_init__(self):
    """Finish construction of the instance.

    When the instance already carries a non-``None`` ``_data_points``
    attribute, the ``id`` of each data point is captured as a tuple in
    ``data_point_ids``.  The transient decoded state holder is then created
    and ``state`` is set to ``'n'``.

    """
    super().__init__()
    # a missing attribute and an attribute set to ``None`` are treated alike
    if getattr(self, '_data_points', None) is not None:
        self.data_point_ids = tuple(dp.id for dp in self.data_points)
    self._decoded_state = PersistedWork(
        '_decoded_state', self, transient=True)
    self.state = 'n'
def __post_init__(self, weighted_split_path: Path):
    """Create the holder of the label counts.

    :param weighted_split_path: the directory in which the label counts file
                                is kept, or ``None`` to use the name
                                ``_label_counts`` instead of a file path

    """
    super().__post_init__()
    if weighted_split_path is not None:
        file_name = f'weighted-labels-{self.weighted_split_name}.dat'
        label_path = weighted_split_path / file_name
    else:
        label_path = '_label_counts'
    self._label_counts = PersistedWork(label_path, self)
def __post_init__(self):
    """Validate the partition attribute and create the split label holder.

    :raises DatasetError: if ``partition_attr`` is ``None``

    """
    super().__post_init__()
    if self.partition_attr is None:
        raise DatasetError("Missing 'partition_attr' field")
    labels_path = self.split_labels_path
    # without a configured path, fall back to the name of the attribute
    labels_path = '_strat_split_labels' if labels_path is None else labels_path
    self._strat_split_labels = PersistedWork(labels_path, self, mkdir=True)
def __post_init__(self):
    """Check the type of the split container and create the cached work.

    :raises DatasetError: if ``split_container`` is not a
                          ``SplitKeyContainer``

    """
    super().__post_init__()
    PersistableContainer.__init__(self)
    container = self.split_container
    if not isinstance(container, SplitKeyContainer):
        container_type = type(container)
        raise DatasetError(
            'Expecting type SplitKeyContainer but ' +
            f'got: {container_type}')
    self._inst_split_name = None
    self._keys_by_split = PersistedWork('_keys_by_split', self)
    self._splits = PersistedWork('_splits', self)
def __post_init__(self, config_factory: ConfigFactory):
    """Initialize the configuration factory state and progress bar width.

    When ``progress_bar_cols`` is the string ``term``, it is replaced by the
    terminal width less five columns, or by ``None`` when the terminal size
    can not be determined.

    :param config_factory: passed to ``_init_config_factory``

    """
    super().__init__()
    self._init_config_factory(config_factory)
    self._config_factory = PersistedWork('_config_factory', self)
    self._executor = PersistedWork('_executor', self)
    self.debuged = False
    if self.progress_bar_cols != 'term':
        return
    try:
        columns = os.get_terminal_size()[0]
    except OSError:
        logger.debug('unable to automatically determine ' +
                     'terminal width--skipping')
        self.progress_bar_cols = None
    else:
        # make space for embedded validation loss messages
        self.progress_bar_cols = columns - 5
def __post_init__(self):
    """Resolve the token feature IDs and create the vectorizer holder.

    If ``token_feature_ids`` is not set, it defaults to the feature IDs of
    ``doc_parser``.  Otherwise every configured feature ID must also be a
    feature ID of the parser.

    :raises VectorizerError: if ``token_feature_ids`` has members that are
                             not in the parser's ``token_feature_ids``

    """
    super().__post_init__()
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug('creating fd vec manager')
    if self.token_feature_ids is not None:
        parser_feats = self.doc_parser.token_feature_ids
        missing = self.token_feature_ids - parser_feats
        if missing:
            fdiffs = ', '.join(missing)
            raise VectorizerError(
                'Parser token features do not exist in vectorizer: ' +
                f'{self.token_feature_ids} - ' +
                f'{parser_feats} = {fdiffs}')
    else:
        self.token_feature_ids = self.doc_parser.token_feature_ids
    self._spacy_vectorizers = PersistedWork('_spacy_vectorizers', self)
def __post_init__(self, cache: bool):
    """Create the cache directory, resolve ``cased`` and create the holders
    of the tokenizer and model.

    If ``cased`` is ``None``, it is set to ``False`` when ``model_id``
    contains the string ``uncased`` and to ``True`` otherwise.

    :param cache: passed as the third positional argument to the
                  ``PersistedWork`` instances of the tokenizer and model

    """
    super().__init__()
    # the directory is created exactly once; a second identical stanza later
    # in this method was dead code since the directory exists by then
    if self.cache_dir is not None and not self.cache_dir.exists():
        # guard with the same level the message is emitted at
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'creating cache directory: {self.cache_dir}')
        self.cache_dir.mkdir(parents=True, exist_ok=True)
    if self.cased is None:
        if 'uncased' in self.model_id:
            self.cased = False
        else:
            logger.info("'cased' not given--assuming a cased model")
            self.cased = True
    # NOTE(review): the key '_tokenzier' is misspelled, but it is kept as is
    # since other code (not visible here) may refer to it by this name
    self._tokenizer = PersistedWork('_tokenzier', self, cache)
    self._model = PersistedWork('_model', self, cache)
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'id: {self.model_id}, cased: {self.cased}')
def __post_init__(self, decoded_attributes):
    """Validate the split stash container and initialize the batch state.

    :param decoded_attributes: assigned to :obj:`decoded_attributes`

    :raises DeepLearnError: if ``split_stash_container`` is neither a
                            ``SplitStashContainer`` nor both a
                            ``SplitKeyContainer`` and a ``Stash``

    """
    super().__post_init__()
    Deallocatable.__init__(self)
    # TODO: this class conflates key split and delegate stash functionality
    # in the `split_stash_container`.  An instance of this type serves the
    # purpose, but it need not be.  Instead it just needs to be both a
    # SplitKeyContainer and a Stash.  This probably should be split out in
    # to two different fields.
    cont = self.split_stash_container
    is_usable = isinstance(cont, SplitStashContainer) or \
        (isinstance(cont, SplitKeyContainer) and isinstance(cont, Stash))
    if not is_usable:
        raise DeepLearnError('Expecting SplitStashContainer but got ' +
                             f'{cont.__class__}')
    id_sets_path = self.data_point_id_sets_path
    id_sets_path.parent.mkdir(parents=True, exist_ok=True)
    self._batch_data_point_sets = PersistedWork(id_sets_path, self)
    self.priming = False
    self.decoded_attributes = decoded_attributes
    self._update_comp_stash_attribs()
def __init__(self, use_gpu: bool = True, data_type: type = torch.float32,
             cuda_device_index: int = None):
    """Initialize this configuration.

    :param use_gpu: whether or not to use CUDA/GPU

    :param data_type: the default data type to use when creating new tensors
                      in this configuration

    :param cuda_device_index: the CUDA device to use, which is documented to
                              default to 0 when ``use_gpu`` is ``True``

    """
    super().__init__()
    logger.debug(f'use_gpu: {use_gpu}')
    self.use_gpu = use_gpu
    self.data_type = data_type
    # the initial device can not be cached globally since several instances
    # of this class may have different values of `use_gpu`
    self._init_device_pw = PersistedWork('_init_device_pw', self)
    # the CPU device, on the other hand, is shared across instances
    self._cpu_device_pw = PersistedWork(
        '_cpu_device_pw', self, cache_global=True)
    self._cpu_device_pw._mark_deallocated()
    self._cuda_device_index = cuda_device_index
def from_struct(cls: type, struct: Dict[str, Any],
                target_dir: Path) -> Distribution:
    """Return a distribution directly from the data structure created from
    :class:`.Discoverer`.

    :param struct: the data structure given by :meth:`.Discoverer.freeze`
                   using ``flatten=True``

    :param target_dir: where the distribution will be *thawed*

    :return: a new instance with ``struct`` as the initial value of its
             ``_struct`` persisted work

    """
    translator = PathTranslator(target_dir)
    dist = cls(None, None, target_dir, translator)
    dist._struct = PersistedWork('_struct', dist, initial_value=struct)
    return dist
def __post_init__(self):
    """Create the holder of the previous results, which is cached globally
    when ``cache_previous_results`` is set.

    """
    use_global_cache = self.cache_previous_results
    self._previous_results = PersistedWork(
        '_previous_results', self, cache_global=use_global_cache)
class FeatureDocumentVectorizerManager(FeatureVectorizerManager):
    """Creates and manages instances of :class:`.FeatureDocumentVectorizer`
    and parses text in to feature based documents.

    This is used to manage the relationship of a given set of parsed features
    keeping in mind that parsing will usually happen as a preprocessing step.
    A second step is the vectorization of those features, which can be any
    proper subset of those features parsed in the previous step.  However,
    these checks, of course, are not necessary if pickling isn't used across
    the parse and vectorization steps.

    Instances can set a hard fixed token length, in which case vectorized
    tensors have a like fixed width based on the setting of
    :obj:`token_length`.  However, this can also be set to use the longest
    sentence of the document, which is useful when computing vectorized
    tensors from the document as a batch, even if the input data are batched
    as a group of sentences in a document.

    :see: :class:`.FeatureDocumentVectorizer`

    :see: :meth:`parse`

    """
    doc_parser: FeatureDocumentParser = field()
    """Used to :meth:`parse` documents."""

    token_length: int = field()
    """The length of tokens used in fixed length features.  This is used as a
    dimension in decoded tensors.  If this value is ``-1``, use the longest
    sentence of the document as the token length, which is usually counted as
    the batch.

    :see: :meth:`get_token_length`

    """
    token_feature_ids: Set[str] = field(default=None)
    """Indicates which spaCy parsed features to generate in the vectorizers
    held in this instance.  Examples include ``norm``, ``ent``, ``dep``,
    ``tag``.

    If this is not set, it defaults to the `token_feature_ids` in
    :obj:`doc_parser`.

    :see: :obj:`.SpacyFeatureVectorizer.VECTORIZERS`

    """
    def __post_init__(self):
        """Resolve :obj:`token_feature_ids` and create the vectorizer holder.

        :raises VectorizerError: if :obj:`token_feature_ids` has members that
                                 are not feature IDs of :obj:`doc_parser`

        """
        super().__post_init__()
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('creating fd vec manager')
        if self.token_feature_ids is not None:
            parser_feats = self.doc_parser.token_feature_ids
            missing = self.token_feature_ids - parser_feats
            if missing:
                fdiffs = ', '.join(missing)
                raise VectorizerError(
                    'Parser token features do not exist in vectorizer: ' +
                    f'{self.token_feature_ids} - ' +
                    f'{parser_feats} = {fdiffs}')
        else:
            self.token_feature_ids = self.doc_parser.token_feature_ids
        self._spacy_vectorizers = PersistedWork('_spacy_vectorizers', self)

    @property
    def is_batch_token_length(self) -> bool:
        """Whether or not the token length is variable, which is the case when
        :obj:`token_length` is negative.

        """
        return self.token_length < 0

    def get_token_length(self, doc: FeatureDocument) -> int:
        """Get the token length for the document.  If
        :obj:`is_batch_token_length` is ``True``, then the token length is
        the ``max_sentence_len`` of ``doc``.  See the class docs.

        :param doc: used to compute the longest sentence if
                    :obj:`is_batch_token_length` is ``True``

        :return: the (global) token length for the document

        """
        variable = self.is_batch_token_length
        return doc.max_sentence_len if variable else self.token_length

    def parse(self, text: Union[str, List[str]], *args, **kwargs) -> \
            FeatureDocument:
        """Parse text or a text as a list of sentences by delegating to
        :obj:`doc_parser`.

        **Important**: Parsing documents through this manager instance is
        better since safe checks are made that features are available from
        those used when documents are parsed before pickling.

        :param text: either a string or a list of strings; if the former a
                     document with one sentence will be created, otherwise a
                     document is returned with a sentence for each string in
                     the list

        :param args: passed on to the parser

        :param kwargs: passed on to the parser

        """
        parser = self.doc_parser
        return parser.parse(text, *args, **kwargs)

    @property
    @persisted('_spacy_vectorizers')
    def spacy_vectorizers(self) -> Dict[str, SpacyFeatureVectorizer]:
        """Return vectorizers based on the :obj:`token_feature_ids` configured
        on this instance.  Keys are token level feature ids found in
        :obj:`.SpacyFeatureVectorizer.VECTORIZERS`.

        :return: an :class:`collections.OrderedDict` of vectorizers keyed and
                 ordered by sorted feature ID

        """
        debug = logger.isEnabledFor(logging.DEBUG)
        if debug:
            logger.debug('creating spacy vectorizers')
        registry = SpacyFeatureVectorizer.VECTORIZERS
        # only features both registered and configured get a vectorizer
        feature_ids = sorted(set(registry.keys()) & self.token_feature_ids)
        vectorizers = collections.OrderedDict()
        if debug:
            logger.debug(f'creating token features: {feature_ids}')
        for feature_id in feature_ids:
            vec_cls = registry[feature_id]
            vectorizers[feature_id] = vec_cls(
                name=f'spacy vectorizer: {feature_id}',
                config_factory=self.config_factory,
                feature_id=feature_id,
                torch_config=self.torch_config,
                vocab=self.doc_parser.model.vocab)
        if debug:
            logger.debug(f'created {len(vectorizers)} vectorizers')
        return vectorizers

    def deallocate(self):
        """Deallocate any vectorizers already created, then this instance."""
        # avoid creating the vectorizers only to deallocate them
        if self._spacy_vectorizers.is_set():
            created = self.spacy_vectorizers
            for vectorizer in created.values():
                vectorizer.deallocate()
            created.clear()
        super().deallocate()
def __post_init__(self):
    """Create the holder of the dataframe, which is persisted at
    ``dataframe_path``.

    """
    super().__post_init__()
    Deallocatable.__init__(self)
    df_path = self.dataframe_path
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'split stash post init: {df_path}')
    self._dataframe = PersistedWork(df_path, self, mkdir=True)
class DataframeStash(ReadOnlyStash, Deallocatable, Writable,
                     PrimeableStash, metaclass=ABCMeta):
    """A factory stash that uses a Pandas data frame from which to load.  It
    uses the data frame index as the keys and :class:`pandas.Series` as
    values.  The dataframe is usually constructed by reading a file
    (i.e. CSV) and doing some transformation before using it in an
    implementation of this stash.

    The dataframe created by :meth:`_get_dataframe` must have a string or
    integer index since keys for all stashes are of type :class:`str`.  The
    index will be mapped to a string if it is an int automatically.

    """
    dataframe_path: Path = field()
    """The path to store the pickled version of the generated dataframe
    created with :meth:`_get_dataframe`.

    """
    def __post_init__(self):
        """Create the holder of the dataframe persisted at
        :obj:`dataframe_path`.

        """
        super().__post_init__()
        Deallocatable.__init__(self)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'split stash post init: {self.dataframe_path}')
        self._dataframe = PersistedWork(self.dataframe_path, self, mkdir=True)

    def deallocate(self):
        """Deallocate this instance and the cached dataframe."""
        super().deallocate()
        self._dataframe.deallocate()

    @abstractmethod
    def _get_dataframe(self) -> pd.DataFrame:
        """Get or create the dataframe

        """
        pass

    def _prepare_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Make sure the index of the dataframe is usable as stash keys.

        An integer index (of any width or signedness) is mapped to strings.
        An object or string typed index is left as is.

        :param df: the dataframe to check, which is modified in place when
                   the index is mapped

        :return: ``df``

        :raises DataframeError: if the index is neither string nor integer
                                typed

        """
        dt = df.index.dtype
        types = pd.api.types
        # comparing with the builtin ``int`` only matches the platform's
        # default integer width, so test for the integer family instead
        if types.is_integer_dtype(dt):
            df.index = df.index.map(str)
        elif not (types.is_object_dtype(dt) or types.is_string_dtype(dt)):
            s = f'Data frame index must be a string or int, but got: {dt}'
            raise DataframeError(s)
        return df

    @property
    @persisted('_dataframe')
    def dataframe(self):
        """The dataframe created by :meth:`_get_dataframe` with its index
        prepared for use as keys.

        """
        df = self._get_dataframe()
        df = self._prepare_dataframe(df)
        return df

    def prime(self):
        """Prime the super class and then force creation of the dataframe."""
        super().prime()
        self.dataframe

    def clear(self):
        """Clear the cached dataframe."""
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('clearing dataframe stash')
        self._dataframe.clear()

    def load(self, name: str) -> pd.Series:
        """Return the row of the dataframe with index label ``name``."""
        return self.dataframe.loc[name]

    def exists(self, name: str) -> bool:
        """Return whether ``name`` is a label of the dataframe index."""
        return name in self.dataframe.index

    def keys(self) -> Iterable[str]:
        """Return the labels of the dataframe index as strings."""
        return map(str, self.dataframe.index)

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout):
        """Write the row count and the column names of the dataframe.

        :param depth: the indentation depth passed to ``_write_line``

        :param writer: where the output is written

        """
        df = self.dataframe
        self._write_line(f'rows: {df.shape[0]}', depth, writer)
        # column labels are not necessarily strings
        cols = ', '.join(map(str, df.columns))
        self._write_line(f'cols: {cols}', depth, writer)
def __post_init__(self):
    """Create the holder of the keys by split, which is persisted at
    ``key_path``.

    """
    super().__post_init__()
    key_path = self.key_path
    self._keys_by_split = PersistedWork(key_path, self, mkdir=True)
class SplitKeyDataframeStash(DataframeStash, SplitKeyContainer):
    """A stash and split key container that reads from a dataframe.

    """
    key_path: Path = field()
    """The path where the key splits (as a ``dict``) is pickled."""

    split_col: str = field()
    """The column name in the dataframe used to indicate the split
    (i.e. ``train`` vs ``test``).

    """
    def __post_init__(self):
        """Create the holder of the keys by split persisted at
        :obj:`key_path`.

        """
        super().__post_init__()
        self._keys_by_split = PersistedWork(self.key_path, self, mkdir=True)

    def deallocate(self):
        """Deallocate this instance and the cached keys by split."""
        super().deallocate()
        self._keys_by_split.deallocate()

    def _create_keys_for_split(self, split_name: str, df: pd.DataFrame) -> \
            Iterable[str]:
        """Generate an iterable of string keys.  It is expected this method to
        be potentially very expensive, so the results are cached to disk.
        This implementation returns the dataframe index.

        :param split_name: the name of the split (i.e. ``train`` vs ``test``)

        :param df: the data frame for the grouping of keys from CSV of data

        """
        return df.index

    def _get_counts_by_key(self) -> Dict[str, int]:
        """Return the number of rows for each value of :obj:`split_col`."""
        sc = self.split_col
        return dict(self.dataframe.groupby(sc)[sc].count().items())

    @persisted('_split_names')
    def _get_split_names(self) -> Set[str]:
        """Return the unique values of :obj:`split_col`."""
        return set(self.dataframe[self.split_col].unique())

    @persisted('_keys_by_split')
    def _get_keys_by_split(self) -> Dict[str, Tuple[str]]:
        """Return the keys of each split as given by
        :meth:`_create_keys_for_split`, keyed by split name.

        """
        keys_by_split = OrderedDict()
        # group by the column label itself rather than a one element list:
        # the latter makes newer versions of Pandas yield 1-tuples as group
        # keys, which would turn split names in to tuples
        for split, df in self.dataframe.groupby(self.split_col):
            logger.info(f'parsing keys for {split}')
            keys = self._create_keys_for_split(split, df)
            keys_by_split[split] = tuple(keys)
        return keys_by_split

    def clear(self):
        """Clear the cached dataframe and the cached keys."""
        super().clear()
        self.clear_keys()

    def clear_keys(self):
        """Clear only the cache of keys generated from the group by.

        """
        self._keys_by_split.clear()

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout):
        """Write the count and percentage of rows of each split and the total
        number of rows.

        :param depth: the indentation depth passed to ``_write_line``

        :param writer: where the output is written

        """
        total = self.dataframe.shape[0]
        self._write_line('data frame splits:', depth, writer)
        for split, cnt in self.counts_by_key.items():
            self._write_line(f'{split}: {cnt} ({cnt/total*100:.1f}%)',
                             depth, writer)
        self._write_line(f'total: {total}', depth, writer)
class BatchStash(TorchMultiProcessStash, SplitKeyContainer, Writeback,
                 Deallocatable, metaclass=ABCMeta):
    """A stash that vectorizes features in to easily consumable tensors for
    training and testing.  This stash produces instances of :class:`.Batch`,
    which is a batch in the machine learning sense, and the first dimension of
    what will become the tensor used in PyTorch.  Each of these batches has a
    logical one to many relationship to that batch's respective set of data
    points, which is encapsulated in the :class:`.DataPoint` class.

    The stash creates subprocesses to vectorize features in to tensors in
    chunks of IDs (data point IDs) from the subordinate stash using
    ``DataPointIDSet`` instances.

    To speed up experiments, all available features configured in
    ``vectorizer_manager_set`` are encoded on disk.  However, only the
    ``decoded_attributes`` (see attribute below) are available to the model
    regardless of what was created during encoding time.

    The lifecycle of the data follows:

    1. Feature data created by the client, which could be language features,
       row data etc.

    2. Vectorize the feature data using the vectorizers in
       ``vectorizer_manager_set``.  This creates the feature contexts
       (``FeatureContext``) specifically meant to be pickled.

    3. Pickle the feature contexts when dumping to disk, which is invoked in
       the child processes of this class.

    4. At train time, load the feature contexts from disk.

    5. Decode the feature contexts in to PyTorch tensors.

    6. The model manager uses the ``to`` method to copy the CPU tensors to the
       GPU (where GPUs are available).

    :see _process: for details on the pickling of the batch instances

    """
    _DICTABLE_WRITE_EXCLUDES = {'batch_feature_mappings'}

    data_point_type: Type[DataPoint] = field()
    """A subclass type of :class:`.DataPoint` implemented for the specific
    feature.

    """
    batch_type: Type[Batch] = field()
    """The batch class to be instantiated when creating batches.

    """
    split_stash_container: SplitStashContainer = field()
    """The source data stash that has both the data and data set keys for each
    split (i.e. ``train`` vs ``test``).

    """
    vectorizer_manager_set: FeatureVectorizerManagerSet = field()
    """Used to vectorize features in to tensors."""

    batch_size: int = field()
    """The number of data points in each batch, except the last (unless the
    data point cardinality divides the batch size).

    """
    model_torch_config: TorchConfig = field()
    """The PyTorch configuration used to (optionally) copy CPU to GPU memory.

    """
    data_point_id_sets_path: Path = field()
    """The path of where to store key data for the splits; note that the
    container might store its key splits in some other location.

    """
    decoded_attributes: InitVar[Set[str]] = field()
    """The attributes to decode; only these are available to the model
    regardless of what was created during encoding time; if None, all are
    available.

    """
    batch_feature_mappings: BatchFeatureMapping = field(default=None)
    """The meta data used to encode and decode each feature in to tensors.

    """
    batch_limit: int = field(default=sys.maxsize)
    """The max number of batches to process, which is useful for debugging."""

    def __post_init__(self, decoded_attributes):
        """Validate the split stash container and initialize the batch state.

        :param decoded_attributes: assigned to :obj:`decoded_attributes`

        :raises DeepLearnError: if :obj:`split_stash_container` is neither a
                                ``SplitStashContainer`` nor both a
                                ``SplitKeyContainer`` and a ``Stash``

        """
        super().__post_init__()
        Deallocatable.__init__(self)
        # TODO: this class conflates key split and delegate stash functionality
        # in the `split_stash_container`.  An instance of this type serves the
        # purpose, but it need not be.  Instead it just needs to be both a
        # SplitKeyContainer and a Stash.  This probably should be split out in
        # to two different fields.
        cont = self.split_stash_container
        if not isinstance(cont, SplitStashContainer) \
           and (not isinstance(cont, SplitKeyContainer) or
                not isinstance(cont, Stash)):
            raise DeepLearnError('Expecting SplitStashContainer but got ' +
                                 f'{self.split_stash_container.__class__}')
        self.data_point_id_sets_path.parent.mkdir(parents=True, exist_ok=True)
        self._batch_data_point_sets = PersistedWork(
            self.data_point_id_sets_path, self)
        self.priming = False
        self.decoded_attributes = decoded_attributes
        self._update_comp_stash_attribs()

    @property
    def decoded_attributes(self) -> Set[str]:
        """The attributes to decode.  Only these are available to the model
        regardless of what was created during encoding time; if None, all are
        available

        """
        return self._decoded_attributes

    @decoded_attributes.setter
    def decoded_attributes(self, attribs: Set[str]):
        """The attributes to decode.  Only these are available to the model
        regardless of what was created during encoding time; if None, all are
        available

        """
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'setting decoded attributes: {attribs}')
        self._decoded_attributes = attribs
        # a composite delegate only loads the keys of the decoded attributes
        if isinstance(self.delegate, BatchDirectoryCompositeStash):
            self.delegate.load_keys = attribs

    @property
    @persisted('_batch_metadata')
    def batch_metadata(self) -> BatchMetadata:
        """The metadata of the batches created by this stash, which pairs each
        kept field mapping with its vectorizer.

        The feature mapping comes from :obj:`batch_feature_mappings` when set,
        and otherwise from a temporary instance of :obj:`batch_type`.  Fields
        are kept when :obj:`decoded_attributes` is ``None`` (all are
        available) or when it contains the attribute of the field.

        """
        mapping: BatchFeatureMapping
        if self.batch_feature_mappings is not None:
            mapping = self.batch_feature_mappings
        else:
            # create a throw away batch only to get at the class's mapping
            batch: Batch = self.batch_type(None, None, None, None)
            batch.batch_stash = self
            mapping = batch._get_batch_feature_mappings()
            batch.deallocate()
        vec_mng_set: FeatureVectorizerManagerSet = self.vectorizer_manager_set
        attrib_keeps = self.decoded_attributes
        vec_mng_names = set(vec_mng_set.keys())
        by_attrib = {}
        mmng: ManagerFeatureMapping
        for mmng in mapping.manager_mappings:
            vec_mng_name: str = mmng.vectorizer_manager_name
            if vec_mng_name not in vec_mng_names:
                continue
            vec_mng: FeatureVectorizerManager = vec_mng_set[vec_mng_name]
            # named to avoid shadowing `dataclasses.field`
            fmap: FieldFeatureMapping
            for fmap in mmng.fields:
                # `None` means every attribute is available
                if attrib_keeps is None or fmap.attr in attrib_keeps:
                    vec = vec_mng[fmap.feature_id]
                    by_attrib[fmap.attr] = BatchFieldMetadata(fmap, vec)
        return BatchMetadata(self.data_point_type, self.batch_type,
                             mapping, by_attrib)

    def _update_comp_stash_attribs(self):
        """Update the composite stash grouping if we're using one and if this
        class is already configured.

        """
        if isinstance(self.delegate, BatchDirectoryCompositeStash):
            meta: BatchMetadata = self.batch_metadata
            meta_attribs: Set[str] = set(
                map(lambda f: f.attr, meta.mapping.get_attributes()))
            groups: Tuple[Set[str]] = self.delegate.groups
            # the union of all groups, which is empty when there are none
            gattribs = set().union(*groups)
            to_remove = gattribs - meta_attribs
            new_groups = []
            if len(to_remove) > 0:
                # drop attributes without metadata and any group left empty
                group: Set[str]
                for group in groups:
                    ng: Set[str] = meta_attribs & group
                    if len(ng) > 0:
                        new_groups.append(ng)
                self.delegate.groups = tuple(new_groups)
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'meta attribs: {meta_attribs}, groups: {groups}')

    @property
    @persisted('_batch_data_point_sets')
    def batch_data_point_sets(self) -> List[DataPointIDSet]:
        """Create the data point ID sets.  Each instance returned will
        correlate to a batch and each set of keys point to a feature
        :class:`.DataPoint`.

        """
        psets = []
        batch_id = 0
        cont = self.split_stash_container
        tc_seed = TorchConfig.get_random_seed_context()
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'{self.name}: creating keys with ({type(cont)}) ' +
                        f'using batch size of {self.batch_size}')
        for split, keys in cont.keys_by_split.items():
            if logger.isEnabledFor(logging.INFO):
                logger.info(f'keys for split {split}: {len(keys)}')
            # keys are ordered and needed to be as such for consistency, so
            # they are not sorted here; at most `batch_limit` chunks are taken
            # from each split
            cslice = it.islice(chunks(keys, self.batch_size), self.batch_limit)
            for chunk in cslice:
                chunk = tuple(chunk)
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug(f'chunked size: {len(chunk)}')
                dp_set = DataPointIDSet(str(batch_id), chunk, split, tc_seed)
                psets.append(dp_set)
                batch_id += 1
        logger.info(f'created {len(psets)} each set limited with ' +
                    f'{self.batch_limit} with batch_limit={self.batch_limit}')
        return psets

    def _get_keys_by_split(self) -> Dict[str, Tuple[str]]:
        """Return the batch IDs of each split, keyed by split name."""
        by_batch = collections.defaultdict(list)
        for dps in self.batch_data_point_sets:
            by_batch[dps.split_name].append(dps.batch_id)
        return {k: tuple(v) for k, v in by_batch.items()}

    def _create_data(self) -> List[DataPointIDSet]:
        """Data created for the sub processes are the first N data point ID
        sets.

        """
        return self.batch_data_point_sets

    def populate_batch_feature_mapping(self, batch: Batch):
        """Add batch feature mappings to a batch instance."""
        if self.batch_feature_mappings is not None:
            batch.batch_feature_mappings = self.batch_feature_mappings

    def create_batch(self, points: Tuple[DataPoint], split_name: str = None,
                     batch_id: str = None):
        """Create a new batch instance with data points, which happens when
        primed.

        :param points: the data points of the batch

        :param split_name: the name of the split the batch belongs to

        :param batch_id: the ID of the batch

        :return: a new instance of :obj:`batch_type`

        """
        bcls: Type[Batch] = self.batch_type
        batch: Batch = bcls(self, batch_id, split_name, points)
        self.populate_batch_feature_mapping(batch)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'created batch: {batch}')
        return batch

    def _process(self, chunk: List[DataPointIDSet]) -> \
            Iterable[Tuple[str, Any]]:
        """Create the batches by creating the set of data points for each
        :class:`.DataPointIDSet` instance.  When the subordinate stash dumps
        the batch (specifically a subclass of :class:`.Batch`), the overridden
        pickle logic is used to *detach* the batch by encoding all data in to
        :class:`.FeatureContext` instances.

        :param chunk: the data point ID sets, each of which becomes a batch

        :return: tuples of batch ID and the batch created for it

        """
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'{self.name}: processing: {len(chunk)} data points')
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'chunk data points: {chunk}')
        # nothing to create, and no first element to take the seed from
        if len(chunk) == 0:
            return
        tseed = chunk[0].torch_seed_context
        dpcls: Type[DataPoint] = self.data_point_type
        cont = self.split_stash_container
        if tseed is not None:
            # use the seed captured when the ID sets were created
            TorchConfig.set_random_seed(
                tseed['seed'], tseed['disable_cudnn'], False)
        dset: DataPointIDSet
        for dset in chunk:
            batch_id: str = dset.batch_id
            points: Tuple[DataPoint] = tuple(
                map(lambda dpid: dpcls(dpid, self, cont[dpid]),
                    dset.data_point_ids))
            batch: Batch = self.create_batch(points, dset.split_name, batch_id)
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'created batch: {batch}')
            yield (batch_id, batch)

    def _get_data_points_for_batch(self, batch: Any) -> Tuple[Any]:
        """Return the data points that were used to create ``batch``.

        """
        dpcls = self.data_point_type
        cont = self.split_stash_container
        return tuple(map(lambda dpid: dpcls(dpid, self, cont[dpid]),
                         batch.data_point_ids))

    def load(self, name: str):
        """Load the batch with ID ``name`` and reattach this stash and the
        batch feature mappings to it when missing.

        :param name: the ID of the batch to load

        """
        # NOTE(review): the timing message refers to ``obj.split_name``;
        # confirm it is safe when the super class returns ``None``
        with time('loaded batch {name} ({obj.split_name})'):
            obj = super().load(name)
        # add back the container of the batch to reconstitute the original
        # features and use the CUDA for tensor device transforms
        if obj is not None:
            if not hasattr(obj, 'batch_stash'):
                obj.batch_stash = self
            if (not hasattr(obj, 'batch_feature_mappings') or
               obj.batch_feature_mappings is None):
                self.populate_batch_feature_mapping(obj)
        return obj

    def _prime_vectorizers(self):
        """Prime every vectorizer that is a ``Primeable``."""
        vec_mng_set: FeatureVectorizerManagerSet = self.vectorizer_manager_set
        vecs = map(lambda v: v.values(), vec_mng_set.values())
        for vec in chain.from_iterable(vecs):
            if isinstance(vec, Primeable):
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug(f'priming {vec}')
                vec.prime()

    def prime(self):
        """Create the data point ID sets, prime the vectorizers and then the
        super class.

        :raises DeepLearnError: if this instance is already priming

        """
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'priming {self.__class__}, is child: ' +
                         f'{self.is_child}, currently priming: {self.priming}')
        if self.priming:
            raise DeepLearnError('Already priming')
        self.priming = True
        try:
            self.batch_data_point_sets
            self._prime_vectorizers()
            super().prime()
        finally:
            self.priming = False

    def deallocate(self):
        """Deallocate the data point ID sets, the delegate, the split stash
        container, the vectorizer managers and then this instance.

        """
        self._batch_data_point_sets.deallocate()
        # the delegate might be the container, which is deallocated next
        if self.delegate is not self.split_stash_container:
            self._try_deallocate(self.delegate)
        self._try_deallocate(self.split_stash_container)
        self.vectorizer_manager_set.deallocate()
        super().deallocate()

    def _from_dictable(self, *args, **kwargs):
        """Return the dictionary of the super class without the entries whose
        keys start with an underscore.

        """
        # avoid long Writable.write output
        dct = super()._from_dictable(*args, **kwargs)
        rms = tuple(filter(lambda k: k.startswith('_'), dct.keys()))
        for k in rms:
            del dct[k]
        return dct

    def clear(self):
        """Clear the batch, batch data point sets."""
        logger.debug('clearing')
        super().clear()
        self._batch_data_point_sets.clear()

    def clear_all(self):
        """Clear the batch, batch data point sets, and the source data
        (:obj:`split_stash_container`).

        """
        self.clear()
        self.split_stash_container.clear()
class StratifiedStashSplitKeyContainer(StashSplitKeyContainer):
    """Like :class:`.StashSplitKeyContainer` but data is stratified by a label
    (:obj:`partition_attr`) across each split.

    """
    partition_attr: str = field(default=None)
    """The label used to partition the strata across each split"""

    stratified_write: bool = field(default=True)
    """Whether or not to include the stratified counts when writing with
    :meth:`write`.

    """
    split_labels_path: Path = field(default=None)
    """If provided, the path is a pickled cache of
    :obj:`stratified_split_labels`.

    """
    def __post_init__(self):
        """Validate :obj:`partition_attr` and create the holder of the split
        labels.

        :raises DatasetError: if :obj:`partition_attr` is ``None``

        """
        super().__post_init__()
        if self.partition_attr is None:
            raise DatasetError("Missing 'partition_attr' field")
        dfpath = self.split_labels_path
        if dfpath is None:
            dfpath = '_strat_split_labels'
        self._strat_split_labels = PersistedWork(dfpath, self, mkdir=True)

    def _create_splits(self) -> Dict[str, Tuple[str]]:
        """Create the splits so that each label of :obj:`partition_attr` is
        distributed across the splits by the proportions in ``distribution``.

        For each label, every split but the first of ``distribution`` takes
        the ceiling of its proportion of that label's keys, and the first
        split takes whatever keys remain.

        :return: the keys of each split, keyed by split name

        """
        dist_keys: Sequence[str] = self.distribution.keys()
        # the first split is processed last since it takes the remainder
        dist_last: str = next(iter(dist_keys))
        # keep the order of the distribution (rather than using a set) so the
        # assignment does not vary with string hash randomization
        dists: List[str] = [d for d in dist_keys if d != dist_last]
        rows = [(k, getattr(v, self.partition_attr))
                for k, v in self.stash.items()]
        df = pd.DataFrame(rows, columns=['key', self.partition_attr])
        lab_splits: Dict[str, Set[str]] = collections.defaultdict(set)
        for _, dfg in df.groupby(self.partition_attr):
            keys: List[str] = dfg['key'].to_list()
            if self.shuffle:
                random.shuffle(keys)
            count = len(keys)
            for dist in dists:
                prop = self.distribution[dist]
                n_samples = math.ceil(float(count) * prop)
                lab_splits[dist].update(keys[:n_samples])
                keys = keys[n_samples:]
            lab_splits[dist_last].update(keys)
        # sanity check: every key is in exactly one split
        assert sum(map(len, lab_splits.values())) == len(df)
        assert reduce(lambda a, b: a | b, lab_splits.values()) == \
            set(df['key'].tolist())
        shuf_splits = {}
        for split_name, keys in lab_splits.items():
            if self.shuffle:
                keys = list(keys)
                random.shuffle(keys)
            shuf_splits[split_name] = tuple(keys)
        return shuf_splits

    def _count_proportions_by_split(self) -> Dict[str, Dict[str, int]]:
        """Return the number of items of each label for each split, keyed by
        split name and then by label.

        """
        lab_counts = {}
        kbs = self.keys_by_split
        for split_name in sorted(kbs.keys()):
            counts = collections.defaultdict(int)
            for k in kbs[split_name]:
                lab = getattr(self.stash[k], self.partition_attr)
                counts[lab] += 1
            lab_counts[split_name] = counts
        return lab_counts

    @property
    @persisted('_strat_split_labels')
    def stratified_split_labels(self) -> pd.DataFrame:
        """A dataframe with all keys, their respective labels and split.

        """
        kbs = self.keys_by_split
        rows = []
        for split_name in sorted(kbs.keys()):
            for k in kbs[split_name]:
                lab = getattr(self.stash[k], self.partition_attr)
                rows.append((split_name, k, lab))
        return pd.DataFrame(rows, columns='split_name id label'.split())

    def clear(self):
        """Clear the super class and the cached split labels."""
        super().clear()
        self._strat_split_labels.clear()

    @property
    def stratified_count_dataframe(self) -> pd.DataFrame:
        """A count summarization of :obj:`stratified_split_labels`.

        """
        df = self.stratified_split_labels
        df = df.groupby('split_name label'.split()).size().\
            reset_index(name='count')
        df['proportion'] = df['count'] / df['count'].sum()
        df = df.sort_values('split_name label'.split()).reset_index(drop=True)
        return df

    def _fmt_prop_by_split(self) -> Dict[str, Dict[str, str]]:
        """Return the percentage of the total count of each label in each
        split as a formatted string, keyed by split name and then by label.

        """
        df = self.stratified_count_dataframe
        tot = df['count'].sum()
        dsets: Dict[str, Dict[str, str]] = collections.OrderedDict()
        for split_name, dfg in df.groupby('split_name'):
            # format from the group itself rather than adding a column to
            # what is a slice of the summary dataframe
            fmt = dfg['count'].apply(lambda x: f'{x/tot*100:.2f}%')
            dsets[split_name] = dict(zip(dfg['label'], fmt))
        return dsets

    def write(self, depth: int = 0, writer: TextIOBase = sys.stdout):
        """Write the stratified proportions and the total number of items when
        :obj:`stratified_write` is set, and otherwise what the super class
        writes.

        :param depth: the indentation depth

        :param writer: where the output is written

        """
        if self.stratified_write:
            lab_counts: Dict[str, Dict[str, str]] = self._fmt_prop_by_split()
            self._write_dict(lab_counts, depth, writer)
            self._write_line(f'Total: {len(self.stash)}', depth, writer)
        else:
            super().write(depth, writer)
class ResultAnalyzer(object):
    """Load results from a previous run of the :class:`ModelExecutor` and a
    more recent run.  This run is usually a currently running model to compare
    the results during training.  This might provide meaningful information
    such as whether to early stop training.

    """
    executor: ModelExecutor = field()
    """The executor (not necessarily the running executor) that will load the
    results if not already loaded.

    """
    previous_results_key: str = field()
    """The key given to retrieve the previous results with
    :class:`ModelResultManager`.

    """
    cache_previous_results: bool = field()
    """If ``True``, globally cache the previous results to avoid having to
    reload each time.

    """
    def __post_init__(self):
        """Create the holder of the previous results."""
        self._previous_results = PersistedWork(
            '_previous_results', self,
            cache_global=self.cache_previous_results)

    def clear(self):
        """Clear the previous results, if cached.

        """
        self._previous_results.clear()

    @property
    @persisted('_previous_results')
    def previous_results(self) -> ModelResult:
        """Return the previous results (see class docs).

        :raises ModelError: if the executor has no result manager

        """
        rm: ModelResultManager = self.executor.result_manager
        if rm is None:
            # raise rather than index in to the exception instance
            raise ModelError('No result manager available')
        return rm[self.previous_results_key]

    @property
    def current_results(self) -> ModelResult:
        """Return the current results (see class docs), which are first loaded
        by the executor if it has none.

        """
        if self.executor.model_result is None:
            self.executor.load()
        return self.executor.model_result

    @property
    def comparison(self) -> DataComparison:
        """Load the results data and create a comparison instance ready to
        write or jsonify.

        The comparison has a row for each epoch that has a validation loss in
        both the previous and the current results.

        """
        prev, cur = self.previous_results, self.current_results
        prev_losses = prev.validation.losses
        cur_losses = cur.validation.losses
        # the previous run might have fewer epochs than the current one, so
        # compare only the epochs both have to keep the columns equal length
        n_epochs = min(len(prev_losses), len(cur_losses))
        df = pd.DataFrame({'epoch': range(n_epochs),
                           'previous': prev_losses[:n_epochs],
                           'current': cur_losses[:n_epochs]})
        df['improvement'] = df['previous'] - df['current']
        return DataComparison(self.previous_results_key, prev, cur, df)
class ModelFacade(PersistableContainer, Writable):
    """This class provides easy to use client entry points to the model
    executor, which trains, validates, tests, saves and loads the model.

    More common attributes, such as the learning rate and number of epochs, are
    properties that dispatch to :py:obj:`executor`.  For the others, go
    directly to the property.

    :see: :class:`zensols.deeplearn.domain.ModelSettings`

    """
    SINGLETONS = {}

    config: Configurable = field()
    """The configuration used to create the facade, and used to create a new
    configuration factory to load models.

    """
    config_factory: InitVar[ConfigFactory] = field(default=None)
    """The configuration factory used to create this facade, or ``None`` if no
    factory was used.

    """
    progress_bar: bool = field(default=True)
    """Create text/ASCII based progress bar if ``True``."""

    progress_bar_cols: Union[str, int] = field(default='term')
    """The number of console columns to use for the text/ASCII based progress
    bar.  If the value is ``term``, then use the terminal width.

    """
    executor_name: str = field(default='executor')
    """The configuration entry name for the executor, which defaults to
    ``executor``.

    """
    writer: TextIOBase = field(default=sys.stdout)
    """The writer to use in methods like :meth:`train`, and :meth:`test` for
    writing performance metrics results and predictions or ``None`` to not
    output them.

    """
    predictions_datafrmae_factory_class: Type[PredictionsDataFrameFactory] = \
        field(default=PredictionsDataFrameFactory)
    """The factory class used to create predictions.

    :see: :meth:`get_predictions_factory`

    """
    def __post_init__(self, config_factory: ConfigFactory):
        super().__init__()
        self._init_config_factory(config_factory)
        self._config_factory = PersistedWork('_config_factory', self)
        self._executor = PersistedWork('_executor', self)
        self.debuged = False
        if self.progress_bar_cols == 'term':
            try:
                term_width = os.get_terminal_size()[0]
                # make space for embedded validation loss messages
                self.progress_bar_cols = term_width - 5
            except OSError:
                logger.debug('unable to automatically determine ' +
                             'terminal width--skipping')
                self.progress_bar_cols = None

    @classmethod
    def get_singleton(cls, *args, **kwargs) -> Any:
        """Return the instance cached for the invoking class, creating it with
        ``args`` and ``kwargs`` on the first call.  The arguments are ignored
        when an instance is already cached.

        """
        key = str(cls)
        inst = cls.SINGLETONS.get(key)
        if inst is None:
            inst = cls(*args, **kwargs)
            cls.SINGLETONS[key] = inst
        return inst

    def _init_config_factory(self, config_factory: ConfigFactory):
        """Record the parameters (``reload``, ``shared`` and
        ``reload_pattern``) of the given factory so :obj:`config_factory` can
        create an equivalent one.  No parameters are kept when the factory is
        not an :class:`ImportConfigFactory`.

        """
        if isinstance(config_factory, ImportConfigFactory):
            params = config_factory.__dict__
            keeps = set('reload shared reload_pattern'.split())
            params = {k: params[k] for k in set(params.keys()) & keeps}
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'import config factory params: {params}')
            self._config_factory_params = params
        else:
            self._config_factory_params = {}

    def _create_executor(self) -> ModelExecutor:
        """Create a new instance of an executor.  Used by :obj:`executor`.

        """
        logger.info('creating new executor')
        executor = self.config_factory(
            self.executor_name,
            progress_bar=self.progress_bar,
            progress_bar_cols=self.progress_bar_cols)
        return executor

    @property
    @persisted('_config_factory')
    def config_factory(self):
        """The configuration factory used to create facades.

        """
        return ImportConfigFactory(self.config, **self._config_factory_params)

    @property
    @persisted('_executor')
    def executor(self) -> ModelExecutor:
        """A cached instance of the executor tied to the instance of this
        class.

        """
        return self._create_executor()

    @property
    def net_settings(self) -> NetworkSettings:
        """Return the executor's network settings.

        """
        return self.executor.net_settings

    @property
    def model_settings(self) -> ModelSettings:
        """Return the executor's model settings.

        """
        return self.executor.model_settings

    @property
    def result_manager(self) -> ModelResultManager:
        """Return the executor's result manager.

        :raises ModelError: if the executor has no result manager

        """
        rm: ModelResultManager = self.executor.result_manager
        if rm is None:
            # the exception was previously assigned and returned rather than
            # raised, handing callers a ``ModelError`` instance to call into
            raise ModelError('No result manager available')
        return rm

    @property
    def feature_stash(self) -> Stash:
        """The stash used to generate the feature, which is not to be confused
        with the batch source stash ``batch_stash``.

        """
        return self.executor.feature_stash

    @property
    def batch_stash(self) -> BatchStash:
        """The stash used to encode and decode batches by the executor.

        """
        return self.executor.batch_stash

    @property
    def dataset_stash(self) -> DatasetSplitStash:
        """The stash used to encode and decode batches split by dataset.

        """
        return self.executor.dataset_stash

    @property
    def vectorizer_manager_set(self) -> FeatureVectorizerManagerSet:
        """Return the vectorizer manager set used for the facade.  This is
        taken from the executor's batch stash.

        """
        return self.batch_stash.vectorizer_manager_set

    @property
    def batch_metadata(self) -> BatchMetadata:
        """Return the batch metadata used on the executor.

        :see: :class:`zensols.deepnlp.model.module.EmbeddingNetworkSettings`

        """
        return self.batch_stash.batch_metadata

    @property
    def label_attribute_name(self):
        """Get the label attribute name, or ``None`` if there is no batch
        metadata.

        """
        bmeta = self.batch_metadata
        if bmeta is not None:
            return bmeta.mapping.label_attribute_name

    def _notify(self, event: str, context: Any = None):
        """Notify observers of events from this class.

        """
        self.model_settings.observer_manager.notify(event, self, context)

    def remove_metadata_mapping_field(self, attr: str) -> bool:
        """Remove a field by attribute if it exists across all metadata
        mappings.

        This is useful when a very expensive vectorizer slows down tasks, such
        as prediction, on a single run of a program.  For this use case,
        override :meth:`predict` to call this method before calling the super
        ``predict`` method.

        :param attr: the name of the field's attribute to remove

        :return: ``True`` if the field was removed from at least one mapping,
                 ``False`` otherwise

        """
        removed = False
        meta: BatchMetadata = self.batch_metadata
        mapping: BatchFeatureMapping
        for mapping in meta.mapping.manager_mappings:
            # call ``remove_field`` first so the ``or`` can not short circuit
            # it away once an earlier mapping has already removed the field
            removed = mapping.remove_field(attr) or removed
        return removed

    @property
    def dropout(self) -> float:
        """The dropout for the entire network.

        """
        return self.net_settings.dropout

    @dropout.setter
    def dropout(self, dropout: float):
        """The dropout for the entire network.

        """
        self.net_settings.dropout = dropout

    @property
    def epochs(self) -> int:
        """The number of epochs for training and validation.

        """
        return self.model_settings.epochs

    @epochs.setter
    def epochs(self, n_epochs: int):
        """The number of epochs for training and validation.

        """
        self.model_settings.epochs = n_epochs

    @property
    def learning_rate(self) -> float:
        """The learning rate to set on the optimizer.

        """
        return self.model_settings.learning_rate

    @learning_rate.setter
    def learning_rate(self, learning_rate: float):
        """The learning rate to set on the optimizer.

        """
        self.executor.model_settings.learning_rate = learning_rate

    @property
    def cache_batches(self) -> bool:
        """The ``cache_batches`` model setting.

        """
        return self.model_settings.cache_batches

    @cache_batches.setter
    def cache_batches(self, cache_batches: bool):
        """Set the ``cache_batches`` model setting, clearing this facade first
        when the value changes.

        """
        # if the caching strategy changed, be safe and deallocate and purge to
        # lazy recreate everything
        if self.model_settings.cache_batches != cache_batches:
            self.clear()
            self.model_settings.cache_batches = cache_batches

    def clear(self):
        """Clear out any cached executor and configuration factory after
        deallocating them.

        """
        if logger.isEnabledFor(logging.INFO):
            logger.info('clearing')
        executor = self.executor
        config_factory = self.config_factory
        executor.deallocate()
        config_factory.deallocate()
        self._executor.clear()
        self._config_factory.clear()

    def reload(self):
        """Clears all state and reloads the configuration.

        """
        self.clear()
        self.config.reload()

    def deallocate(self):
        super().deallocate()
        # drop the singleton entry (if any) for this class
        self.SINGLETONS.pop(str(self.__class__), None)

    @classmethod
    def load_from_path(cls, path: Path, *args, **kwargs) -> ModelFacade:
        """Construct a new facade from the data saved in a persisted model
        file.  This uses the :py:meth:`.ModelManager.load_from_path` to
        reconstruct the returned facade, which means some attributes are taken
        from default if not taken from ``*args`` or ``**kwargs``.

        Arguments:
           Passed through to the initializer of invoking class ``cls``.

        :return: a new instance of a :class:`.ModelFacade`

        :see: :meth:`.ModelManager.load_from_path`

        """
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'loading from facade from {path}')
        mm = ModelManager.load_from_path(path)
        if 'executor_name' not in kwargs:
            kwargs['executor_name'] = mm.model_executor_name
        executor = mm.load_executor()
        executor.model_settings.path = path
        mm.config_factory.deallocate()
        facade: ModelFacade = cls(executor.config, *args, **kwargs)
        # seed the caches so the facade uses the loaded executor and its
        # factory rather than lazily creating new ones
        facade._config_factory.set(executor.config_factory)
        facade._executor.set(executor)
        return facade

    def debug(self, debug_value: Union[bool, int] = True):
        """Debug the model by setting the configuration to debug mode and
        invoking a single forward pass.  Logging must be configured properly
        to get the output, which is typically just invoking
        :py:meth:`logging.basicConfig`.

        :param debug_value: ``True`` turns on executor debugging; if an
                            ``int``, the higher the value, the more the
                            logging

        """
        executor = self.executor
        self._configure_debug_logging()
        executor.debug = debug_value
        executor.progress_bar = False
        executor.model_settings.batch_limit = 1
        self.debuged = True
        executor.train()

    def persist_result(self):
        """Save the last recorded result during an :py:meth:`.Executor.train`
        or :py:meth:`.Executor.test` invocation to disk.  Nothing is saved
        when the executor has no result manager.

        """
        executor = self.executor
        # read the manager from the executor rather than
        # :obj:`result_manager`, which raises when there is none; this method
        # is a no-op in that case
        rmng: ModelResultManager = executor.result_manager
        if rmng is not None:
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'dumping model result: {executor.model_result}')
            rmng.dump(executor.model_result)

    def train(self, description: str = None) -> ModelResult:
        """Train and test or just debug the model depending on the
        configuration.

        :param description: a description used in the results, which is useful
                            when making incremental hyperparameter changes to
                            the model

        """
        executor = self.executor
        executor.reset()
        logger.info('training...')
        self._notify('train_start', description)
        with time('trained'):
            res = executor.train(description)
        self._notify('train_end', description)
        return res

    def test(self, description: str = None) -> ModelResult:
        """Load the model from disk and test it.

        :param description: a description used in the results

        :raises ModelError: if :meth:`debug` was previously invoked

        """
        if self.debuged:
            raise ModelError('Testing is not allowed in debug mode')
        executor = self.executor
        executor.load()
        logger.info('testing...')
        self._notify('test_start', description)
        with time('tested'):
            res = executor.test(description)
        if self.writer is not None:
            res.write(writer=self.writer)
        self._notify('test_end', description)
        return res

    def train_production(self, description: str = None) -> ModelResult:
        """Train on the training and test data sets, then test

        :param description: a description used in the results, which is useful
                            when making incremental hyperparameter changes to
                            the model

        """
        executor = self.executor
        executor.reset()
        if self.writer is not None:
            executor.write(writer=self.writer)
        logger.info('training...')
        self._notify('train_production_start', description)
        with time('trained'):
            res = executor.train_production(description)
        self._notify('train_production_end', description)
        return res

    def predict(self, datas: Iterable[Any]) -> Any:
        """Make ad-hoc predictions on batches without labels, and return the
        results.

        :param datas: the data predict on, each as a separate element as a
                      data point in a batch

        :raises ModelError: if the model settings have no prediction mapper
                            name configured

        """
        executor: ModelExecutor = self.executor
        ms: ModelSettings = self.model_settings
        if ms.prediction_mapper_name is None:
            raise ModelError(
                f'The model settings ({ms.name}) is not configured to create ' +
                "prediction batches: no set 'prediction_mapper'")
        pm: PredictionMapper = self.config_factory.new_instance(
            ms.prediction_mapper_name, datas, self.batch_stash)
        self._notify('predict_start')
        try:
            batches: List[Batch] = pm.batches
            if not executor.model_exists:
                executor.load()
            logger.info('predicting...')
            with time('predicted'):
                res: ModelResult = executor.predict(batches)
            eres: EpochResult = res.results[0]
            ret: Any = pm.map_results(eres)
        finally:
            self._notify('predict_end')
            pm.deallocate()
        return ret

    def stop_training(self):
        """Early stop training if the model is currently training.  This
        invokes the :meth:`.TrainManager.stop`, communicates to the training
        process to stop on the next check.

        :return: ``True`` if the application is configured to early stop and
                 the signal has not already been given

        """
        self._notify('stop_training')
        return self.executor.train_manager.stop()

    @property
    def last_result(self) -> ModelResult:
        """The last recorded result during an :meth:`.ModelExecutor.train` or
        :meth:`.ModelExecutor.test` invocation is used.  When the executor has
        none, the result is loaded with the result manager.

        :raises ModelError: if no result is available

        """
        res = self.executor.model_result
        if res is None:
            rm: ModelResultManager = self.result_manager
            res = rm.load()
            if res is None:
                raise ModelError('No results found')
        return res

    def write_result(self, depth: int = 0, writer: TextIOBase = sys.stdout,
                     include_settings: bool = False,
                     include_converged: bool = False,
                     include_config: bool = False):
        """Load the last set of results from the file system and print them
        out.  The result to print is taken from :obj:`last_result`

        :param depth: the number of indentation levels

        :param writer: the data sink

        :param include_settings: whether or not to include model and network
                                 settings in the output

        :param include_converged: passed through to the result's ``write``
                                  method

        :param include_config: whether or not to include the configuration in
                               the output

        """
        if logger.isEnabledFor(logging.INFO):
            logger.info('load previous results')
        res = self.last_result
        res.write(depth, writer, include_settings=include_settings,
                  include_converged=include_converged,
                  include_config=include_config)

    def plot_result(self, result: ModelResult = None, save: bool = False,
                    show: bool = False) -> ModelResult:
        """Plot results and optionally save and show them.  If this is called
        in a Jupyter notebook, the plot will be rendered in a cell.

        :param result: the result to plot, or if ``None``, use
                       :py:meth:`last_result`

        :param save: if ``True``, save the plot to the results directory with
                     the same naming as the last data results

        :param show: if ``True``, invoke ``matplotlib``'s ``show`` function to
                     visualize in a non-Jupyter environment

        :return: the result used to graph, which comes from the executor when
                 none is given to the invocation

        """
        result = self.last_result if result is None else result
        grapher = self.executor.result_manager.get_grapher()
        grapher.plot([result])
        if save:
            grapher.save()
        if show:
            grapher.show()
        return result

    def get_predictions_factory(self, column_names: List[str] = None,
                                transform: Callable[[DataPoint], tuple] = None,
                                batch_limit: int = sys.maxsize,
                                name: str = None) \
            -> PredictionsDataFrameFactory:
        """Generate a predictions factory from the test data set.

        :param column_names: the list of string column names for each data item
                             the list returned from ``data_point_transform`` to
                             be added to the results for each label/prediction

        :param transform:

            a function that returns a tuple, each with an element respective of
            ``column_names`` to be added to the results for each
            label/prediction; if ``None`` (the default), ``str`` used (see the
            `Iris Jupyter Notebook
            <https://github.com/plandes/deeplearn/blob/master/notebook/iris.ipynb>`_
            example)

        :param batch_limit: the max number of batches of results to output

        :param name: the name/ID (name of the file sans extension in the
                     results directory) of the previously archived saved
                     results to fetch or ``None`` to get the last result

        :raises ModelError: if there are no (test) results

        """
        rm: ModelResultManager = self.result_manager
        res: ModelResult
        if name is None:
            res = self.last_result
            key: str = rm.get_last_key(False)
        else:
            res = rm.results_stash[name].model_result
            key: str = name
        if res is None:
            raise ModelError(f'No test results found: {name}')
        if not res.test.contains_results:
            raise ModelError('No test results found')
        path: Path = rm.key_to_path(key)
        return self.predictions_datafrmae_factory_class(
            path, res, self.batch_stash,
            column_names, transform, batch_limit)

    def get_predictions(self, *args, **kwargs) -> pd.DataFrame:
        """Generate a Pandas dataframe containing all predictions from the
        test data set.  This method is meant to be overridden by application
        specific facades to customize prediction output.

        :see: :meth:`get_predictions_factory`

        :param args: arguments passed to :meth:`get_predictions_factory`

        :param kwargs: arguments passed to :meth:`get_predictions_factory`

        """
        df_fac = self.get_predictions_factory(*args, **kwargs)
        return df_fac.dataframe

    def write_predictions(self, lines: int = 10):
        """Print the predictions made during the test phase of the model
        execution to :obj:`writer`.

        :param lines: the number of lines of the predictions data frame to be
                      printed

        """
        preds = self.get_predictions()
        print(preds.head(lines), file=self.writer)

    def get_result_analyzer(self, key: str = None,
                            cache_previous_results: bool = False) \
            -> ResultAnalyzer:
        """Return a results analyzer for comparing in flight training
        progress.

        :param key: the key of the previous results to compare with, or
                    ``None`` to use the result manager's last key

        :param cache_previous_results: passed to the :class:`.ResultAnalyzer`

        """
        rm: ModelResultManager = self.result_manager
        if key is None:
            key = rm.get_last_key()
        return ResultAnalyzer(self.executor, key, cache_previous_results)

    @property
    def class_explorer(self) -> FacadeClassExplorer:
        """A new facade explorer created with
        :meth:`_create_facade_explorer`.

        """
        return self._create_facade_explorer()

    def _create_facade_explorer(self) -> FacadeClassExplorer:
        """Return a facade explorer used to print the facade's object graph.

        """
        return FacadeClassExplorer()

    def write(self, depth: int = 0, writer: TextIOBase = None,
              include_executor: bool = True, include_metadata: bool = True,
              include_settings: bool = True, include_model: bool = True,
              include_config: bool = False,
              include_object_graph: bool = False):
        """Write the selected parts of this facade.

        :param depth: the number of indentation levels

        :param writer: the data sink; defaults to :obj:`writer`, and then to
                       standard out if that is also ``None``

        :param include_executor: whether to write the executor

        :param include_metadata: whether to write the batch metadata, which is
                                 skipped when not available

        :param include_settings: passed to the executor's ``write`` method

        :param include_model: passed to the executor's ``write`` method

        :param include_config: whether to write the configuration

        :param include_object_graph: whether to write the object graph
                                     produced by the facade explorer

        """
        writer = self.writer if writer is None else writer
        writer = sys.stdout if writer is None else writer
        bmeta = None
        try:
            bmeta = self.batch_metadata
        except AttributeError:
            # deliberate best effort: metadata is omitted if unavailable
            pass
        if include_executor:
            self._write_line(f'{self.executor.name}:', depth, writer)
            self.executor.write(depth + 1, writer,
                                include_settings=include_settings,
                                include_model=include_model)
        if include_metadata and bmeta is not None:
            self._write_line('metadata:', depth, writer)
            bmeta.write(depth + 1, writer)
        if include_object_graph:
            self._write_line('graph:', depth, writer)
            ce = self._create_facade_explorer()
            ce.write(self, depth=depth + 1, writer=writer)
        if include_config:
            self._write_line('config:', depth, writer)
            self.config.write(depth + 1, writer)

    def _deallocate_config_instance(self, inst: Any):
        """Clear ``inst`` from the configuration factory (when it is an
        :class:`ImportConfigFactory`) and deallocate the outcome if it is a
        :class:`Deallocatable`.

        """
        if isinstance(self.config_factory, ImportConfigFactory):
            inst = self.config_factory.clear_instance(inst)
        dealloc = isinstance(inst, Deallocatable)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'deallocate {inst}: {type(inst)}: {dealloc}')
        if dealloc:
            inst.deallocate()

    def _configure_debug_logging(self):
        """When debugging the model, configure the logging system for output.
        The correct loggers need to be set to debug mode to print the model
        debugging information such as matrix shapes.

        """
        for name in ['zensols.deeplearn.model', __name__]:
            logging.getLogger(name).setLevel(logging.DEBUG)

    def _configure_cli_logging(self, info_loggers: List[str],
                               debug_loggers: List[str]):
        """Add the logger names this facade wants enabled.  Subclasses can
        override to add their own.

        :param info_loggers: the list to extend with names of loggers to set
                             to the info level

        :param debug_loggers: the list to extend with names of loggers to set
                              to the debug level; not modified here

        """
        info_loggers.extend([
            # multi-process (i.e. batch creation)
            'zensols.multi.stash',
            'zensols.deeplearn.batch.multi',
            # validation/training loss messages
            'zensols.deeplearn.model.executor.status',
            __name__
        ])
        if not self.progress_bar:
            info_loggers.extend([
                # load messages
                'zensols.deeplearn.batch.stash',
                # save results messages
                'zensols.deeplearn.result',
                # validation/training loss messages
                'zensols.deeplearn.model.executor.progress',
                # model save/load
                'zensols.deeplearn.model.manager',
                # early stop messages
                'zensols.deeplearn.model.trainmng',
                # performance metrics formatting
                'zensols.deeplearn.model.format',
                # model save messages
                'zensols.deeplearn.result.manager',
                # observer module API messages
                'zensols.deeplearn.observer.status',
                # CLI interface
                'zensols.deeplearn.cli.app'
            ])

    @staticmethod
    def configure_default_cli_logging(log_level: int = logging.WARNING):
        """Configure the logging system with the defaults.

        :param log_level: the level given to :func:`logging.basicConfig`

        """
        fmt = '%(asctime)s[%(levelname)s]%(name)s: %(message)s'
        logging.basicConfig(format=fmt, level=log_level)

    def configure_cli_logging(self, log_level: int = None):
        """Configure command line (or Python REPL) debugging.  Each facade can
        turn on name spaces that make sense as useful information output for
        long running training/testing iterations.

        This calls :meth:`_configure_cli_logging` to collect the names of
        loggers at various levels.

        :param log_level: if not ``None``, first configure the logging system
                          with :meth:`configure_default_cli_logging`

        """
        info = []
        debug = []
        if log_level is not None:
            self.configure_default_cli_logging(log_level)
        self._configure_cli_logging(info, debug)
        for name in info:
            logging.getLogger(name).setLevel(logging.INFO)
        for name in debug:
            logging.getLogger(name).setLevel(logging.DEBUG)

    def configure_jupyter(self, log_level: int = logging.WARNING,
                          progress_bar_cols: int = 120):
        """Configures logging and other configuration related to a Jupyter
        notebook.  This is just like :py:meth:`configure_cli_logging`, but
        adjusts logging for what is conducive for reporting in Jupyter cells.

        :param log_level: the default logging level for the logging system

        :param progress_bar_cols: the number of columns to use for the
                                  progress bar

        """
        self.configure_cli_logging(log_level)
        for name in [
                # turn off loading messages
                'zensols.deeplearn.batch.stash',
                # turn off model save messages
                'zensols.deeplearn.result.manager']:
            logging.getLogger(name).setLevel(logging.WARNING)
        # number of columns for the progress bar
        self.executor.progress_bar_cols = progress_bar_cols
        # turn off console output (non-logging)
        self.writer = None

    @staticmethod
    def get_encode_sparse_matrices() -> bool:
        """Return whether or not sparse matrices are encoded.

        :see: :meth:`set_encode_sparse_matrices`

        """
        return SparseTensorFeatureContext.USE_SPARSE

    @staticmethod
    def set_encode_sparse_matrices(use_sparse: bool = False):
        """If called before batches are created, encode all tensors that would
        be encoded as dense rather than sparse when ``use_sparse`` is
        ``False``.  Otherwise, tensors will be encoded as sparse where it makes
        sense on a per vectorizer basis.

        """
        SparseTensorFeatureContext.USE_SPARSE = use_sparse
class FacadeApplication(Deallocatable):
    """Base class for applications that use :class:`.ModelFacade`.

    """
    CLI_META = {'mnemonic_excludes': {'get_cached_facade', 'create_facade',
                                      'deallocate', 'clear_cached_facade'},
                'option_overrides': {'model_path': {'long_name': 'model',
                                                    'short_name': None}}}
    """Tell the command line app API to ignore subclass and client specific use
    case methods.

    """
    config: Configurable = field()
    """The config used to create facade instances."""

    facade_name: str = field(default='facade')
    """The client facade."""

    # simply copy this field and documentation to the implementation class to
    # add model path location (for those subclasses that don't have the
    # ``CLASS_INSPECTOR`` class level attribute set (see
    # :obj:`~zensols.util.introspect.inspect.ClassInspector.INSPECT_META`);
    # this can also be set as a parameter such as with
    # :meth:`.FacadeModelApplication.test`
    model_path: Path = field(default=None)
    """The path to the model or use the last trained model if not provided.

    """
    config_factory_args: Dict[str, Any] = field(default_factory=dict)
    """The arguments given to the :class:`~zensols.config.ImportConfigFactory`,
    which could be useful for reloading all classes while debugging.

    """
    config_overwrites: Configurable = field(default=None)
    """A configurable that clobbers any configuration in :obj:`config` for
    those sections/options set.

    """
    def __post_init__(self):
        # everything appended here is deallocated in :meth:`deallocate`
        self.dealloc_resources = []
        self._cached_facade = PersistedWork('_cached_facade', self, True)

    def _enable_cli_logging(self, facade: ModelFacade):
        """Turn off the facade's progress bar and have it configure command
        line logging.

        """
        facade.progress_bar = False
        facade.configure_cli_logging()

    def create_facade(self) -> ModelFacade:
        """Create a new instance of the facade."""
        # a new (non-shared) facade instance is needed on every call since it
        # is deallocated once the application is done with it
        model_path = self.model_path
        config = self.config
        overwrites = self.config_overwrites
        if overwrites is not None:
            # merge into a copy so the application's own config is untouched
            config = cp.deepcopy(config)
            config.merge(overwrites)

        if model_path is not None:
            # a model path was given: the factory is only needed long enough
            # to resolve the facade class
            if logger.isEnabledFor(logging.INFO):
                logger.info(f'loading model from {model_path}')
            with dealloc(ImportConfigFactory(
                    config, **self.config_factory_args)) as factory:
                facade_class: Type[ModelFacade] = factory.get_class(
                    self.facade_name)
            loaded: ModelFacade = facade_class.load_from_path(model_path)
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'created facade: {type(loaded)} ' +
                             f'from path: {model_path}')
            self.dealloc_resources.append(loaded)
            return loaded

        # no model path: create the facade from the configuration and keep
        # the factory for later deallocation
        factory = ImportConfigFactory(config, **self.config_factory_args)
        created: ModelFacade = factory.instance(self.facade_name)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'created facade: {created}')
        self.dealloc_resources.extend((factory, created))
        return created

    @persisted('_cached_facade')
    def get_cached_facade(self, path: Path = None) -> ModelFacade:
        """Return a created facade that is cached in this application
        instance.

        :param path: not used by this implementation

        """
        return self.create_facade()

    def clear_cached_facade(self):
        """Clear any cached facade of this application instance, deallocating
        it first if one has been set.

        """
        cached = self._cached_facade
        if cached.is_set():
            cached().deallocate()
        cached.clear()

    def deallocate(self):
        super().deallocate()
        self._try_deallocate(self.dealloc_resources, recursive=True)
        self._cached_facade.deallocate()
def __post_init__(self):
    """Create the list of resources to deallocate and the persisted work
    that backs the cached facade.

    """
    self._cached_facade = PersistedWork('_cached_facade', self, True)
    self.dealloc_resources = list()