def _write_max_word_piece_token_length(self):
    """Run the transform and decode checks, then print the maximum word piece
    token length reported by a freshly created facade.

    """
    with dealloc(self.create_facade()) as pruned_facade:
        pruned_facade.remove_expensive_vectorizers()
    self._test_transform()
    self._test_decode()
    logger.info('calculatating word piece length on data set...')
    # this takes a while since it iterates through the corpus
    with dealloc(self.create_facade()) as corpus_facade:
        max_len = corpus_facade.get_max_word_piece_len()
        print(f'max word piece token length: {max_len}')
def print_information(self, info_item: InfoItem = None):
    """Print one section of the facade's information, selected by
    ``info_item``.

    The model name is printed first as a header.  When ``info_item`` is
    ``None`` the facade's own ``write`` method produces the output.

    :param info_item: what to print

    :raises DeepLearnError: if no writer is registered for ``info_item``

    """
    # see :class:`.FacadeApplicationFactory`
    if hasattr(self, '_no_op'):
        # instances carrying a ``_no_op`` attribute print nothing
        return

    def write_batch():
        # only the first two batches of the stash are written
        for batch in it.islice(facade.batch_stash.values(), 2):
            batch.write()

    with dealloc(self.create_facade()) as facade:
        print(f'{facade.model_settings.model_name}:')
        writers = {
            None: facade.write,
            InfoItem.meta: facade.batch_metadata.write,
            InfoItem.param: facade.executor.write_settings,
            InfoItem.model: facade.executor.write_model,
            InfoItem.config: facade.config.write,
            InfoItem.batch: write_batch,
        }
        if info_item not in writers:
            raise DeepLearnError(f'No such info item: {info_item}')
        writers[info_item]()
def create_facade(self) -> ModelFacade: """Create a new instance of the facade.""" # we must create a new (non-shared) instance of the facade since it # will get deallcated after complete. config = self.config model_path = self.model_path if self.config_overwrites is not None: config = cp.deepcopy(config) config.merge(self.config_overwrites) if model_path is None: cf = ImportConfigFactory(config, **self.config_factory_args) facade: ModelFacade = cf.instance(self.facade_name) if logger.isEnabledFor(logging.DEBUG): logger.debug(f'created facade: {facade}') self.dealloc_resources.extend((cf, facade)) else: if logger.isEnabledFor(logging.INFO): logger.info(f'loading model from {model_path}') with dealloc(ImportConfigFactory( config, **self.config_factory_args)) as cf: cls: Type[ModelFacade] = cf.get_class(self.facade_name) facade: ModelFacade = cls.load_from_path(model_path) if logger.isEnabledFor(logging.DEBUG): logger.debug(f'created facade: {type(facade)} ' + f'from path: {model_path}') self.dealloc_resources.append(facade) return facade
def train_test(self):
    """Train and test the model, then dump the results with a graph.

    """
    with dealloc(self.create_facade()) as model_facade:
        model_facade.train()
        model_facade.test()
        model_facade.persist_result()
def _test_transform(self):
    """Tokenize the parsed ``self.sent`` with the facade's trainable
    transformer embedding model, write the tokenized document and print the
    shape of the transformed tensor.

    """
    with dealloc(self.create_facade()) as facade:
        embed_model = facade.transformer_trainable_embedding_model
        parsed = facade.doc_parser.parse(self.sent)
        tokenized = embed_model.tokenize(parsed)
        tokenized.write()
        embedded: Tensor = embed_model.transform(tokenized)
        print(embedded.shape)
def _test_decode(self):
    """Encode a document built from the first three feature stash values with
    the ``syn`` language vectorizer, inside a ``loglevel`` context for
    ``zensols.deepnlp``.

    """
    with dealloc(self.create_facade()) as facade:
        first_three = it.islice(facade.feature_stash.values(), 3)
        doc = FeatureDocument(tuple(first_three))
        syn_vectorizer = facade.language_vectorizer_manager['syn']
        from zensols.util.log import loglevel
        with loglevel('zensols.deepnlp'):
            syn_vectorizer.encode(doc)
def train(self):
    """Train the model, then dump the results, including a graph of the
    train/validation loss.

    """
    with dealloc(self.create_facade()) as model_facade:
        model_facade.train()
        model_facade.persist_result()
def train_production(self):
    """Run production training, test the model, then dump the results with a
    graph.

    """
    with dealloc(self.create_facade()) as model_facade:
        model_facade.train_production()
        model_facade.test()
        model_facade.persist_result()
def debug(self, debug_value: int = None):
    """Debug the model.

    :param debug_value: the executor debugging level; ``True`` is passed to
                        the facade when not given

    """
    if debug_value is None:
        debug_value = True
    with dealloc(self.create_facade()) as model_facade:
        model_facade.debug(debug_value)
def majority_label_metrics(self, res_id: str = None):
    """Show majority label metrics of the test dataset using a previous
    result set.

    :param res_id: the result ID or use the last if not given

    """
    with dealloc(self.create_facade()) as model_facade:
        factory: PredictionsDataFrameFactory = \
            model_facade.get_predictions_factory(name=res_id)
        metrics = factory.majority_label_metrics
        metrics.write()
def result(self, res_id: str = None):
    """Show the last results.

    :param res_id: the result ID or use the last if not given

    """
    with dealloc(self.create_facade()) as model_facade:
        factory: PredictionsDataFrameFactory = \
            model_facade.get_predictions_factory(name=res_id)
        model_result = factory.result
        model_result.write()
def test(self, model_path: Path = None):
    """Test an existing model and dump the results of the test.

    :param model_path: the path to the model or use the last trained model
                       if not provided

    """
    # stored on the instance before the facade is created
    self.model_path = model_path
    with dealloc(self.create_facade()) as model_facade:
        model_facade.test()
def load():
    """Load the Iris model facade from ``target/iris/model``, test it, then
    write and plot the result.

    """
    from pathlib import Path

    Deallocatable.ALLOCATION_TRACKING = True
    model_dir = Path('target/iris/model')
    with dealloc(IrisModelFacade.load_from_path(model_dir)) as iris_facade:
        iris_facade.reload()
        iris_facade.writer = None
        test_result = iris_facade.test()
        test_result.write(include_converged=True)
        iris_facade.plot_result(save=True)
def compare_results(self, res_id_a: str, res_id_b: str):
    """Compare two previous archived result sets.

    A :class:`.ModelResultComparer` is created over the facade's result
    manager and the two result IDs, then its output is written.

    :param res_id_a: the first result ID to compare

    :param res_id_b: the second result ID to compare

    """
    with dealloc(self.create_facade()) as facade:
        # the result manager is the comparer's input, not a comparer itself
        rm = facade.result_manager
        diff: ModelResultComparer = ModelResultComparer(
            rm, res_id_a, res_id_b)
        diff.write()
def predict_text(self, text_input: str, verbose: bool = False):
    """Classify ad-hoc text and output the results.

    :param text_input: the sentence to classify or standard in if not given

    :param verbose: if given, print the long format version of the document

    """
    sentences = self._get_sentences(text_input)
    with dealloc(self.create_facade()) as facade:
        predicted: Tuple[FeatureDocument] = facade.predict(sentences)
        for doc in predicted:
            if not verbose:
                print(doc)
                continue
            doc.write()
def predict_text(self, text_input: str, verbose: bool = False):
    """Classify ad-hoc text and print each predicted label next to its token.

    :param text_input: the sentence to classify or standard in if not given

    :param verbose: if given, print the long format version of the document

    """
    sentences = self._get_sentences(text_input)
    with dealloc(self.create_facade()) as facade:
        prediction: Settings = facade.predict(sentences)
        docs: Tuple[FeatureDocument] = prediction.docs
        classes: Tuple[str] = prediction.classes
        # pair each document with its label sequence, then each label with
        # the token at the same position
        for doc_labels, doc in zip(classes, docs):
            for token_label, token in zip(doc_labels, doc.token_iter()):
                print(token_label, token)
def result_summary(self, out_file: Path = None,
                   include_validation: bool = False):
    """Create a summary of all archived results.

    :param out_file: the output path; ``result-summary.csv`` when not given

    :param include_validation: whether or not to include validation results

    """
    target = Path('result-summary.csv') if out_file is None else out_file
    with dealloc(self.create_facade()) as facade:
        result_manager: ModelResultManager = facade.result_manager
        self._enable_cli_logging(facade)
        reporter = ModelResultReporter(result_manager)
        reporter.include_validation = include_validation
        reporter.dump(target)
def predict(self, sentence: str):
    """Predict several movie review test sentences.

    :param sentence: the sentence to classify; a fixed set of sample
                     sentences is used when ``None``

    """
    if sentence is not None:
        sents = [sentence]
    else:
        sents = [
            "If you sometimes like to go to the movies to have fun , Wasabi is a good place to start .",
            'There are a few stabs at absurdist comedy ... but mostly the humor is of the sweet , gentle and occasionally cloying kind that has become an Iranian specialty .',
            'Terrible',
            'Great movie',
            'Wonderful, great, awesome, 100%',
            'Terrible, aweful, worst movie',
        ]
    with dealloc(self.create_facade()) as facade:
        reviews: Tuple[Review] = facade.predict(sents)
        for review in reviews:
            review.write()
def metrics(self, sort: str = 'wF1', res_id: str = None,
            out_file: Path = None):
    """Write a spreadsheet of label performance metrics for a previously
    trained and tested model.

    :param sort: the column to sort results

    :param res_id: the result ID or use the last if not given

    :param out_file: the output path; ``metrics.csv`` when not given

    """
    target = Path('metrics.csv') if out_file is None else out_file
    with dealloc(self.create_facade()) as facade:
        factory = facade.get_predictions_factory(name=res_id)
        unsorted_df = factory.metrics_dataframe
        # descending sort on the requested column, with a fresh index
        ranked_df = unsorted_df.sort_values(sort, ascending=False)
        ranked_df = ranked_df.reset_index(drop=True)
        ranked_df.to_csv(target)
        self._enable_cli_logging(facade)
        logger.info(f'wrote: {target}')
def predictions(self, res_id: str = None, out_file: Path = None):
    """Write predictions to a CSV file.

    :param res_id: the result ID or use the last if not given

    :param out_file: the output path; named after the executor's model name
                     when not given

    :raises ApplicationError: if the facade raises a ``ModelError`` while
                              getting the predictions

    """
    with dealloc(self.create_facade()) as facade:
        target = out_file
        if target is None:
            target = Path(f'{facade.executor.model_name}.csv')
        try:
            pred_df = facade.get_predictions(name=res_id)
        except ModelError as e:
            raise ApplicationError(
                'Could not predict, probably need to train a model '
                f'first: {e}') from e
        pred_df.to_csv(target)
        self._enable_cli_logging(facade)
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'wrote predictions: {target}')
def batch_sample(self):
    """Print what's contained in this app specific batch.

    For each of the first three batches in the batch stash, the split name,
    the batch itself, its label classes and whether it has labels are
    printed.  For every data point whose document has a length greater than
    one, the document polarity and each of its elements are printed, followed
    by a separator line.

    """
    with dealloc(self.create_facade()) as facade:
        stash: BatchStash = facade.batch_stash
        batch: Batch
        for batch in it.islice(stash.values(), 3):
            classes = batch.get_label_classes()
            # every sampled batch is printed regardless of how many distinct
            # labels it contains
            print(batch.split_name)
            batch.write()
            print(classes)
            print(batch.has_labels)
            for dp in batch.data_points:
                if len(dp.doc) > 1:
                    print(dp.doc.polarity)
                    for s in dp.doc:
                        print(s)
                    print('-' * 30)
def batch(self, limit: int = None, clear_type: ClearType = ClearType.none,
          split: bool = False):
    """Create batches if not already, print statistics on the dataset.

    :param clear_type: what to delete to force recreate

    :param limit: the number of batches to create

    :param split: also write the stratified splits if available

    """
    # NOTE(review): ``limit`` is not referenced anywhere in this body
    with dealloc(self.create_facade()) as facade:
        self._enable_cli_logging(facade)
        if clear_type == ClearType.source:
            facade.batch_stash.clear_all()
            facade.batch_stash.clear()
        elif clear_type == ClearType.batch:
            logger.info('clearing batches')
            facade.batch_stash.clear()
        facade.dataset_stash.write()
        if split:
            self._write_batch_splits(facade)
def assert_label_mapping(self):
    """Confirm the mapping of the labels is correct.

    """
    with dealloc(self.create_facade()) as model_facade:
        model_facade.assert_label_mapping()
def stats(self):
    """Print out the corpus statistics.

    """
    with dealloc(self.create_facade()) as model_facade:
        model_facade.write_corpus_stats()
def result_ids(self):
    """Show all archived result IDs, one per line.

    """
    with dealloc(self.create_facade()) as facade:
        result_manager: ModelResultManager = facade.result_manager
        ids = result_manager.results_stash.keys()
        listing = '\n'.join(ids)
        print(listing)
def early_stop(self):
    """Stop the execution of training the model.

    """
    with dealloc(self.create_facade()) as model_facade:
        model_facade.stop_training()