def deallocate(self):
    with time('deallocated attribute', logging.DEBUG):
        if self.state == 'd' or self.state == 't':
            attrs = self.attributes
            for arr in tuple(attrs.values()):
                del arr
            attrs.clear()
            del attrs
        self._decoded_state.deallocate()
    if hasattr(self, 'batch_stash'):
        del self.batch_stash
    if hasattr(self, 'data_point_ids'):
        del self.data_point_ids
    if hasattr(self, '_data_points'):
        Deallocatable._try_deallocate(self._data_points)
        del self._data_points
    with time('deallocated feature context', logging.DEBUG):
        if hasattr(self, '_feature_context_inst') and \
           self._feature_context_inst is not None:
            for ctx in self._feature_context_inst.values():
                self._try_deallocate(ctx)
            self._feature_context_inst.clear()
            del self._feature_context_inst
    self.state = 'k'
    super().deallocate()
    logger.debug(f'deallocated batch: {self.id}')

def _create_data(self) -> WordVectorModel:
    """Read the binary vector (h5py), vocabulary and index files from disk.

    """
    self._assert_binary_vecs()
    meta = self.metadata
    if logger.isEnabledFor(logging.INFO):
        logger.info(f'reading binary vector file: {meta.bin_file}')
    with time('loaded {cnt} vectors'):
        with h5py.File(meta.bin_file, 'r') as f:
            ds: Dataset = f[self.DATASET_NAME]
            vectors: np.ndarray = ds[:]
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'word embedding type: {vectors.dtype}')
        with open(meta.words_file, 'rb') as f:
            words = pickle.load(f)
        with open(meta.idx_file, 'rb') as f:
            word2idx = pickle.load(f)
        cnt = len(word2idx)
    with time('prepared vectors'):
        unknown_vec: np.ndarray = np.expand_dims(
            np.zeros(self.dimension), axis=0)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'unknown type: {unknown_vec.dtype}')
        vectors = np.concatenate((vectors, unknown_vec))
        word2idx[self.UNKNOWN] = len(words)
        words.append(self.UNKNOWN)
        word2vec = {w: vectors[word2idx[w]] for w in words}
    return WordVectorModel(vectors, word2vec, words, word2idx)

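# A minimal, self-contained sketch (not from the source) of the unknown
# vector handling above: a zero row is appended so out-of-vocabulary
# tokens map to the last index.  The names `dim` and `vectors` here are
# illustrative only.
import numpy as np

dim = 4
vectors = np.random.rand(3, dim)          # a pretend 3 word vocabulary
unknown_vec = np.expand_dims(np.zeros(dim), axis=0)
vectors = np.concatenate((vectors, unknown_vec))
assert vectors.shape == (4, dim)
assert not vectors[-1].any()              # the zero "unknown" embedding
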
def _create_data(self) -> WordVectorModel:
    """Create the word vector model from a gensim model.

    """
    logger.info('reading binary vector file')
    # https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4
    if int(gensim.__version__.split('.')[0]) >= 4:
        logger.debug('using version 4')
        wv = self._get_model()
        words = wv.index_to_key
    else:
        logger.debug('using version 3')
        wv = self._get_model().wv
        words = wv.index2entity
    word2vec = {}
    word2idx = {}
    vectors = []
    with time('created data structures'):
        for i, word in enumerate(words):
            word2idx[word] = i
            vec = wv[word]
            vectors.append(vec)
            word2vec[word] = vec
        vectors = np.array(vectors)
        unknown_vec = np.expand_dims(np.zeros(self.dimension), axis=0)
        vectors = np.concatenate((vectors, unknown_vec))
        word2idx[self.UNKNOWN] = len(words)
        words.append(self.UNKNOWN)
        word2vec[self.UNKNOWN] = unknown_vec
    return WordVectorModel(vectors, word2vec, words, word2idx)

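# A hedged sketch of the gensim 3.x vs 4.x vocabulary access branch
# above, built on a tiny in-memory KeyedVectors; it assumes gensim 4.x
# is installed and is not part of the source.
import numpy as np
from gensim.models import KeyedVectors

kv = KeyedVectors(vector_size=3)
kv.add_vectors(['hello', 'world'], np.random.rand(2, 3).astype(np.float32))
words = kv.index_to_key          # gensim 4.x; 3.x used kv.index2entity
vec = kv['hello']                # per-word lookup is the same in both
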
def predict(self, datas: Iterable[Any]) -> Any:
    """Make ad-hoc predictions on batches without labels, and return the
    results.

    :param datas: the data to predict on, each element becoming a data
                  point in a batch

    """
    executor: ModelExecutor = self.executor
    ms: ModelSettings = self.model_settings
    if ms.prediction_mapper_name is None:
        raise ModelError(
            f'Model settings ({ms.name}) are not configured to create ' +
            "prediction batches: no 'prediction_mapper' is set")
    pm: PredictionMapper = self.config_factory.new_instance(
        ms.prediction_mapper_name, datas, self.batch_stash)
    self._notify('predict_start')
    try:
        batches: List[Batch] = pm.batches
        if not executor.model_exists:
            executor.load()
        logger.info('predicting...')
        with time('predicted'):
            res: ModelResult = executor.predict(batches)
        eres: EpochResult = res.results[0]
        ret: Any = pm.map_results(eres)
    finally:
        self._notify('predict_end')
        pm.deallocate()
    return ret

def _write_vecs(self):
    """Write the h5py binary files.  Only when they do not exist on the
    file system already are they calculated and written.

    """
    meta = self.metadata
    meta.bin_dir.mkdir(parents=True, exist_ok=True)
    words = []
    word2idx = {}
    if logger.isEnabledFor(logging.INFO):
        logger.info(f'writing binary vectors {meta.source_path} ' +
                    f'-> {meta.bin_dir}')
    shape = (meta.n_vocab, meta.dimension)
    if logger.isEnabledFor(logging.INFO):
        logger.info(f'creating h5py binary vec files with shape {shape}:')
        meta.write_to_log(logger, logging.INFO, 1)
    with time(f'wrote h5py to {meta.bin_file}'):
        with h5py.File(meta.bin_file, 'w') as f:
            dset: Dataset = f.create_dataset(
                self.DATASET_NAME, shape, dtype='float64')
            self._populate_vec_lines(words, word2idx, dset)
    with open(meta.words_file, 'wb') as f:
        pickle.dump(words[:], f)
    with open(meta.idx_file, 'wb') as f:
        pickle.dump(word2idx, f)

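# A minimal round-trip sketch of the h5py persistence used by
# _write_vecs/_create_data: write a float64 dataset, then slice it back
# into a numpy array.  The path and dataset name are illustrative.
import h5py
import numpy as np

shape = (10, 4)
with h5py.File('/tmp/vecs.h5', 'w') as f:
    dset = f.create_dataset('ds', shape, dtype='float64')
    dset[:] = np.random.rand(*shape)
with h5py.File('/tmp/vecs.h5', 'r') as f:
    vectors = f['ds'][:]         # slicing copies the dataset to memory
assert vectors.shape == shape
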
def _get_keyed_model(self) -> KeyedVectors:
    """Load a model from a pretrained word2vec model.

    """
    if logger.isEnabledFor(logging.INFO):
        logger.info(f'loading keyed file: {self.path}')
    fname = str(self.path.absolute())
    with time(f'loaded key model from {fname}'):
        return KeyedVectors.load_word2vec_format(fname, binary=True)

def _get_model(self) -> KeyedVectors:
    """The word2vec model.

    """
    with time('loaded word2vec model'):
        if self.model_type == 'keyed':
            model = self._get_keyed_model()
        else:
            model = self._get_trained_model().wv
    return model

def load(self, name: str):
    with time('loaded batch {name} ({obj.split_name})'):
        obj = super().load(name)
    # add back the container of the batch to reconstitute the original
    # features and use CUDA for tensor device transforms
    if obj is not None:
        if not hasattr(obj, 'batch_stash'):
            obj.batch_stash = self
        if (not hasattr(obj, 'batch_feature_mappings') or
                obj.batch_feature_mappings is None):
            self.populate_batch_feature_mapping(obj)
    return obj

def _save_checkpoint(self, checkpoint: Dict[str, Any], save_weights: bool):
    """Save the checkpoint to disk.

    :param checkpoint: all model state (results, random seed, weights etc)

    :param save_weights: if ``True``, then save the weights to the weight
                         file (in addition to the state to the state file)

    """
    state_path, weight_path = self._get_paths(self.path)
    weights = {}
    for k in 'model_optim_state_dict model_state_dict'.split():
        wval = checkpoint.pop(k, None)
        if save_weights and wval is None:
            raise ModelError(
                f'Missing checkpoint key while saving weights: {k}')
        weights[k] = wval
    self.path.mkdir(parents=True, exist_ok=True)
    if save_weights:
        with time(f'saved model weights to {weight_path}'):
            torch.save(weights, str(weight_path))
    with time(f'saved model state to {state_path}'):
        torch.save(checkpoint, str(state_path))

def _feature_contexts(self) -> \
        Dict[str, Dict[str, Union[FeatureContext, Tuple[FeatureContext]]]]:
    has_ctx = hasattr(self, '_feature_context_inst')
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'has feature contexts: {has_ctx}')
    if has_ctx:
        if self._feature_context_inst is None:
            raise BatchError('Bad state transition, null contexts')
    else:
        with time(f'encoded batch {self.id}'):
            self._feature_context_inst = self._encode()
    if logger.isEnabledFor(logging.INFO):
        logger.info(f'access context: (state={self.state}), num keys=' +
                    f'{len(self._feature_context_inst.keys())}')
    return self._feature_context_inst

def _create_criterion(self) -> torch.nn.Module:
    resolver = self.config_factory.class_resolver
    criterion_class_name = self.model_settings.criterion_class_name
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'criterion: {criterion_class_name}')
    criterion_class = resolver.find_class(criterion_class_name)
    with time('weighted classes'):
        class_weights = self.get_class_weights()
    if logger.isEnabledFor(logging.INFO):
        logger.info(f'using class weights: {class_weights}')
    if self.use_weighted_criterion:
        inst = criterion_class(weight=class_weights)
    else:
        inst = criterion_class()
    return inst

def _load_checkpoint(state_path: Path, weight_path: Path) -> \
        Dict[str, Any]:
    if not state_path.exists():
        raise ModelError(f'No such state file: {state_path}')
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'loading checkpoint from: {state_path}')
    with time(f'loaded checkpoint from {state_path}'):
        cp = torch.load(str(state_path))
    if weight_path is not None:
        params = {}
        if not torch.cuda.is_available():
            params['map_location'] = torch.device('cpu')
        weights = torch.load(str(weight_path), **params)
        cp.update(weights)
    return cp

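# A sketch of the CPU fallback used in _load_checkpoint when weights
# were saved on a GPU machine: torch.load accepts map_location to
# retarget tensors.  The path and state dict contents are illustrative.
import torch

torch.save({'epoch': 3, 'weights': torch.zeros(2, 2)}, '/tmp/state.pt')
params = {}
if not torch.cuda.is_available():
    params['map_location'] = torch.device('cpu')
loaded = torch.load('/tmp/state.pt', **params)
assert loaded['epoch'] == 3
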
def test(self, description: str = None) -> ModelResult:
    """Load the model from disk and test it.

    """
    if self.debuged:
        raise ModelError('Testing is not allowed in debug mode')
    executor = self.executor
    executor.load()
    logger.info('testing...')
    self._notify('test_start', description)
    with time('tested'):
        res = executor.test(description)
    if self.writer is not None:
        res.write(writer=self.writer)
    self._notify('test_end', description)
    return res

def _gc(self, level: int):
    """Invoke the Python garbage collector if ``level`` is high enough.
    The *lower* the value of ``level``, the more often it will be run
    during training, testing and validation.

    :param level: the priority of the need to collect; the lower the
                  value, the more urgent the collection

    """
    if level <= self.model_settings.gc_level:
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('garbage collecting')
        self._notify('gc_start')
        with time('garbage collected', logging.DEBUG):
            gc.collect()
        self._notify('gc_end')

def train(self, description: str = None) -> ModelResult:
    """Train and test or just debug the model depending on the
    configuration.

    :param description: a description used in the results, which is
                        useful when making incremental hyperparameter
                        changes to the model

    """
    executor = self.executor
    executor.reset()
    logger.info('training...')
    self._notify('train_start', description)
    with time('trained'):
        res = executor.train(description)
    self._notify('train_end', description)
    return res

def data(self) -> Dict[str, Any]:
    with time('parsed stats data'):
        tag = collections.defaultdict(int)
        syn = collections.defaultdict(int)
        ent = collections.defaultdict(int)
        for sent in self.stash.values():
            for tok in sent:
                tag[tok.tag_] += 1
                syn[tok.syn_] += 1
                ent[tok.ent_] += 1
    return {'features': {'tag': dict(tag),
                         'syn': dict(syn),
                         'ent': dict(ent)}}

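# The tallying in data() amounts to counting feature values; a tiny
# self-contained illustration with made-up POS tags (not from the
# source):
import collections

tag = collections.defaultdict(int)
for t in ('NN', 'VB', 'NN'):
    tag[t] += 1
assert dict(tag) == {'NN': 2, 'VB': 1}
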
def worker(self) -> Iterable[Tuple[str, FeatureSentence]]:
    corp = []
    split_keys = {}
    start = 0
    for name in self.corpus_split_names:
        with time('parsed {slen} sentences ' + f'from {name}'):
            sents: List[NERFeatureSentence] = self._read_split(name)
            slen = len(sents)
            random.shuffle(sents)
        end = start + len(sents)
        keys = tuple(map(str, range(start, end)))
        assert len(keys) == len(sents)
        split_keys[name] = keys
        corp.extend(zip(keys, sents))
        start = end
    self._worker_split_keys = split_keys
    return corp

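# A sketch of the contiguous key assignment in worker(): each split is
# given string keys that continue where the previous split's range
# ended.  The split names and counts are illustrative.
start = 0
split_keys = {}
for name, n_sents in (('train', 3), ('test', 2)):
    end = start + n_sents
    split_keys[name] = tuple(map(str, range(start, end)))
    start = end
assert split_keys == {'train': ('0', '1', '2'), 'test': ('3', '4')}
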
def _get_decoded_state(self):
    """Decode the pickled attributes after being loaded by the containing
    ``BatchStash`` and remove the context information to save memory.

    """
    assert self.state == 'e'
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'decoding ctxs: {self._feature_context_inst.keys()}')
    assert self._feature_context_inst is not None
    with time(f'decoded batch {self.id}'):
        attribs = self._decode(self._feature_contexts)
    self._feature_contexts = None
    assert self._feature_context_inst is None
    self.state = 'd'
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'return decoded attributes: {attribs.keys()}')
    return attribs

def train_production(self, description: str = None) -> ModelResult:
    """Train on the training and test data sets, then test the model.

    :param description: a description used in the results, which is
                        useful when making incremental hyperparameter
                        changes to the model

    """
    executor = self.executor
    executor.reset()
    if self.writer is not None:
        executor.write(writer=self.writer)
    logger.info('training...')
    self._notify('train_production_start', description)
    with time('trained'):
        res = executor.train_production(description)
    self._notify('train_production_end', description)
    return res

def _get_keys_by_split(self) -> Dict[str, Tuple[str]]:
    """Return keys by split type (i.e. ``train`` vs ``test``) for only
    those keys available by the delegate backing stash.

    """
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug('creating in memory available keys data structure')
    with time('created key data structures', logging.DEBUG):
        delegate_keys = set(self.delegate.keys())
        avail_kbs = OrderedDict()
        for split, keys in self.split_container.keys_by_split.items():
            ks = list()
            for k in keys:
                if k in delegate_keys:
                    ks.append(k)
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'{split} has {len(ks)} keys')
            self._add_keys(split, avail_kbs, ks)
        return avail_kbs

def _create_model(self, docs: Iterable[FeatureDocument]) -> Any:
    if logger.isEnabledFor(logging.INFO):
        logger.info(f'creating {self.topics} topics')
    docs = tuple(map(lambda doc: self.feat_to_tokens(doc), docs))
    id2word = corpora.Dictionary(docs)
    corpus = tuple(map(lambda doc: id2word.doc2bow(doc), docs))
    rand_state = TorchConfig.get_random_seed()
    if rand_state is None:
        rand_state = 0
    params = {
        'corpus': corpus,
        'id2word': id2word,
        'num_topics': self.topics,
        'random_state': rand_state,
        'update_every': 1,
        'chunksize': 100,
        'passes': 10,
        'alpha': 'auto',
        'per_word_topics': True
    }
    with time(f'modeled {self.topics} across {len(docs)} documents'):
        lda = LdaModel(**params)
    return {'lda': lda, 'corpus': corpus, 'id2word': id2word}

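# A hedged, self-contained sketch of the LDA construction in
# _create_model using a toy corpus; the hyperparameters mirror the
# params dict above and the documents are made up.
from gensim import corpora
from gensim.models.ldamodel import LdaModel

docs = (('cat', 'dog'), ('dog', 'bird'), ('cat', 'bird'))
id2word = corpora.Dictionary(docs)
corpus = tuple(id2word.doc2bow(d) for d in docs)
lda = LdaModel(corpus=corpus, id2word=id2word, num_topics=2,
               random_state=0, update_every=1, chunksize=100,
               passes=10, alpha='auto', per_word_topics=True)
print(lda.print_topics())
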
def _execute(self, sets_name: str, description: str,
             func: Callable, ds_src: tuple) -> bool:
    """Either train or test the model based on method ``func``.

    :param sets_name: the name of the data sets, which is either
                      ``train`` or ``test``

    :param description: a description used in the results, which is
                        useful when making incremental hyperparameter
                        changes to the model

    :param func: the method to call to do the training or testing

    :param ds_src: a tuple of datasets in a form such as ``(train,
                   validation, test)`` (see :meth:`_get_dataset_splits`)

    :return: ``True`` if training/testing was successful; ``False`` if an
             exception occurred or on early bail

    """
    to_deallocate: List[Batch] = []
    ds_dst: List[List[Batch]] = None
    batch_limit = self.model_settings.batch_limit
    biter = self.model_settings.batch_iteration
    if self.model_settings.cache_batches and biter == 'buffered':
        raise ModelError('Cannot cache batches for batch ' +
                         "iteration setting 'buffered'")
    if logger.isEnabledFor(logging.INFO):
        logger.info(f'batch iteration: {biter}, limit: {batch_limit}' +
                    f', caching: {self.model_settings.cache_batches}'
                    f', cached: {len(self.cached_batches)}')
    self._notify('execute_start', sets_name)
    self._gc(1)
    ds_dst = self.cached_batches.get(sets_name)
    if ds_dst is None:
        cnt = 0
        with time('loaded {cnt} batches'):
            cnt, ds_dst = self._prepare_datasets(
                batch_limit, to_deallocate, ds_src)
        if self.model_settings.cache_batches:
            self.cached_batches[sets_name] = ds_dst
    if logger.isEnabledFor(logging.INFO):
        logger.info('train/test sets: ' +
                    f'{" ".join(map(lambda l: str(len(l)), ds_dst))}')
    try:
        with time(f'executed {sets_name}'):
            func(*ds_dst)
        if description is not None:
            res_name = f'{self.model_result.index}: {description}'
            self.model_result.name = res_name
        return True
    except EarlyBailError as e:
        logger.warning(f'<{e}>')
        self.reset()
        return False
    finally:
        self._notify('execute_end', sets_name)
        self._train_manager.clear()
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'deallocating {len(to_deallocate)} batches')
        for batch in to_deallocate:
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'deallocating: {batch}')
            batch.deallocate()
        self._gc(1)
        self.torch_config.empty_cache()

def _train(self, train: List[Batch], valid: List[Batch]):
    """Train the network model and record validation and training losses.
    Every time the validation loss shrinks, the model is saved to disk.

    """
    n_epochs = self.model_settings.epochs
    # create network model, loss and optimization functions
    model = self._get_or_create_model()
    model = self.torch_config.to(model)
    self._model = model
    if logger.isEnabledFor(logging.INFO):
        logger.info(f'training model {type(model)} on {model.device} ' +
                    f'for {n_epochs} epochs using ' +
                    f'learning rate {self.model_settings.learning_rate}')
    criterion, optimizer, scheduler = self.criterion_optimizer_scheduler
    # create a second module manager for after epoch results
    if self.intermediate_results_path is not None:
        model_path = self.intermediate_results_path
        intermediate_manager = self._create_result_manager(model_path)
        intermediate_manager.file_pattern = '{prefix}.{ext}'
    else:
        intermediate_manager = None
    train_manager = self.train_manager
    action = UpdateAction.ITERATE_EPOCH
    # set up graphical progress bar
    exec_logger = logging.getLogger(__name__)
    if self.progress_bar and \
       (exec_logger.level == 0 or exec_logger.level > logging.INFO) and \
       (progress_logger.level == 0 or progress_logger.level > logging.INFO):
        pbar = tqdm(total=n_epochs, ncols=self.progress_bar_cols)
    else:
        pbar = None
    train_manager.start(optimizer, scheduler, n_epochs, pbar)
    self.model_result.train.start()
    self.model_result.validation.start()
    # loop over epochs until the train manager indicates to stop
    while action != UpdateAction.STOP:
        epoch: int = train_manager.current_epoch
        train_epoch_result = EpochResult(epoch, DatasetSplitType.train)
        valid_epoch_result = EpochResult(
            epoch, DatasetSplitType.validation)
        if progress_logger.isEnabledFor(logging.DEBUG):
            progress_logger.debug(f'training on epoch: {epoch}')
        self.model_result.train.append(train_epoch_result)
        self.model_result.validation.append(valid_epoch_result)
        # train ----
        # prep model for training and train
        model.train()
        train_epoch_result.start()
        self._notify('train_start', {'epoch': epoch})
        for batch in self._to_iter(train):
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'training on batch: {batch.id}')
            with time('trained batch', level=logging.DEBUG):
                self.batch_iterator.iterate(
                    model, optimizer, criterion, batch,
                    train_epoch_result, DatasetSplitType.train)
            self._gc(3)
        self._notify('train_end', {'epoch': epoch})
        train_epoch_result.end()
        self._gc(2)
        # validate ----
        # prep model for evaluation and evaluate
        ave_valid_loss = 0
        model.eval()
        valid_epoch_result.start()
        self._notify('validation_start', {'epoch': epoch})
        for batch in self._to_iter(valid):
            # forward pass: compute predicted outputs by passing inputs
            # to the model
            with torch.no_grad():
                loss = self.batch_iterator.iterate(
                    model, optimizer, criterion, batch,
                    valid_epoch_result, DatasetSplitType.validation)
                ave_valid_loss += (loss.item() * batch.size())
            self._gc(3)
        self._notify('validation_end', {'epoch': epoch})
        valid_epoch_result.end()
        ave_valid_loss = ave_valid_loss / len(valid)
        self._gc(2)
        valid_loss_min, decreased = train_manager.update_loss(
            valid_epoch_result, train_epoch_result, ave_valid_loss)
        if decreased:
            self.model_manager._save_executor(self)
            if intermediate_manager is not None:
                inter_res = self.model_result.get_intermediate()
                intermediate_manager.save_text_result(inter_res)
                intermediate_manager.save_plot_result(inter_res)
        # look for indication of update or early stopping
        status = train_manager.get_status()
        action = status.action
    val_losses = train_manager.validation_loss_decreases
    if logger.isEnabledFor(logging.INFO):
        logger.info('final minimum validation ' +
                    f'loss: {train_manager.valid_loss_min}, ' +
                    f'{val_losses} decreases')
    if val_losses == 0:
        logger.warning('no validation loss decreases encountered, ' +
                       'so there was no model saved; model cannot be '
                       'tested')
    self.model_result.train.end()
    self.model_result.validation.end()
    self.model_manager._save_final_trained_results(self)

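# A minimal sketch of the train/eval and no_grad pattern used in the
# epoch loop of _train; the tiny linear model is illustrative only.
import torch

model = torch.nn.Linear(4, 2)
model.train()                    # enable training-time behavior
out = model(torch.rand(8, 4))    # forward pass tracks gradients
model.eval()                     # switch to evaluation behavior
with torch.no_grad():            # disable gradient tracking
    val_out = model(torch.rand(8, 4))
assert not val_out.requires_grad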