Exemple #1
0
 def deallocate(self):
     """Release the data held by this batch.

     The decoded attributes (when the state is ``d`` or ``t``), the batch
     stash reference, the data point IDs, the data points and the feature
     contexts are dropped when present.  The state is then set to ``k`` and
     the super class is given the chance to clean up.

     """
     with time('deallocated attribute', logging.DEBUG):
         if self.state in ('d', 't'):
             decoded = self.attributes
             # unbinding the loop variable only drops this frame's reference
             for tensor in tuple(decoded.values()):
                 del tensor
             decoded.clear()
             del decoded
         self._decoded_state.deallocate()
     # plain references that are simply removed from the instance
     for attr_name in ('batch_stash', 'data_point_ids'):
         if hasattr(self, attr_name):
             delattr(self, attr_name)
     if hasattr(self, '_data_points'):
         Deallocatable._try_deallocate(self._data_points)
         del self._data_points
     with time('deallocated feature context', logging.DEBUG):
         if getattr(self, '_feature_context_inst', None) is not None:
             for context in self._feature_context_inst.values():
                 self._try_deallocate(context)
             self._feature_context_inst.clear()
             del self._feature_context_inst
     self.state = 'k'
     super().deallocate()
     logger.debug(f'deallocated batch: {self.id}')
Exemple #2
0
    def _create_data(self) -> WordVectorModel:
        """Read the h5py vector file and the pickled vocabulary and index files
        from disk, then append a zero vector for the unknown token.

        :return: the model created from the vectors, the word to vector
                 mapping, the word list and the word to index mapping

        """
        self._assert_binary_vecs()
        meta = self.metadata
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'reading binary vector file: {meta.bin_file}')
        # ``cnt`` is named in the timer message template, presumably resolved
        # by ``time`` from the local variables, so it keeps its name
        with time('loaded {cnt} vectors'):
            with h5py.File(meta.bin_file, 'r') as h5:
                dataset: Dataset = h5[self.DATASET_NAME]
                vectors: np.ndarray = dataset[:]
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug(f'word embedding type: {vectors.dtype}')
            with open(meta.words_file, 'rb') as fin:
                words = pickle.load(fin)
            with open(meta.idx_file, 'rb') as fin:
                word2idx = pickle.load(fin)
            cnt = len(word2idx)
        with time('prepared vectors'):
            # a single row of zeros used for out of vocabulary words
            unknown_vec: np.ndarray = np.zeros(self.dimension)[np.newaxis, ...]
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'unknown type: {unknown_vec.dtype}')
            vectors: np.ndarray = np.concatenate((vectors, unknown_vec))
            # the unknown token takes the index following the last word
            words.append(self.UNKNOWN)
            word2idx[self.UNKNOWN] = len(words) - 1
            word2vec = {word: vectors[word2idx[word]] for word in words}
        return WordVectorModel(vectors, word2vec, words, word2idx)
Exemple #3
0
 def _create_data(self) -> WordVectorModel:
     """Create the word vector model from the gensim model, with an additional
     zero vector for the unknown token.

     :return: the model created from the vectors, the word to vector
              mapping, the word list and the word to index mapping

     """
     logger.info('reading binary vector file')
     # https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4
     # compare the numeric major version; comparing only the first character
     # as a string would treat a version such as 10.x as older than 4
     if int(gensim.__version__.split('.')[0]) >= 4:
         logger.debug('using version 4')
         wv = self._get_model()
         words = wv.index_to_key
     else:
         logger.debug('using version 3')
         wv = self._get_model().wv
         words = wv.index2entity
     # copy so that appending the unknown token below does not modify the
     # vocabulary list that belongs to the gensim model
     words = list(words)
     word2vec = {}
     word2idx = {}
     vectors = []
     with time('created data structures'):
         for i, word in enumerate(words):
             word2idx[word] = i
             vec = wv[word]
             vectors.append(vec)
             word2vec[word] = vec
         vectors = np.array(vectors)
     # a single row of zeros used for out of vocabulary words
     unknown_vec = np.expand_dims(np.zeros(self.dimension), axis=0)
     vectors = np.concatenate((vectors, unknown_vec))
     word2idx[self.UNKNOWN] = len(words)
     words.append(self.UNKNOWN)
     # NOTE(review): this entry keeps the leading axis added by
     # ``expand_dims``; confirm callers expect that shape for this key
     word2vec[self.UNKNOWN] = unknown_vec
     return WordVectorModel(vectors, word2vec, words, word2idx)
Exemple #4
0
    def predict(self, datas: Iterable[Any]) -> Any:
        """Make ad-hoc predictions on batches without labels, and return the
        results.

        :param datas: the data to predict on, each as a separate element as
                      a data point in a batch

        :return: the first epoch result of the prediction as mapped by the
                 prediction mapper

        :raises ModelError: if the model settings have no prediction mapper
                            configured

        """
        executor: ModelExecutor = self.executor
        settings: ModelSettings = self.model_settings
        mapper_name = settings.prediction_mapper_name
        if mapper_name is None:
            raise ModelError(
                f'The model settings ({settings.name}) is not configured to create '
                + "prediction batches: no set 'prediction_mapper'")
        mapper: PredictionMapper = self.config_factory.new_instance(
            mapper_name, datas, self.batch_stash)
        self._notify('predict_start')
        try:
            batches: List[Batch] = mapper.batches
            if not executor.model_exists:
                executor.load()
            logger.info('predicting...')
            with time('predicted'):
                result: ModelResult = executor.predict(batches)
            epoch_result: EpochResult = result.results[0]
            return mapper.map_results(epoch_result)
        finally:
            # always signal the end and release the mapper, even on failure
            self._notify('predict_end')
            mapper.deallocate()
Exemple #5
0
    def _write_vecs(self) -> None:
        """Write the vectors to an h5py file and pickle the vocabulary and the
        word to index mapping.

        The dataset is created with shape ``(n_vocab, dimension)`` taken from
        the metadata.  :meth:`_populate_vec_lines` is given the dataset along
        with the (initially empty) word list and index mapping, which it is
        expected to populate before they are written.

        """
        meta = self.metadata
        meta.bin_dir.mkdir(parents=True, exist_ok=True)
        words = []
        word2idx = {}
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'writing binary vectors {meta.source_path} ' +
                        f'-> {meta.bin_dir}')
        shape = (meta.n_vocab, meta.dimension)
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'creating h5py binary vec files with shape {shape}:')
            meta.write_to_log(logger, logging.INFO, 1)
        with time(f'wrote h5py to {meta.bin_file}'):
            with h5py.File(meta.bin_file, 'w') as f:
                dset: Dataset = f.create_dataset(
                    self.DATASET_NAME, shape, dtype='float64')
                self._populate_vec_lines(words, word2idx, dset)
        with open(meta.words_file, 'wb') as f:
            # no copy needed: the list is serialized as is
            pickle.dump(words, f)
        with open(meta.idx_file, 'wb') as f:
            pickle.dump(word2idx, f)
Exemple #6
0
    def _get_keyed_model(self) -> KeyedVectors:
        """Load the keyed vectors from the binary word2vec formatted file at
        :obj:`path`.

        :return: the loaded keyed vectors

        """
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'loading keyed file: {self.path}')
        model_file = str(self.path.absolute())
        with time(f'loaded key model from {model_file}'):
            keyed = KeyedVectors.load_word2vec_format(model_file, binary=True)
        return keyed
Exemple #7
0
    def _get_model(self) -> KeyedVectors:
        """Return the word2vec model, which is the keyed model when
        :obj:`model_type` is ``keyed`` and otherwise the ``wv`` attribute of
        the trained model.

        """
        with time('loaded word2vec model'):
            if self.model_type != 'keyed':
                return self._get_trained_model().wv
            return self._get_keyed_model()
Exemple #8
0
 def load(self, name: str):
     """Load the batch with the super class and attach this stash and the
     batch feature mappings to it when they are missing.

     :param name: the name (key) of the batch to load

     :return: the loaded batch or ``None`` if the super class returned
              ``None``

     """
     # ``name`` and ``obj`` are named in the timer message template, presumably
     # resolved by ``time`` from the local variables, so they keep their names
     # NOTE(review): confirm how the template behaves when ``obj`` is ``None``
     with time('loaded batch {name} ({obj.split_name})'):
         obj = super().load(name)
     if obj is None:
         return obj
     # add back the container of the batch to reconstitute the original
     # features and use the CUDA for tensor device transforms
     if not hasattr(obj, 'batch_stash'):
         obj.batch_stash = self
     if getattr(obj, 'batch_feature_mappings', None) is None:
         self.populate_batch_feature_mapping(obj)
     return obj
Exemple #9
0
    def _save_checkpoint(self, checkpoint: Dict[str, Any], save_weights: bool):
        """Write the check point to disk as a state file and, optionally, a
        weight file.

        The optimizer and model state dictionaries are removed from
        ``checkpoint`` so they are never written to the state file.

        :param checkpoint: all model state (results, random seed, weights etc)

        :param save_weights: if ``True`` then save the weights to the weight
                             file (in addition to the state to the state file)

        :raises ModelError: if ``save_weights`` is ``True`` and a weight entry
                            is missing from ``checkpoint``

        """
        state_path, weight_path = self._get_paths(self.path)
        weights = {}
        for key in ('model_optim_state_dict', 'model_state_dict'):
            value = checkpoint.pop(key, None)
            if value is None and save_weights:
                raise ModelError(
                    f'Missing checkpoint key while saving weights: {key}')
            weights[key] = value
        self.path.mkdir(parents=True, exist_ok=True)
        if save_weights:
            with time(f'saved model weights to {weight_path}'):
                torch.save(weights, str(weight_path))
        with time(f'saved model state to {state_path}'):
            torch.save(checkpoint, str(state_path))
Exemple #10
0
 def _feature_contexts(self) -> \
         Dict[str, Dict[str, Union[FeatureContext, Tuple[FeatureContext]]]]:
     """Return the feature contexts, encoding them first when this instance
     does not yet have them.

     :raises BatchError: if the contexts attribute exists but is ``None``

     """
     encoded = hasattr(self, '_feature_context_inst')
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug(f'has feature contexts: {encoded}')
     if not encoded:
         with time(f'encoded batch {self.id}'):
             self._feature_context_inst = self._encode()
     elif self._feature_context_inst is None:
         raise BatchError('Bad state transition, null contexts')
     contexts = self._feature_context_inst
     if logger.isEnabledFor(logging.INFO):
         logger.info(f'access context: (state={self.state}), num keys=' +
                     f'{len(contexts.keys())}')
     return contexts
Exemple #11
0
 def _create_criterion(self) -> torch.optim.Optimizer:
     """Create the criterion from the class named in the model settings.

     The class weights are always computed and logged, but are passed to the
     criterion (as ``weight``) only when :obj:`use_weighted_criterion` is
     set.

     NOTE(review): the return annotation names an optimizer type while the
     method returns an instance of the configured criterion class; confirm.

     """
     resolver = self.config_factory.class_resolver
     class_name = self.model_settings.criterion_class_name
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug(f'criterion: {class_name}')
     criterion_class = resolver.find_class(class_name)
     with time('weighted classes'):
         class_weights = self.get_class_weights()
     if logger.isEnabledFor(logging.INFO):
         logger.info(f'using class weights: {class_weights}')
     kwargs = {'weight': class_weights} if self.use_weighted_criterion else {}
     return criterion_class(**kwargs)
Exemple #12
0
 def _load_checkpoint(state_path: Path, weight_path: Path) -> \
         Dict[str, Any]:
     """Load the check point from the state file and merge in the weights.

     :param state_path: the file with the saved model state

     :param weight_path: the file with the saved weights, or ``None`` to
                         load only the state

     :return: the state with the weight entries added when ``weight_path``
              is given

     :raises ModelError: if ``state_path`` does not exist

     """
     if not state_path.exists():
         raise ModelError(f'No such state file: {state_path}')
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug(f'loading check point from: {state_path}')
     # without CUDA, storages saved from a GPU have to be remapped to the
     # CPU; use the same mapping for both the state and the weight files
     params = {}
     if not torch.cuda.is_available():
         params['map_location'] = torch.device('cpu')
     with time(f'loaded check point from {state_path}'):
         cp = torch.load(str(state_path), **params)
     if weight_path is not None:
         weights = torch.load(str(weight_path), **params)
         cp.update(weights)
     return cp
Exemple #13
0
    def test(self, description: str = None) -> ModelResult:
        """Load the model from disk and test it.

        :param description: a description passed to the executor and to the
                            test start/end notifications

        :return: the results of the test, which are also written to
                 :obj:`writer` when it is set

        :raises ModelError: if in debug mode

        """
        if self.debuged:
            raise ModelError('Testing is not allowed in debug mode')
        executor = self.executor
        executor.load()
        logger.info('testing...')
        self._notify('test_start', description)
        with time('tested'):
            result = executor.test(description)
        writer = self.writer
        if writer is not None:
            result.write(writer=writer)
        self._notify('test_end', description)
        return result
Exemple #14
0
    def _gc(self, level: int):
        """Invoke the Python garbage collector if ``level`` is no greater than
        the ``gc_level`` model setting.  The *lower* the value of ``level``,
        the more often it will be run during training, testing and validation.

        :param level: the priority of the need to collect--the lower the more
                      it is needed

        """
        if level > self.model_settings.gc_level:
            # not urgent enough for the configured level
            return
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('garbage collecting')
        self._notify('gc_start')
        with time('garbage collected', logging.DEBUG):
            gc.collect()
        self._notify('gc_end')
Exemple #15
0
    def train(self, description: str = None) -> ModelResult:
        """Reset the executor and train with it, notifying observers before and
        after.

        :param description: a description used in the results, which is useful
                            when making incremental hyperparameter changes to
                            the model

        :return: the results returned by the executor

        """
        model_executor = self.executor
        model_executor.reset()
        logger.info('training...')
        self._notify('train_start', description)
        with time('trained'):
            result = model_executor.train(description)
        self._notify('train_end', description)
        return result
Exemple #16
0
 def data(self) -> Dict[str, Any]:
     """Count the ``tag_``, ``syn_`` and ``ent_`` values of every token of
     every sentence in the stash.

     :return: a dictionary with key ``features`` having the ``tag``, ``syn``
              and ``ent`` value to count mappings

     """
     with time('parsed stats data'):
         tags = collections.defaultdict(int)
         syns = collections.defaultdict(int)
         ents = collections.defaultdict(int)
         for sent in self.stash.values():
             for tok in sent:
                 tags[tok.tag_] += 1
                 syns[tok.syn_] += 1
                 ents[tok.ent_] += 1
         # hand back plain dictionaries rather than the defaulting ones
         features = {'tag': dict(tags), 'syn': dict(syns), 'ent': dict(ents)}
         return {'features': features}
Exemple #17
0
 def worker(self) -> Iterable[Tuple[str, FeatureSentence]]:
     """Read and shuffle the sentences of each corpus split and key them with
     consecutive integer strings across all splits.

     The keys of each split are stored by split name in
     ``_worker_split_keys``.

     :return: the ``(key, sentence)`` pairs of all splits

     """
     corpus = []
     keys_by_split = {}
     offset = 0
     for name in self.corpus_split_names:
         # ``slen`` and ``name`` are named in the timer message template,
         # presumably resolved by ``time`` from the local variables
         with time('parsed {slen} sentences ' + f'from {name}'):
             sents: List[NERFeatureSentence] = self._read_split(name)
             slen = len(sents)
         random.shuffle(sents)
         next_offset = offset + len(sents)
         keys = tuple(str(i) for i in range(offset, next_offset))
         assert (len(keys) == len(sents))
         keys_by_split[name] = keys
         corpus.extend(zip(keys, sents))
         offset = next_offset
     self._worker_split_keys = keys_by_split
     return corpus
Exemple #18
0
    def _get_decoded_state(self):
        """Decode the pickled attributes after being loaded by the containing
        ``BatchStash`` and remove the context information to save memory.

        The batch must be in state ``e`` with feature contexts present; on
        return the state is ``d``.

        :return: the decoded attributes

        """
        assert self.state == 'e'
        # validate before the debug statement dereferences the contexts so a
        # bad state is reported by the assertion rather than an attribute
        # error on ``None``
        assert self._feature_context_inst is not None
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'decoding ctxs: {self._feature_context_inst.keys()}')
        with time(f'decoded batch {self.id}'):
            attribs = self._decode(self._feature_contexts)
        # clearing the contexts is expected to also reset the backing
        # instance, which the following assertion checks
        self._feature_contexts = None
        assert self._feature_context_inst is None
        self.state = 'd'
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'return decoded attributes: {attribs.keys()}')
        return attribs
Exemple #19
0
    def train_production(self, description: str = None) -> ModelResult:
        """Reset the executor and run its production training, notifying
        observers before and after.  The executor is first written to
        :obj:`writer` when it is set.

        :param description: a description used in the results, which is useful
                            when making incremental hyperparameter changes to
                            the model

        :return: the results returned by the executor

        """
        model_executor = self.executor
        model_executor.reset()
        writer = self.writer
        if writer is not None:
            model_executor.write(writer=writer)
        logger.info('training...')
        self._notify('train_production_start', description)
        with time('trained'):
            result = model_executor.train_production(description)
        self._notify('train_production_end', description)
        return result
Exemple #20
0
    def _get_keys_by_split(self) -> Dict[str, Tuple[str]]:
        """Return keys by split type (i.e. ``train`` vs ``test``) for only
        those keys available by the delegate backing stash.

        :return: an ordered mapping populated per split by :meth:`_add_keys`

        """
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('creating in memory available keys data structure')
        with time('created key data structures', logging.DEBUG):
            # a set makes the per key membership test cheap
            available = set(self.delegate.keys())
            keys_by_split = OrderedDict()
            for split, keys in self.split_container.keys_by_split.items():
                present = [key for key in keys if key in available]
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug(f'{split} has {len(present)} keys')
                self._add_keys(split, keys_by_split, present)
            return keys_by_split
Exemple #21
0
 def _create_model(self, docs: Iterable[FeatureDocument]) -> Any:
     """Train an LDA topic model on the tokens of the given documents.

     :param docs: the documents, each converted to tokens with
                  :meth:`feat_to_tokens`

     :return: a dictionary with the model (``lda``), the bag of words
              ``corpus`` and the ``id2word`` dictionary

     """
     if logger.isEnabledFor(logging.INFO):
         logger.info(f'creating {self.topics} topics')
     docs = tuple(self.feat_to_tokens(doc) for doc in docs)
     id2word = corpora.Dictionary(docs)
     corpus = tuple(id2word.doc2bow(doc) for doc in docs)
     seed = TorchConfig.get_random_seed()
     # fall back to a fixed seed when none is configured
     rand_state = 0 if seed is None else seed
     num_topics = self.topics
     with time(f'modeled {self.topics} acros {len(docs)} documents'):
         lda = LdaModel(
             corpus=corpus,
             id2word=id2word,
             num_topics=num_topics,
             random_state=rand_state,
             update_every=1,
             chunksize=100,
             passes=10,
             alpha='auto',
             per_word_topics=True)
     return {'lda': lda, 'corpus': corpus, 'id2word': id2word}
Exemple #22
0
    def _execute(self, sets_name: str, description: str, func: Callable,
                 ds_src: tuple) -> bool:
        """Either train or test the model based on method ``func``.

        :param sets_name: the name of the data sets, which ``train`` or
                          ``test``

        :param description: if not ``None``, used with the model result index
                            to name the model result

        :param func: the method to call to do the training or testing

        :param ds_src: a tuple of datasets in a form such as ``(train,
                       validation, test)`` (see :meth:`_get_dataset_splits`)

        :return: ``True`` if training/testing was successful, otherwise
                 ``False`` if it bailed early (:class:`EarlyBailError`); any
                 other exception is propagated to the caller

        :raises ModelError: if batch caching is configured with the
                            ``buffered`` batch iteration

        """
        # batches collected while preparing the datasets, deallocated in the
        # ``finally`` block below
        to_deallocate: List[Batch] = []
        ds_dst: List[List[Batch]] = None
        batch_limit = self.model_settings.batch_limit
        biter = self.model_settings.batch_iteration

        if self.model_settings.cache_batches and biter == 'buffered':
            raise ModelError('Can not cache batches for batch ' +
                             'iteration setting \'buffered\'')

        if logger.isEnabledFor(logging.INFO):
            logger.info(f'batch iteration: {biter}, limit: {batch_limit}' +
                        f', caching: {self.model_settings.cache_batches}'
                        f', cached: {len(self.cached_batches)}')

        self._notify('execute_start', sets_name)

        self._gc(1)

        # reuse previously cached batches for this set when available
        ds_dst = self.cached_batches.get(sets_name)
        if ds_dst is None:
            # ``cnt`` is named in the timer message template below
            cnt = 0
            with time('loaded {cnt} batches'):
                cnt, ds_dst = self._prepare_datasets(batch_limit,
                                                     to_deallocate, ds_src)
            if self.model_settings.cache_batches:
                self.cached_batches[sets_name] = ds_dst

        if logger.isEnabledFor(logging.INFO):
            logger.info('train/test sets: ' +
                        f'{" ".join(map(lambda l: str(len(l)), ds_dst))}')

        try:
            with time(f'executed {sets_name}'):
                func(*ds_dst)
            if description is not None:
                res_name = f'{self.model_result.index}: {description}'
                self.model_result.name = res_name
            return True
        except EarlyBailError as e:
            logger.warning(f'<{e}>')
            self.reset()
            return False
        finally:
            # runs on success, early bail and any other error
            self._notify('execute_end', sets_name)
            self._train_manager.clear()
            if logger.isEnabledFor(logging.INFO):
                logger.info(f'deallocating {len(to_deallocate)} batches')
            for batch in to_deallocate:
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug(f'deallocating: {batch}')
                batch.deallocate()
            self._gc(1)
            self.torch_config.empty_cache()
Exemple #23
0
    def _train(self, train: List[Batch], valid: List[Batch]):
        """Train the network model and record validation and training losses.  Every
        time the validation loss shrinks, the model is saved to disk.

        :param train: the batches used to train the model in each epoch

        :param valid: the batches used to validate the model after the
                      training pass of each epoch

        """
        n_epochs = self.model_settings.epochs
        # create network model, loss and optimization functions
        model = self._get_or_create_model()
        model = self.torch_config.to(model)
        self._model = model
        if logger.isEnabledFor(logging.INFO):
            logger.info(f'training model {type(model)} on {model.device} ' +
                        f'for {n_epochs} epochs using ' +
                        f'learning rate {self.model_settings.learning_rate}')
        criterion, optimizer, scheduler = self.criterion_optimizer_scheduler
        # create a second module manager for after epoch results
        if self.intermediate_results_path is not None:
            model_path = self.intermediate_results_path
            intermediate_manager = self._create_result_manager(model_path)
            intermediate_manager.file_pattern = '{prefix}.{ext}'
        else:
            intermediate_manager = None
        train_manager = self.train_manager
        action = UpdateAction.ITERATE_EPOCH
        # set up graphical progress bar, which is only used when neither this
        # module's logger nor the progress logger has a level set at INFO or
        # lower
        exec_logger = logging.getLogger(__name__)
        if self.progress_bar and \
            (exec_logger.level == 0 or
             exec_logger.level > logging.INFO) and \
            (progress_logger.level == 0 or
             progress_logger.level > logging.INFO):
            pbar = tqdm(total=n_epochs, ncols=self.progress_bar_cols)
        else:
            pbar = None

        train_manager.start(optimizer, scheduler, n_epochs, pbar)
        self.model_result.train.start()
        self.model_result.validation.start()

        # epochs loop
        while action != UpdateAction.STOP:
            epoch: int = train_manager.current_epoch
            train_epoch_result = EpochResult(epoch, DatasetSplitType.train)
            valid_epoch_result = EpochResult(epoch,
                                             DatasetSplitType.validation)

            # guard with the same level the message is logged at
            if progress_logger.isEnabledFor(logging.DEBUG):
                progress_logger.debug(f'training on epoch: {epoch}')

            self.model_result.train.append(train_epoch_result)
            self.model_result.validation.append(valid_epoch_result)

            # train ----
            # prep model for training and train
            model.train()
            train_epoch_result.start()
            self._notify('train_start', {'epoch': epoch})
            for batch in self._to_iter(train):
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug(f'training on batch: {batch.id}')
                with time('trained batch', level=logging.DEBUG):
                    self.batch_iterator.iterate(model, optimizer, criterion,
                                                batch, train_epoch_result,
                                                DatasetSplitType.train)
                self._gc(3)
            self._notify('train_end', {'epoch': epoch})
            train_epoch_result.end()

            self._gc(2)

            # validate ----
            # prep model for evaluation and evaluate
            ave_valid_loss = 0
            model.eval()
            valid_epoch_result.start()
            self._notify('validation_start', {'epoch': epoch})
            for batch in self._to_iter(valid):
                # forward pass: compute predicted outputs by passing inputs
                # to the model
                with torch.no_grad():
                    loss = self.batch_iterator.iterate(
                        model, optimizer, criterion, batch, valid_epoch_result,
                        DatasetSplitType.validation)
                    ave_valid_loss += (loss.item() * batch.size())
                self._gc(3)
            self._notify('validation_end', {'epoch': epoch})
            valid_epoch_result.end()
            # NOTE(review): the sum is weighted by the batch size but divided
            # by the number of batches, and an empty ``valid`` raises a
            # division by zero; confirm both are intended
            ave_valid_loss = ave_valid_loss / len(valid)

            self._gc(2)

            valid_loss_min, decreased = train_manager.update_loss(
                valid_epoch_result, train_epoch_result, ave_valid_loss)

            if decreased:
                self.model_manager._save_executor(self)
                if intermediate_manager is not None:
                    inter_res = self.model_result.get_intermediate()
                    intermediate_manager.save_text_result(inter_res)
                    intermediate_manager.save_plot_result(inter_res)

            # look for indication of update or early stopping
            status = train_manager.get_status()
            action = status.action

        val_losses = train_manager.validation_loss_decreases
        if logger.isEnabledFor(logging.INFO):
            logger.info('final minimum validation ' +
                        f'loss: {train_manager.valid_loss_min}, ' +
                        f'{val_losses} decreases')

        if val_losses == 0:
            # ``Logger.warn`` is a deprecated alias of ``Logger.warning``
            logger.warning('no validation loss decreases encountered, ' +
                           'so there was no model saved; model can not be tested')

        self.model_result.train.end()
        self.model_result.validation.end()
        self.model_manager._save_final_trained_results(self)