Example #1
    def __post_init__(self):
        super().__post_init__()
        if self.encode_transformed and self.embed_model.trainable:
            # once the transformer's last hidden state is dumped during
            # encoding, the parameters needed to train the model are lost
            raise VectorizerError('a trainable model cannot encode ' +
                                  'transformed vectorized features')
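
The same early-guard pattern can be reproduced outside the library; a minimal sketch with a hypothetical EncoderConfig dataclass standing in for the real vectorizer:

from dataclasses import dataclass


class VectorizerError(Exception):
    """Stand-in for the library's error type in this sketch."""


@dataclass
class EncoderConfig:
    # hypothetical flags mirroring the two attributes checked above
    encode_transformed: bool = False
    trainable: bool = False

    def __post_init__(self):
        # reject inconsistent configurations at construction time
        if self.encode_transformed and self.trainable:
            raise VectorizerError('a trainable model cannot encode '
                                  'transformed vectorized features')


EncoderConfig(encode_transformed=True)                   # fine
# EncoderConfig(encode_transformed=True, trainable=True) # raises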
Example #2
    def _validate(self):
        if not self._validated:
            for vec in self.delegates:
                if hasattr(vec, 'feature_type') and \
                   vec.feature_type != TextFeatureType.TOKEN:
                    raise VectorizerError('Only token level vectorizers are ' +
                                          f'supported, but got {vec}')
        self._validated = True
Example #3
    def _assert_decoded_doc_dim(self, arr: Tensor, expect: int):
        """Check the decoded document dimesion and rase an error for those that do not
        match.

        """
        if len(arr.size()) != expect:
            raise VectorizerError(f'Expecting {expect} tensor dimensions, ' +
                                  f'but got shape: {arr.shape}')
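
The check itself is only on the number of tensor dimensions; a quick sketch of what passes and fails, using torch directly:

import torch

arr = torch.zeros(2, 10, 5)    # hypothetical: 2 sentences, 10 tokens, 5 features
assert len(arr.size()) == 3    # the comparison _assert_decoded_doc_dim makes
# a 2-D tensor would fail and raise, e.g.:
#   VectorizerError: Expecting 3 tensor dimensions, but got shape: torch.Size([10, 5])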
Example #4
    def _combine_documents(self, docs: Tuple[FeatureDocument]) -> \
            FeatureDocument:
        if self.fold_method == 'raise' and len(docs) > 1:
            raise VectorizerError(
                f'Configured to support single document but got {len(docs)}')
        concat_tokens = self.fold_method == 'concat_tokens'
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'fold method: {self.fold_method}, ' +
                         f'concat_tokens={concat_tokens}')
        return FeatureDocument.combine_documents(
            docs, concat_tokens=concat_tokens)
Example #5
    def _assert_doc(self, doc: Union[Tuple[FeatureDocument], FeatureDocument]):
        """Raise an error if any input is not a :class:`.FeatureDocument`.

        :raises: :class:`.VectorizerError` if any input isn't a document

        """
        if self._is_mult(doc):
            docs = doc
            for doc in docs:
                self._assert_doc(doc)
        elif not isinstance(doc, FeatureDocument):
            raise VectorizerError(
                f'Expecting document, but got type: {type(doc)}')
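
Because the input may be a single document or a nested tuple of them, the method recurses until it reaches leaves. A self-contained sketch of the same pattern, with Doc standing in for FeatureDocument and a plain container test standing in for _is_mult:

from typing import Tuple, Union


class Doc:
    """Stand-in for FeatureDocument in this sketch."""


def assert_doc(doc: Union[Tuple['Doc', ...], 'Doc']):
    # unpack containers recursively; every leaf must be a Doc
    if isinstance(doc, (tuple, list)):
        for child in doc:
            assert_doc(child)
    elif not isinstance(doc, Doc):
        raise TypeError(f'Expecting document, but got type: {type(doc)}')


assert_doc((Doc(), (Doc(), Doc())))    # nested tuples are accepted
# assert_doc((Doc(), 'oops'))          # raises TypeError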
Example #6
    def __post_init__(self):
        super().__post_init__()
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('creating fd vec manager')
        if self.token_feature_ids is None:
            self.token_feature_ids = self.doc_parser.token_feature_ids
        else:
            feat_diff = self.token_feature_ids - \
                self.doc_parser.token_feature_ids
            if len(feat_diff) > 0:
                fdiffs = ', '.join(feat_diff)
                raise VectorizerError(
                    'Parser token features do not exist in vectorizer: ' +
                    f'{self.token_feature_ids} - ' +
                    f'{self.doc_parser.token_feature_ids} = {fdiffs}')
        self._spacy_vectorizers = PersistedWork('_spacy_vectorizers', self)
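
The validation hinges on a set difference: any token feature id the vectorizer requests but the parser does not produce is an error. A worked example with hypothetical feature ids:

requested = {'ent', 'tag', 'dep'}    # vectorizer's token_feature_ids
available = {'ent', 'tag'}           # parser's token_feature_ids
feat_diff = requested - available    # -> {'dep'}
if len(feat_diff) > 0:
    raise ValueError('missing parser token features: ' + ', '.join(feat_diff))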
Example #7
    def encode(self, doc: Union[Tuple[FeatureDocument], FeatureDocument]) -> \
            FeatureContext:
        ctx: FeatureContext
        if self.fold_method in ('concat_tokens', 'sentence'):
            ctx = super().encode(doc)
        elif self.fold_method == 'separate':
            self._assert_doc(doc)
            ctx = self._encode_sentences(doc)
        elif self.fold_method == 'raise':
            if self._is_mult(doc):
                raise VectorizerError(
                    f'Expecting single document but got: {len(doc)} documents')
            ctx = super().encode(doc)
        return ctx
Example #8
    def get_flattened_features_shape(self, attribs: Set[str]) -> Tuple[int]:
        """Return the shape if all vectorizers were used.

        """
        bmapping = self.batch_feature_mapping
        label_feature_id = bmapping.label_feature_id
        n_flat_neurons = 0
        for feature_id, v in self.items():
            _, field_map = bmapping.get_field_map_by_feature_id(feature_id)
            if field_map is None:
                s = f'no feature: {feature_id} in vectorizer {self.name}'
                raise VectorizerError(s)
            attr = field_map.attr
            if feature_id != label_feature_id and \
               (attribs is None or attr in attribs):
                n = reduce(operator.mul, filter(lambda n: n > 0, v.shape))
                n_flat_neurons += n
        return (n_flat_neurons, )
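
The flattened width is the sum, over vectorizers, of the product of their positive shape dimensions (negative entries mark variable-length dimensions and are filtered out). A worked sketch with hypothetical shapes:

from functools import reduce
import operator

# hypothetical vectorizer shapes; -1 marks a variable-length dimension
shapes = [(-1, 30, 5), (-1, 20)]
n_flat_neurons = sum(
    reduce(operator.mul, filter(lambda n: n > 0, shape))
    for shape in shapes)
print(n_flat_neurons)    # 30 * 5 + 20 = 170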
Example #9
    def _slice_by_attributes(self, arr: Tensor) -> Tensor:
        """Create a new tensor from column based slices of the encoded tensor for each
        specified feature id given in :obj:`decoded_feature_ids`.

        """
        keeps = set(self.decoded_feature_ids)
        col_start = 0
        tensors = []
        for fvec in self.manager.spacy_vectorizers.values():
            col_end = col_start + fvec.shape[1]
            fid = fvec.feature_id
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'type={fid}, to keep={keeps}')
            if fid in keeps:
                tensors.append(arr[:, :, col_start:col_end])
                keeps.remove(fid)
            col_start = col_end
        if len(keeps) > 0:
            raise VectorizerError(f'Unknown feature type IDs: {keeps}')
        sarr = torch.cat(tensors, dim=2)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'slice dim: {sarr.shape}')
        return sarr
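
Each spaCy vectorizer owns a contiguous block of columns in the last dimension, so keeping a subset of features amounts to slicing those blocks out and concatenating them back together. A minimal sketch with hypothetical block boundaries:

import torch

arr = torch.arange(2 * 3 * 7).reshape(2, 3, 7)
# keep two hypothetical feature blocks: columns [0:3) and [5:7)
tensors = [arr[:, :, 0:3], arr[:, :, 5:7]]
sarr = torch.cat(tensors, dim=2)
print(sarr.shape)    # torch.Size([2, 3, 5])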
Example #10
    def _encode(self, doc: FeatureDocument) -> FeatureContext:
        slen = len(doc)
        tlen = self.manager.get_token_length(doc)
        attr = self.feature_attribute
        arr = self.torch_config.zeros((slen, tlen, self.shape[2]))
        doc_val = getattr(doc, attr) if self.level == 'document' else None
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'vectorizing: {attr} for token length: {tlen} ' +
                         f'into {arr.shape}')
        for six, sent in enumerate(doc.sents):
            if self.level == 'document':
                feats = [doc_val] * len(sent)
            elif self.level == 'sentence':
                sent_val = getattr(sent, attr)
                feats = [sent_val] * len(sent)
            elif self.level == 'token':
                feats = tuple(map(lambda s: getattr(s, attr), sent))
            else:
                raise VectorizerError(f'Unknown doc level: {self.level}')
            self._encode_cats(feats, arr[six])
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'vectorized: {len(doc)} sents into {arr.shape}')
        return SparseTensorFeatureContext.instance(self.feature_id, arr,
                                                   self.torch_config)
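
The output tensor is allocated up front as one slab per sentence, with a row per token and a column per category for _encode_cats to fill (presumably one-hot style). A sketch of the geometry with hypothetical sizes:

import torch

slen, tlen, n_cats = 2, 4, 3    # hypothetical: 2 sentences, 4 tokens, 3 categories
arr = torch.zeros((slen, tlen, n_cats))
print(arr[0].shape)    # torch.Size([4, 3]): the slab passed as arr[six]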
Example #11
    def _assert_token_output(self, expected: str = 'last_hidden_state'):
        if self.embed_model.output != expected:
            raise VectorizerError(f"""\
Expanders only work at the token level, so output such as \
`{expected}`, which provides an output for each token in the \
transformer embedding, is required, got: {self.embed_model.output}""")
Example #12
    def __post_init__(self):
        super().__post_init__()
        if self.delegate_feature_id is None:
            raise VectorizerError('Expected attribute: delegate_feature_id')
        self._assert_token_output()
Example #13
    def __post_init__(self):
        super().__post_init__()
        if self.fold_method not in self._FOLD_METHODS:
            raise VectorizerError(f'No such fold method: {self.fold_method}')
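
_FOLD_METHODS itself does not appear in these examples; judging from the branches in Examples #4 and #7, it plausibly holds the four names below. A hedged reconstruction:

# assumption: inferred from the fold_method values used in Examples #4
# and #7; the library may define this constant differently
_FOLD_METHODS = frozenset({'raise', 'concat_tokens', 'sentence', 'separate'})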