Example #1
0
 def rand_assert(self, iters, size, conf):
     """Round-trip ``iters`` random tensors through a sparse feature context
     and assert each decodes back to the original.

     """
     for _ in range(iters):
         expected = conf.to(torch.rand(size, dtype=conf.data_type))
         ctx = SparseTensorFeatureContext.instance(
             'some_feature_id', expected, conf)
         decoded = conf.to(ctx.to_tensor(conf))
         self.assertTensorEquals(expected, decoded)
Example #2
0
 def test_sparse(self):
     """Verify a mostly-zero float matrix survives the sparse encode/decode
     cycle.

     """
     conf = self.conf
     # build a 21 x 10 zero matrix with a handful of non-zero cells
     should = [[0.0] * 10 for _ in range(21)]
     for rix, cix, val in ((7, 3, 1.0), (12, 2, 1.5), (13, 5, 10.5),
                           (15, 0, 2.5), (18, 4, 1.0), (20, 6, 13.2)):
         should[rix][cix] = val
     tarr = torch.tensor(should)
     ctx = SparseTensorFeatureContext.instance('afeattype', tarr, conf)
     expected = conf.singleton(should, dtype=tarr.dtype)
     dense = ctx.to_tensor(conf)
     self.assertTensorEquals(expected, dense)
Example #3
0
 def test_3d_int_mat(self):
     """A 3D integer tensor should round-trip with each 2D slice stored as a
     SciPy CSR matrix.

     """
     expected = torch.randint(0, 5, (2, 7, 11))
     ctx = SparseTensorFeatureContext.instance(
         'afeattype', expected, self.conf)
     # every slice of the 3D tensor is kept as its own sparse matrix
     for mat in ctx.sparse_arr:
         self.assertTrue(isinstance(mat, csr_matrix))
     dense = ctx.to_tensor(self.conf)
     self.assertTensorEquals(expected, dense)
     self.assertEqual(expected.shape, dense.shape)
Example #4
0
 def encode(self, doc: Union[Tuple[FeatureDocument], FeatureDocument]) -> \
         FeatureContext:
     """Encode a single document or a batch of documents.

     A single document defers to the superclass.  For a tuple/list, the
     documents are combined to find the widest token length, each document
     is encoded to that width, and the rows are concatenated in to one
     sparse context.

     """
     if not isinstance(doc, (tuple, list)):
         return super().encode(doc)
     self._assert_doc(doc)
     combined: FeatureDocument = FeatureDocument.combine_documents(doc)
     n_toks = self.manager.get_token_length(combined)
     encoded = [self._encode_doc(d.combine_sentences(), n_toks) for d in doc]
     # stack per-document rows, then add a trailing unit dimension
     arr = torch.cat(encoded, dim=0).unsqueeze(-1)
     return SparseTensorFeatureContext.instance(
         self.feature_id, arr, self.torch_config)
Example #5
0
 def _encode(self, doc: FeatureDocument) -> FeatureContext:
     """Create a sparse context of per-sentence feature counts.

     The counts produced by every spaCy vectorizer are concatenated per
     sentence, and the per-sentence rows are stacked in to one tensor.

     """
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug(f'encoding doc: {doc}')
     sent_rows = []
     for sent in doc.sents:
         if logger.isEnabledFor(logging.DEBUG):
             logger.debug(f'encoding sentence: {sent}')
         counts = []
         for fvec in self.manager.spacy_vectorizers.values():
             row: Tensor = self.get_feature_counts(sent, fvec)
             if logger.isEnabledFor(logging.DEBUG):
                 logger.debug(f'encoding with {fvec}')
             counts.append(row)
         sent_rows.append(torch.cat(counts))
     arr = torch.stack(sent_rows)
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug(f'encoded shape: {arr.shape}')
     return SparseTensorFeatureContext.instance(
         self.feature_id, arr, self.torch_config)
Example #6
0
    def _encode(self, doc: FeatureDocument) -> FeatureContext:
        """Encode tokens found in the container by aggregating the spaCy
        vectorizers output.

        Each vectorizer fills its own contiguous span of columns in a
        zero-initialized array with one row group per sentence.

        """
        arr = self.torch_config.zeros(self._get_shape_for_document(doc))
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'type array shape: {arr.shape}')
        sent: FeatureSentence
        for six, sent in enumerate(doc.sents):
            start = 0
            for fvec in self.manager.spacy_vectorizers.values():
                # the vectorizer's width determines its column span
                end = start + fvec.shape[1]
                self._populate_feature_vectors(sent, six, fvec, arr,
                                               start, end)
                start = end
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'encoded array shape: {arr.shape}')
        return SparseTensorFeatureContext.instance(
            self.feature_id, arr, self.torch_config)
Example #7
0
 def _encode(self, doc: FeatureDocument) -> FeatureContext:
     """Vectorize a categorical feature at the configured granularity.

     Depending on :obj:`level`, the value of :obj:`feature_attribute` is
     taken once from the document, once per sentence, or from each token,
     then category-encoded per sentence in to a sparse context.

     :raises VectorizerError: if :obj:`level` is not one of ``document``,
                              ``sentence`` or ``token``

     """
     slen = len(doc)
     tlen = self.manager.get_token_length(doc)
     attr = self.feature_attribute
     arr = self.torch_config.zeros((slen, tlen, self.shape[2]))
     doc_val = getattr(doc, attr) if self.level == 'document' else None
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug(f'vectorizing: {attr} for token length: {tlen} ' +
                      f'in to {arr.shape}')
     for six, sent in enumerate(doc.sents):
         if self.level == 'document':
             feats = [doc_val] * len(sent)
         # bug fix: the level was only matched against the misspelled
         # 'sentenece', so the correctly spelled 'sentence' raised
         # VectorizerError; accept both, keeping the misspelling for
         # backward compatibility with existing configurations
         elif self.level in ('sentence', 'sentenece'):
             sent_val = getattr(sent, attr)
             feats = [sent_val] * len(sent)
         elif self.level == 'token':
             feats = tuple(map(lambda s: getattr(s, attr), sent))
         else:
             raise VectorizerError(f'Unknown doc level: {self.level}')
         self._encode_cats(feats, arr[six])
     if logger.isEnabledFor(logging.DEBUG):
         logger.debug(f'vectorized: {len(doc)} sents in to {arr.shape}')
     return SparseTensorFeatureContext.instance(self.feature_id, arr,
                                                self.torch_config)
Example #8
0
 def _encode(self, doc: FeatureDocument) -> FeatureContext:
     """Encode the document to token width and wrap it in a sparse context
     with a trailing unit dimension.

     """
     token_len = self.manager.get_token_length(doc)
     encoded = self._encode_doc(doc, token_len).unsqueeze(-1)
     return SparseTensorFeatureContext.instance(
         self.feature_id, encoded, self.torch_config)
Example #9
0
 def _to_sparse(self, arr: Tensor):
     """Return the first sparse matrix produced by the context's sparse
     conversion of ``arr``.

     """
     sparse = SparseTensorFeatureContext.to_sparse(arr)
     return sparse[0][0]