def rand_assert(self, iters, size, conf):
    """Assert that random tensors round trip through a sparse feature context."""
    for i in range(iters):
        should = torch.rand(size, dtype=conf.data_type)
        should = conf.to(should)
        ctx = SparseTensorFeatureContext.instance(
            'some_feature_id', should, conf)
        self.assertTensorEquals(should, conf.to(ctx.to_tensor(conf)))
def test_sparse(self):
    conf = self.conf
    should = [
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 1.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 1.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 10.50, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [2.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 1.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 13.20, 0.00, 0.00, 0.00]]
    tarr = torch.tensor(should)
    ctx = SparseTensorFeatureContext.instance('afeattype', tarr, conf)
    should = conf.singleton(should, dtype=tarr.dtype)
    dense = ctx.to_tensor(conf)
    self.assertTensorEquals(should, dense)
def test_3d_int_mat(self):
    should = torch.randint(0, 5, (2, 7, 11))
    ctx = SparseTensorFeatureContext.instance(
        'afeattype', should, self.conf)
    for m in ctx.sparse_arr:
        self.assertTrue(isinstance(m, csr_matrix))
    dense = ctx.to_tensor(self.conf)
    self.assertTensorEquals(should, dense)
    self.assertEqual(should.shape, dense.shape)
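# The two tests above suggest the round trip a SparseTensorFeatureContext
# performs: a dense 2D or 3D tensor is stored as scipy CSR matrices (the
# test iterates ``ctx.sparse_arr`` checking for one ``csr_matrix`` per 2D
# slice) and reassembled on decode. Below is a minimal, library-free sketch
# of that round trip using torch and scipy directly; it is an illustration,
# not the SparseTensorFeatureContext implementation.
import torch
from scipy.sparse import csr_matrix


def sparse_round_trip_sketch():
    dense = torch.randint(0, 5, (2, 7, 11))
    # encode: one CSR matrix per 2D slice along the first dimension
    mats = [csr_matrix(dense[i].numpy()) for i in range(dense.shape[0])]
    # decode: densify each slice and stack back to the original shape
    restored = torch.stack([torch.from_numpy(m.toarray()) for m in mats])
    assert torch.equal(dense, restored)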
def encode(self, doc: Union[Tuple[FeatureDocument], FeatureDocument]) -> \
        FeatureContext:
    ctx: TensorFeatureContext
    if isinstance(doc, (tuple, list)):
        self._assert_doc(doc)
        docs = doc
        comb_doc = FeatureDocument.combine_documents(docs)
        n_toks = self.manager.get_token_length(comb_doc)
        arrs = tuple(map(
            lambda d: self._encode_doc(d.combine_sentences(), n_toks),
            docs))
        arr = torch.cat(arrs, dim=0)
        arr = arr.unsqueeze(-1)
        ctx = SparseTensorFeatureContext.instance(
            self.feature_id, arr, self.torch_config)
    else:
        ctx = super().encode(doc)
    return ctx
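# A minimal sketch of the tensor shapes in ``encode`` above when passed a
# tuple of documents: each document is collapsed via ``combine_sentences``
# and encoded to a matrix padded to a shared token length, the matrices are
# concatenated along dim 0, and ``unsqueeze(-1)`` adds a trailing feature
# dimension. The sizes and the per-document (1 x n_toks) shape are invented
# for illustration.
import torch


def combine_docs_shape_sketch():
    n_toks = 9                             # shared padded token length
    doc_a = torch.rand(1, n_toks)          # stand-in for _encode_doc output
    doc_b = torch.rand(1, n_toks)
    arr = torch.cat((doc_a, doc_b), dim=0)     # shape: (2, 9)
    arr = arr.unsqueeze(-1)                    # shape: (2, 9, 1)
    assert arr.shape == (2, n_toks, 1)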
def _encode(self, doc: FeatureDocument) -> FeatureContext:
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'encoding doc: {doc}')
    sent_arrs = []
    for sent in doc.sents:
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'encoding sentence: {sent}')
        tok_arrs = []
        for fvec in self.manager.spacy_vectorizers.values():
            cnts: Tensor = self.get_feature_counts(sent, fvec)
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'encoding with {fvec}')
            tok_arrs.append(cnts)
        sent_arrs.append(torch.cat(tok_arrs))
    arr = torch.stack(sent_arrs)
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'encoded shape: {arr.shape}')
    return SparseTensorFeatureContext.instance(
        self.feature_id, arr, self.torch_config)
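# A sketch of the shape logic in ``_encode`` above: each sentence yields one
# count vector per spaCy vectorizer, the vectors are concatenated into a
# single per-sentence feature row, and the rows are stacked into a
# (num_sentences x total_features) matrix. The vectorizer widths below are
# hypothetical.
import torch


def stack_counts_sketch():
    num_sents = 3
    widths = (4, 6, 2)                     # hypothetical vectorizer widths
    sent_arrs = []
    for _ in range(num_sents):
        tok_arrs = [torch.rand(w) for w in widths]   # per-vectorizer counts
        sent_arrs.append(torch.cat(tok_arrs))        # shape: (12,)
    arr = torch.stack(sent_arrs)                     # shape: (3, 12)
    assert arr.shape == (num_sents, sum(widths))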
def _encode(self, doc: FeatureDocument) -> FeatureContext:
    """Encode tokens found in the container by aggregating the spaCy
    vectorizers' output.

    """
    arr = self.torch_config.zeros(self._get_shape_for_document(doc))
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'type array shape: {arr.shape}')
    sent: FeatureSentence
    for six, sent in enumerate(doc.sents):
        col_start = 0
        for fvec in self.manager.spacy_vectorizers.values():
            col_end = col_start + fvec.shape[1]
            self._populate_feature_vectors(
                sent, six, fvec, arr, col_start, col_end)
            col_start = col_end
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'encoded array shape: {arr.shape}')
    return SparseTensorFeatureContext.instance(
        self.feature_id, arr, self.torch_config)
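# A library-free sketch of the column layout in ``_encode`` above: each
# spaCy vectorizer owns a contiguous column range of the output array
# (``fvec.shape[1]`` columns wide), so its features land side by side per
# sentence. Shown here on a 2D matrix for simplicity; the widths and fill
# values are invented, and the fill stands in for _populate_feature_vectors.
import torch


def column_slices_sketch():
    widths = (3, 5)                        # hypothetical vectorizer widths
    n_sents = 2
    arr = torch.zeros(n_sents, sum(widths))
    for six in range(n_sents):
        col_start = 0
        for w in widths:
            col_end = col_start + w
            # stand-in for _populate_feature_vectors filling this block
            arr[six, col_start:col_end] = torch.rand(w)
            col_start = col_end
    return arr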
def _encode(self, doc: FeatureDocument) -> FeatureContext:
    slen = len(doc)
    tlen = self.manager.get_token_length(doc)
    attr = self.feature_attribute
    arr = self.torch_config.zeros((slen, tlen, self.shape[2]))
    doc_val = getattr(doc, attr) if self.level == 'document' else None
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'vectorizing: {attr} for token length: {tlen} ' +
                     f'into {arr.shape}')
    for six, sent in enumerate(doc.sents):
        if self.level == 'document':
            feats = [doc_val] * len(sent)
        elif self.level == 'sentence':
            sent_val = getattr(sent, attr)
            feats = [sent_val] * len(sent)
        elif self.level == 'token':
            feats = tuple(map(lambda s: getattr(s, attr), sent))
        else:
            raise VectorizerError(f'Unknown doc level: {self.level}')
        self._encode_cats(feats, arr[six])
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug(f'vectorized: {len(doc)} sents into {arr.shape}')
    return SparseTensorFeatureContext.instance(
        self.feature_id, arr, self.torch_config)
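# A small sketch of how the ``level`` setting above chooses the categorical
# value encoded for each token: 'document' and 'sentence' broadcast a single
# value across a sentence's tokens, while 'token' reads one value per token.
# The toy classes below are hypothetical stand-ins for the feature document
# API, not the library's types.
from dataclasses import dataclass
from typing import List, Tuple


@dataclass
class Tok:
    tag: str


@dataclass
class Sent:
    tag: str
    toks: Tuple[Tok, ...]


def feats_for_level(sent: Sent, doc_tag: str, level: str) -> List[str]:
    if level == 'document':
        return [doc_tag] * len(sent.toks)
    elif level == 'sentence':
        return [sent.tag] * len(sent.toks)
    elif level == 'token':
        return [t.tag for t in sent.toks]
    raise ValueError(f'unknown doc level: {level}')


sent = Sent('NP', (Tok('DT'), Tok('NN')))
assert feats_for_level(sent, 'DOC', 'document') == ['DOC', 'DOC']
assert feats_for_level(sent, 'DOC', 'token') == ['DT', 'NN']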
def _encode(self, doc: FeatureDocument) -> FeatureContext:
    n_toks = self.manager.get_token_length(doc)
    arr = self._encode_doc(doc, n_toks)
    arr = arr.unsqueeze(-1)
    return SparseTensorFeatureContext.instance(
        self.feature_id, arr, self.torch_config)
def _to_sparse(self, arr: Tensor):
    # ``to_sparse`` returns a nested sequence of sparse matrices; this
    # helper assumes a single-matrix result and unwraps it
    return SparseTensorFeatureContext.to_sparse(arr)[0][0]