Ejemplo n.º 1
0
    def __init__(
        self,
        positive_label: str,
        context_window: int,
        feature_extractor: FeatureExtractor,
        feature_summarizer: Callable[[List[Any]], torch.Tensor] = FeatureCollator.sum,
        use_batch: bool = True,
        threshold: Optional[float] = 0.7,
        parallelize: bool = False,  # shared memory issue locally
        use_sparse: bool = False,  # store dictionary as sparse matrix
        **kwargs,
    ):
        """Initialize the window function and its empty dictionary/label stores.

        Args:
            positive_label: Stored on the instance and forwarded to the parent
                initializer.
            context_window: Stored on the instance and forwarded to the parent
                initializer.
            feature_extractor: Stored on the instance and forwarded to the
                parent initializer.
            feature_summarizer: Callable kept as ``self.feature_summarizer``;
                defaults to ``FeatureCollator.sum``.
            use_batch: Forwarded to the parent initializer.
            threshold: Forwarded to the parent initializer.
            parallelize: Stored as ``self.parallelize``.
            use_sparse: When true, ``self.dictionary`` is a
                ``SparseTensorList``; otherwise it is a ``TensorList``.
            **kwargs: Forwarded unchanged to the parent initializer.
        """
        # These attributes are assigned before the parent initializer runs.
        self.positive_label = positive_label
        self.feature_extractor = feature_extractor
        self.context_window = context_window
        self.parallelize = parallelize

        super(BagWindowFunction, self).__init__(
            positive_label,
            feature_extractor,
            context_window,
            use_batch=use_batch,
            threshold=threshold,
            **kwargs,
        )

        # Pick the backing container for the dictionary of stored summaries.
        if use_sparse:
            self.dictionary = SparseTensorList()
        else:
            self.dictionary = TensorList()
        # Labels always use the dense container.
        self.labels = TensorList()
        self.feature_summarizer = feature_summarizer
Ejemplo n.º 2
0
    def test_tensor(self):
        """`tensor()` on a three-row SparseTensorList yields a torch.Tensor."""
        zero_rows = [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]
        sparse_list = SparseTensorList(tensor_list=zero_rows)

        dense = sparse_list.tensor()
        assert type(dense) == torch.Tensor
Ejemplo n.º 3
0
    def test_numpy(self):
        """`numpy()` on a three-row SparseTensorList yields an np.ndarray."""
        zero_rows = [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]
        sparse_list = SparseTensorList(tensor_list=zero_rows)

        as_array = sparse_list.numpy()
        assert type(as_array) == np.ndarray
Ejemplo n.º 4
0
 def test_append(self):
     """Appending one numpy row grows a three-row list to four rows."""
     initial_rows = [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]
     sparse_list = SparseTensorList(tensor_list=initial_rows)
     assert len(sparse_list) == 3
     assert sparse_list.shape == (3, TENSOR_EMBEDDING_DIM)

     # The appended row is a numpy array, unlike the torch rows used above.
     extra_row = np.zeros((1, TENSOR_EMBEDDING_DIM))
     sparse_list.append(extra_row)
     assert len(sparse_list) == 4
     assert sparse_list.shape == (4, TENSOR_EMBEDDING_DIM)
Ejemplo n.º 5
0
    def test_extend(self):
        """Extending a three-row list with three more rows yields six rows."""
        def zero_rows():
            # Fresh tensors on every call so both batches are distinct objects.
            return [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]

        sparse_list = SparseTensorList(tensor_list=zero_rows())
        assert len(sparse_list) == 3
        assert sparse_list.shape == (3, TENSOR_EMBEDDING_DIM)

        sparse_list.extend(zero_rows())
        assert len(sparse_list) == 6
        assert sparse_list.shape == (6, TENSOR_EMBEDDING_DIM)
Ejemplo n.º 6
0
    def test_contains_tensor(self):
        """`contains` gives index 0 for a stored row and -1 for an absent one."""
        stored_rows = [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]
        sparse_list = SparseTensorList(tensor_list=stored_rows)

        # An all-zero row matches the stored rows; index 0 is expected.
        present_row = torch.zeros(1, TENSOR_EMBEDDING_DIM)
        assert sparse_list.contains(present_row) == 0

        # An all-one row was never stored; -1 is expected.
        absent_row = torch.zeros(1, TENSOR_EMBEDDING_DIM) + 1
        assert sparse_list.contains(absent_row) == -1
Ejemplo n.º 7
0
    def test_tensor_list(self):
        """`to_list` on a SparseTensorList must raise ValueError.

        The test fails if the call returns normally; any exception other than
        ValueError propagates and also fails the test.
        """
        zero_rows = [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]
        tl = SparseTensorList(tensor_list=zero_rows)

        # Only the call under test lives in the try body, so the failure raised
        # in the else branch can never be confused with the expected exception.
        try:
            tl.to_list()
        except ValueError:
            pass  # expected: conversion to a list is unsupported
        else:
            raise AssertionError('to list is not supported for SparseTensorList')
Ejemplo n.º 8
0
 def test_constructor_numpy(self):
     """Constructing from three numpy rows gives length 3 and a 3-row shape."""
     numpy_rows = [np.zeros((1, TENSOR_EMBEDDING_DIM)) for _ in range(3)]
     sparse_list = SparseTensorList(tensor_list=numpy_rows)

     assert len(sparse_list) == 3
     assert sparse_list.shape == (3, TENSOR_EMBEDDING_DIM)
Ejemplo n.º 9
0
 def test_constructor_tensor(self):
     """Constructing from three torch rows gives length 3 and a 3-row shape."""
     torch_rows = [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]
     sparse_list = SparseTensorList(tensor_list=torch_rows)

     assert len(sparse_list) == 3
     assert sparse_list.shape == (3, TENSOR_EMBEDDING_DIM)
Ejemplo n.º 10
0
    def test_correct_vstack(self):
        """`contains` still works after `extend` has grown the list."""
        def zero_rows():
            # Fresh tensors on every call so both batches are distinct objects.
            return [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]

        sparse_list = SparseTensorList(tensor_list=zero_rows())
        # calls vstack, make sure result is operable
        sparse_list.extend(zero_rows())

        present_row = torch.zeros(1, TENSOR_EMBEDDING_DIM)
        assert sparse_list.contains(present_row) == 0

        absent_row = torch.zeros(1, TENSOR_EMBEDDING_DIM) + 1
        assert sparse_list.contains(absent_row) == -1
Ejemplo n.º 11
0
 def test_empty_construct(self):
     """A SparseTensorList built without arguments has length zero."""
     empty_list = SparseTensorList()
     assert len(empty_list) == 0

     # TODO: think of fix, empty csr_matrix
     # is constructed with shape (1, 0)
     expected_shape = (1, 0)
     assert empty_list.shape == expected_shape
Ejemplo n.º 12
0
class BagWindowFunction(WindowFunction):
    """Window function backed by a dictionary of summarized feature windows.

    Training stores one summarized feature vector per non-negative training
    window in ``self.dictionary`` and the matching label index in
    ``self.labels``. Prediction summarizes the incoming window with the same
    ``feature_summarizer`` and looks it up via ``self.dictionary.contains``.
    """

    def __init__(
        self,
        positive_label: str,
        context_window: int,
        feature_extractor: FeatureExtractor,
        feature_summarizer: Callable[[List[Any]], torch.Tensor] = FeatureCollator.sum,
        use_batch: bool = True,
        threshold: Optional[float] = 0.7,
        parallelize: bool = False,  # shared memory issue locally
        use_sparse: bool = False,  # store dictionary as sparse matrix
        **kwargs,
    ):
        """Initialize the window function and its empty dictionary/label stores.

        Args:
            positive_label: Stored on the instance and forwarded to the parent
                initializer.
            context_window: Stored on the instance and forwarded to the parent
                initializer.
            feature_extractor: Stored on the instance and forwarded to the
                parent initializer.
            feature_summarizer: Callable that collapses a window of features
                into a single tensor; defaults to ``FeatureCollator.sum``.
            use_batch: Forwarded to the parent initializer.
            threshold: Forwarded to the parent initializer.
            parallelize: When true, ``_batch_probabilities`` uses a
                ``multiprocessing.Pool``.
            use_sparse: When true, ``self.dictionary`` is a
                ``SparseTensorList``; otherwise it is a ``TensorList``.
            **kwargs: Forwarded unchanged to the parent initializer.
        """
        # These attributes are assigned before the parent initializer runs.
        self.positive_label = positive_label
        self.feature_extractor = feature_extractor
        self.context_window = context_window
        self.parallelize = parallelize
        super().__init__(
            positive_label,
            feature_extractor,
            context_window,
            use_batch=use_batch,
            threshold=threshold,
            **kwargs,
        )

        self.dictionary = SparseTensorList() if use_sparse else TensorList()
        self.labels = TensorList()
        self.feature_summarizer = feature_summarizer

    def _train_model(self, training_data: List[Tuple[List[str], List[Any], str]]):
        """Store the summary and label index of every non-negative window.

        Args:
            training_data: Triples of (sentence window, feature window, label).
                The sentence window is not used here.
        """
        for _sentence_window, feature_window, label in training_data:
            # Windows whose label `is_negative` flags are not stored.
            if is_negative(label):
                continue
            window_summary = self.feature_summarizer(feature_window)
            self.dictionary.append(window_summary.float())
            self.labels.append(torch.Tensor([label_index(label)]))

    def _lookup_label(self, features: List[torch.Tensor]) -> Optional[int]:
        """Return the stored label for a window, or None when it is unknown.

        The window is summarized with ``self.feature_summarizer`` and looked up
        with ``self.dictionary.contains``; a result of -1 is treated as a miss.
        The labels tensor is only materialized on a hit.
        """
        feature_summary = self.feature_summarizer(features)
        found_index = self.dictionary.contains(feature_summary)
        if found_index == -1:
            return None
        labels = self.labels.tensor().long()
        return labels[found_index].item()

    def _predict(self, features: List[torch.Tensor]) -> int:
        """Return the stored label for the window, or 0 when it is unknown."""
        label = self._lookup_label(features)
        if label is None:
            return 0  # no confidence (should be ABSTAIN)
        return label

    def _predict_probabilities(self, features: List[torch.Tensor]) -> float:
        """Return a signed score for the window: 0.0 when it is unknown,
        otherwise ``2 * label - 1`` as a float."""
        label = self._lookup_label(features)
        if label is None:
            return 0.  # no confidence (should be ABSTAIN)
        # Always a float so hits and misses share one return type.
        return float(2 * label - 1)  # (0 -> -1 ,1 -> 1)

    def _batch_predict(self, features: List[List[torch.Tensor]]) -> List[int]:
        """Apply ``_predict`` to every window, preserving order."""
        return [self._predict(window) for window in features]

    def _batch_probabilities(self, features: List[List[torch.Tensor]]) -> List[float]:
        """Apply ``_predict_probabilities`` to every window, preserving order.

        When ``self.parallelize`` is set the work is mapped over a
        ``multiprocessing.Pool``; otherwise it runs in-process.
        """
        if not self.parallelize:
            return [self._predict_probabilities(window) for window in features]

        # The context manager tears the pool down once the blocking map has
        # returned, so worker processes are not leaked across calls.
        with multiprocessing.Pool() as pool:
            self.dictionary.share_memory()
            self.labels.share_memory()
            return list(pool.map(self._predict_probabilities, features))

    @overrides
    def __str__(self):
        """Return a label built from the context window and feature extractor."""
        return f'BagWindowFunction({self.context_window})({self.feature_extractor})'