def build_index(cls, sentence_embedder, dataset: UnlabeledBIODataset) -> TensorList:
     """Embed every instance of ``dataset`` and collect the results in order.

     ``sentence_embedder`` is called once per instance with two keyword
     arguments, each a single-element tensor:

     * ``sentence_ids`` -- built from the instance's ``'id'`` entry
     * ``dataset_ids``  -- built from ``dataset.dataset_id``

     Returns:
         A ``TensorList`` holding one appended embedding per instance, in the
         order the dataset yields them (empty if the dataset yields nothing).
     """
     embeddings = TensorList()
     for instance in dataset:
         # Both id tensors are rebuilt for every instance, so nothing is
         # evaluated (not even ``dataset.dataset_id``) for an empty dataset.
         id_tensor = torch.Tensor([instance['id']])
         dataset_tensor = torch.Tensor([dataset.dataset_id])
         embedding: torch.Tensor = sentence_embedder(
             sentence_ids=id_tensor,
             dataset_ids=dataset_tensor,
         )
         embeddings.append(embedding)
     return embeddings
Exemple #2
0
 def test_append(self):
     """Appending one NumPy row to a three-row TensorList yields four rows."""
     initial_rows = [torch.zeros(1, TENSOR_EMBEDDING_DIM) for _ in range(3)]
     tl = TensorList(tensor_list=initial_rows)

     # Starting point: three stacked rows.
     assert len(tl) == 3
     assert tl.shape == (3, TENSOR_EMBEDDING_DIM)

     # The appended row is a NumPy array rather than a torch tensor.
     extra_row = np.zeros((1, TENSOR_EMBEDDING_DIM))
     tl.append(extra_row)

     assert len(tl) == 4
     assert tl.shape == (4, TENSOR_EMBEDDING_DIM)
Exemple #3
0
class LinearWindowFunction(WindowFunction):
    """Window function that scores summarized window features with a linear model.

    Every training window is collapsed into a single tensor by
    ``feature_summarizer``; those summaries and their label indices are
    accumulated in ``self.dictionary`` and ``self.labels`` and used to fit the
    classifier returned by ``construct_linear_classifier``.
    """

    def __init__(
        self,
        positive_label: str,
        context_window: int,
        feature_extractor: FeatureExtractor,
        feature_summarizer: Callable[[List[Any]], torch.Tensor] = FeatureCollator.sum,
        linear_type: LinearType = LinearType.SVM_LINEAR,
        use_batch: bool = True,
        threshold: Optional[float] = 0.7,
        **kwargs,
    ):
        """Store the configuration and build an unfitted linear classifier.

        Args:
            positive_label: forwarded to the base initializer.
            context_window: forwarded to the base initializer; also shown by ``__str__``.
            feature_extractor: forwarded to the base initializer; also shown by ``__str__``.
            feature_summarizer: collapses one window's feature list into a tensor.
            linear_type: passed to ``construct_linear_classifier``.
            use_batch: forwarded to the base initializer.
            threshold: forwarded to the base initializer.
            **kwargs: forwarded unchanged to the base initializer.
        """
        # These attributes are assigned before the base initializer runs.
        self.positive_label = positive_label
        self.feature_extractor = feature_extractor
        self.context_window = context_window
        super().__init__(
            positive_label,
            feature_extractor,
            context_window,
            use_batch=use_batch,
            threshold=threshold,
            **kwargs,
        )

        # Training state: one summary row and one label row per seen window.
        self.dictionary = TensorList()
        self.labels = TensorList()
        self.feature_summarizer = feature_summarizer
        self.linear_model = construct_linear_classifier(linear_type=linear_type)

    @log_time(function_prefix='linear_window_train')
    def _train_model(self, training_data: List[Tuple[List[str], List[Any], str]]):
        """Accumulate summaries/labels from ``training_data`` and fit the model.

        Each item is ``(sentence_window, feature_window, label)``; only the
        features and the label are used. The accumulated arrays are passed
        through ``balance_dataset`` before fitting.
        """
        for _sentence_window, window_features, window_label in training_data:
            self.dictionary.append(self.feature_summarizer(window_features))
            self.labels.append(torch.Tensor([label_index(window_label)]))

        balanced_x, balanced_y = balance_dataset(
            self.dictionary.numpy(),
            self.labels.numpy(),
        )
        self.linear_model.fit(balanced_x, balanced_y)

    def _predict(self, features: List[torch.Tensor]) -> int:
        """Return the model's predicted label for one window's features."""
        summary_np = self.feature_summarizer(features).numpy()
        predicted: np.ndarray = self.linear_model.predict(summary_np)
        return predicted.item()

    def _predict_probabilities(self, features: List[torch.Tensor]) -> float:
        """Return the model's ``decision_function`` value for one window's features."""
        summary_np = self.feature_summarizer(features).numpy()
        score: np.ndarray = self.linear_model.decision_function(summary_np)
        return score.item()

    @log_time(function_prefix='linear_window_snorkel_predict')
    def _batch_probabilities(self, features: List[List[torch.Tensor]]) -> List[float]:
        """Return one ``decision_function`` value per window in ``features``."""
        summaries: List[np.ndarray] = [
            self.feature_summarizer(window).numpy() for window in features
        ]
        stacked: np.ndarray = TensorList(summaries).numpy()
        scores: np.ndarray = self.linear_model.decision_function(stacked)
        # The score array is wrapped in a TensorList to split it into per-window entries.
        return [entry.item() for entry in TensorList([scores]).to_list()]

    @log_time(function_prefix='linear_window_predict')
    def _batch_predict(self, features: List[List[torch.Tensor]]) -> List[int]:
        """Return one predicted label per window in ``features``."""
        summaries: List[np.ndarray] = [
            self.feature_summarizer(window).numpy() for window in features
        ]
        stacked: np.ndarray = TensorList(summaries).numpy()
        predictions: np.ndarray = self.linear_model.predict(stacked)
        # Same per-window splitting as in ``_batch_probabilities``.
        return [entry.item() for entry in TensorList([predictions]).to_list()]

    @overrides
    def __str__(self):
        """Describe the function by its context window and feature extractor."""
        return f'LinearWindowFunction({self.context_window})({self.feature_extractor})'
Exemple #4
0
class BagWindowFunction(WindowFunction):
    """Window function that looks summarized windows up in a stored dictionary.

    Training stores the summary and label index of every non-negative window.
    Prediction summarizes the query window, asks ``self.dictionary.contains``
    for its index, and returns the label stored at that index; an index of
    ``-1`` is treated as "not found".
    """

    def __init__(
        self,
        positive_label: str,
        context_window: int,
        feature_extractor: FeatureExtractor,
        feature_summarizer: Callable[[List[Any]], torch.Tensor] = FeatureCollator.sum,
        use_batch: bool = True,
        threshold: Optional[float] = 0.7,
        parallelize: bool = False,  # shared memory issue locally
        use_sparse: bool = False,  # store dictionary as sparse matrix
        **kwargs,
    ):
        """Store the configuration and create empty dictionary/label stores.

        Args:
            positive_label: forwarded to the base initializer.
            context_window: forwarded to the base initializer; also shown by ``__str__``.
            feature_extractor: forwarded to the base initializer; also shown by ``__str__``.
            feature_summarizer: collapses one window's feature list into a tensor.
            use_batch: forwarded to the base initializer.
            threshold: forwarded to the base initializer.
            parallelize: when true, ``_batch_probabilities`` fans out over a
                ``multiprocessing.Pool``.
            use_sparse: when true, the dictionary is a ``SparseTensorList``
                instead of a ``TensorList``.
            **kwargs: forwarded unchanged to the base initializer.
        """
        # These attributes are assigned before the base initializer runs.
        self.positive_label = positive_label
        self.feature_extractor = feature_extractor
        self.context_window = context_window
        self.parallelize = parallelize
        super(BagWindowFunction, self).__init__(
            positive_label,
            feature_extractor,
            context_window,
            use_batch=use_batch,
            threshold=threshold,
            **kwargs,
        )

        self.dictionary = SparseTensorList() if use_sparse else TensorList()
        self.labels = TensorList()
        self.feature_summarizer = feature_summarizer

    def _train_model(self, training_data: List[Tuple[List[str], List[Any], str]]):
        """Store the summary and label index of every non-negative training window.

        Each item is ``(sentence_window, feature_window, label)``; windows for
        which ``is_negative(label)`` is true are skipped.
        """
        for _sentence_window, feature_window, label in training_data:
            if is_negative(label):
                continue
            window_summary = self.feature_summarizer(feature_window)
            self.dictionary.append(window_summary.float())
            self.labels.append(torch.Tensor([label_index(label)]))

    def _lookup_label(self, features: List[torch.Tensor]) -> Optional[int]:
        """Return the stored label index for ``features``, or ``None`` if not found."""
        feature_summary = self.feature_summarizer(features)
        found_index = self.dictionary.contains(feature_summary)
        if found_index == -1:
            return None
        # Build the label tensor only after a match is known, so a miss does
        # no label work at all.
        labels = self.labels.tensor().long()
        return labels[found_index].item()

    def _predict(self, features: List[torch.Tensor]) -> int:
        """Return the stored label for the window, or 0 when it is not found."""
        label = self._lookup_label(features)
        if label is None:
            return 0  # no confidence (should be ABSTAIN)
        return label

    def _predict_probabilities(self, features: List[torch.Tensor]) -> float:
        """Return a signed score for the window: 0. when not found, else ``2 * label - 1``."""
        label = self._lookup_label(features)
        if label is None:
            return 0.  # no confidence (should be ABSTAIN)
        # Computed in floats so both branches return the annotated type.
        return 2.0 * label - 1.0  # (0 -> -1, 1 -> 1)

    def _batch_predict(self, features: List[List[torch.Tensor]]) -> List[int]:
        """Return ``_predict`` for every window in ``features``, in order."""
        return [self._predict(window) for window in features]

    def _batch_probabilities(self, features: List[List[torch.Tensor]]) -> List[float]:
        """Return ``_predict_probabilities`` for every window, optionally in parallel.

        With ``self.parallelize`` set, the work is mapped over a
        ``multiprocessing.Pool``; otherwise it runs sequentially in-process.
        """
        if not self.parallelize:
            return [self._predict_probabilities(window) for window in features]

        # Share the stores before any worker exists; if this raises, no pool
        # has been created yet.
        self.dictionary.share_memory()
        self.labels.share_memory()
        # The context manager tears the pool down on exit, so workers are not
        # leaked on every call. ``map`` blocks until all results are ready,
        # so nothing is still running when the pool is terminated.
        with multiprocessing.Pool() as pool:
            return list(pool.map(self._predict_probabilities, features))

    @overrides
    def __str__(self):
        """Describe the function by its context window and feature extractor."""
        return f'BagWindowFunction({self.context_window})({self.feature_extractor})'