Exemple #1
0
    def apply(self, docs=None, split=0, train=False, clear=True, **kwargs):
        """Run feature extraction over a set of candidates.

        The candidates are selected either through an explicit collection of
        documents or, when none is given, through a split.

        :param docs: Documents whose candidates should be featurized. When
            non-empty, candidates from all splits of these documents are used.
        :param split: The split to draw candidates from. Only consulted when
            ``docs`` is empty or None.
        :param train: Whether or not to update the global key set of features
            and the features of candidates.
        :param clear: Whether or not to clear the features table before
            applying features.
        :param kwargs: Extra keyword arguments forwarded unchanged to the
            parent class's ``apply``.
        """
        if docs:
            # Explicit documents were given: cover all of their splits.
            target_docs, split = docs, ALL_SPLITS
        else:
            # No documents given: restrict to the docs containing candidates
            # from the requested split.
            target_docs = get_docs_from_split(
                self.session, self.candidate_classes, split
            )

        super(Featurizer, self).apply(
            target_docs, split=split, train=train, clear=clear, **kwargs
        )
        # Needed to sync the bulk operations
        self.session.commit()
Exemple #2
0
    def apply(  # type: ignore
        self,
        docs: Optional[Collection[Document]] = None,
        split: int = 0,
        train: bool = False,
        clear: bool = True,
        parallelism: Optional[int] = None,
        progress_bar: bool = True,
    ) -> None:
        """Run feature extraction over a set of candidates.

        The candidates are selected either through an explicit collection of
        documents or, when none is given, through a split.

        :param docs: Documents whose candidates should be featurized. When
            non-empty, candidates from all splits of these documents are used.
        :param split: The split to draw candidates from. Only consulted when
            ``docs`` is empty or None.
        :param train: Whether or not to update the global key set of features
            and the features of candidates.
        :param clear: Whether or not to clear the features table before
            applying features.
        :param parallelism: How many threads to use for extraction. This will
            override the parallelism value used to initialize the Featurizer
            if it is provided.
        :param progress_bar: Whether or not to display a progress bar. The
            progress bar is measured per document.
        """
        if not docs:
            # No documents given: only grab the docs containing candidates
            # from the requested split.
            docs_in_split = get_docs_from_split(
                self.session, self.candidate_classes, split
            )
            super().apply(
                docs_in_split,
                split=split,
                train=train,
                clear=clear,
                parallelism=parallelism,
                progress_bar=progress_bar,
            )
            # Needed to sync the bulk operations
            self.session.commit()
            return

        # Explicit documents were given: apply across all of their splits.
        # TODO: split is int
        super().apply(
            docs,
            split=ALL_SPLITS,  # type: ignore
            train=train,
            clear=clear,
            parallelism=parallelism,
            progress_bar=progress_bar,
        )
        # Needed to sync the bulk operations
        self.session.commit()
Exemple #3
0
    def apply(self,
              docs=None,
              split=0,
              train=False,
              lfs=None,
              clear=True,
              **kwargs):
        """Label a set of candidates using the supplied labeling functions.

        The candidates are selected either through an explicit collection of
        documents or, when none is given, through a split.

        :param docs: Documents whose candidates should be labeled. When
            non-empty, candidates from all splits of these documents are used.
        :param split: The split to draw candidates from. Only consulted when
            ``docs`` is empty or None.
        :param train: Whether or not to update the global key set of labels
            and the labels of candidates.
        :param lfs: A list of lists of labeling functions to apply, one inner
            list per entry of the candidate_classes used to initialize the
            Labeler. Stored on ``self.lfs`` once validated.
        :param clear: Whether or not to clear the labels table before applying
            these LFs.
        :param kwargs: Extra keyword arguments forwarded unchanged to the
            parent class's ``apply``.
        :raises ValueError: If ``lfs`` is None, or if its length differs from
            the number of candidate classes.
        """
        # Validate the labeling functions before touching any state.
        if lfs is None:
            raise ValueError("Please provide a list of labeling functions.")
        expected_count = len(self.candidate_classes)
        if len(lfs) != expected_count:
            raise ValueError("Please provide LFs for each candidate class.")
        self.lfs = lfs

        if not docs:
            # No documents given: only grab the docs containing candidates
            # from the requested split.
            docs = get_docs_from_split(self.session,
                                       self.candidate_classes, split)
        else:
            # Explicit documents were given: cover all of their splits.
            split = ALL_SPLITS

        super(Labeler, self).apply(docs,
                                   split=split,
                                   train=train,
                                   lfs=self.lfs,
                                   clear=clear,
                                   **kwargs)
        # Needed to sync the bulk operations
        self.session.commit()
Exemple #4
0
    def apply(
        self,
        docs=None,
        split=0,
        train=False,
        lfs=None,
        clear=True,
        parallelism=None,
        progress_bar=True,
    ):
        """Label a set of candidates using the supplied labeling functions.

        The candidates are selected either through an explicit collection of
        documents or, when none is given, through a split.

        :param docs: Documents whose candidates should be labeled. When
            non-empty, candidates from all splits of these documents are used.
        :param split: The split to draw candidates from. Only consulted when
            ``docs`` is empty or None.
        :type split: int
        :param train: Whether or not to update the global key set of labels
            and the labels of candidates.
        :type train: bool
        :param lfs: A list of lists of labeling functions to apply, one inner
            list per entry of the candidate_classes used to initialize the
            Labeler. Stored on ``self.lfs`` once validated.
        :type lfs: list of lists
        :param clear: Whether or not to clear the labels table before applying
            these LFs.
        :type clear: bool
        :param parallelism: How many threads to use for extraction. This will
            override the parallelism value used to initialize the Labeler if
            it is provided.
        :type parallelism: int
        :param progress_bar: Whether or not to display a progress bar. The
            progress bar is measured per document.
        :type progress_bar: bool

        :raises ValueError: If ``lfs`` is None, or if labeling functions are
            not provided for each candidate class.
        """
        # Validate the labeling functions before touching any state.
        if lfs is None:
            raise ValueError("Please provide a list of labeling functions.")
        if len(self.candidate_classes) != len(lfs):
            raise ValueError("Please provide LFs for each candidate class.")
        self.lfs = lfs

        if docs:
            # Explicit documents were given: cover all of their splits.
            chosen_docs = docs
            split = ALL_SPLITS
        else:
            # No documents given: only grab the docs containing candidates
            # from the requested split.
            chosen_docs = get_docs_from_split(
                self.session, self.candidate_classes, split
            )

        super(Labeler, self).apply(
            chosen_docs,
            split=split,
            train=train,
            lfs=self.lfs,
            clear=clear,
            parallelism=parallelism,
            progress_bar=progress_bar,
        )
        # Needed to sync the bulk operations
        self.session.commit()