def apply(self, docs=None, split=0, train=False, clear=True, **kwargs):
    """Run featurization over a set of documents and commit the session.

    :param docs: Documents whose candidates should be featurized. When this
        is truthy, ``split`` is ignored and ``ALL_SPLITS`` is used instead.
    :param split: Used only when ``docs`` is falsy: the documents are looked
        up via ``get_docs_from_split`` for this split.
    :param train: Whether or not to update the global key set of features
        and the features of candidates.
    :param clear: Whether or not to clear the features table before
        applying features.
    :param kwargs: Extra keyword arguments forwarded unchanged to the parent
        class's ``apply``.
    """
    if docs:
        # Explicit documents: apply across every split.
        target_docs = docs
        target_split = ALL_SPLITS
    else:
        # No documents given: restrict to docs that contain candidates
        # from the requested split.
        target_docs = get_docs_from_split(
            self.session, self.candidate_classes, split
        )
        target_split = split

    super(Featurizer, self).apply(
        target_docs, split=target_split, train=train, clear=clear, **kwargs
    )
    # The commit is needed to sync the bulk operations.
    self.session.commit()
def apply(  # type: ignore
    self,
    docs: Optional[Collection[Document]] = None,
    split: int = 0,
    train: bool = False,
    clear: bool = True,
    parallelism: Optional[int] = None,
    progress_bar: bool = True,
) -> None:
    """Run featurization over a set of documents and commit the session.

    :param docs: Documents whose candidates should be featurized. When this
        is truthy, ``split`` is ignored and ``ALL_SPLITS`` is used instead.
    :param split: Used only when ``docs`` is falsy: the documents are looked
        up via ``get_docs_from_split`` for this split.
    :type split: int
    :param train: Whether or not to update the global key set of features
        and the features of candidates.
    :type train: bool
    :param clear: Whether or not to clear the features table before
        applying features.
    :type clear: bool
    :param parallelism: How many threads to use for extraction. If provided,
        this overrides the parallelism value used to initialize the
        Featurizer.
    :type parallelism: int
    :param progress_bar: Whether or not to display a progress bar. The
        progress bar is measured per document.
    :type progress_bar: bool
    """
    if not docs:
        # No documents given: restrict to docs that contain candidates
        # from the requested split.
        docs = get_docs_from_split(self.session, self.candidate_classes, split)
    else:
        # Explicit documents: apply across every split.
        # TODO: split is int
        split = ALL_SPLITS  # type: ignore

    super().apply(
        docs,
        split=split,
        train=train,
        clear=clear,
        parallelism=parallelism,
        progress_bar=progress_bar,
    )
    # The commit is needed to sync the bulk operations.
    self.session.commit()
def apply(self, docs=None, split=0, train=False, lfs=None, clear=True, **kwargs):
    """Run the given labeling functions over documents and commit the session.

    :param docs: Documents whose candidates should be labeled. When this is
        truthy, ``split`` is ignored and ``ALL_SPLITS`` is used instead.
    :param split: Used only when ``docs`` is falsy: the documents are looked
        up via ``get_docs_from_split`` for this split.
    :param train: Whether or not to update the global key set of labels and
        the labels of candidates.
    :param lfs: A list of lists of labeling functions to apply. Each list
        should correspond with the candidate_classes used to initialize the
        Labeler. Stored on ``self.lfs`` once validated.
    :param clear: Whether or not to clear the labels table before applying
        these LFs.
    :param kwargs: Extra keyword arguments forwarded unchanged to the parent
        class's ``apply``.
    :raises ValueError: If ``lfs`` is None, or its length differs from the
        number of candidate classes.
    """
    # Validate before touching any state on the instance.
    if lfs is None:
        raise ValueError("Please provide a list of labeling functions.")
    lf_group_count = len(lfs)
    if lf_group_count != len(self.candidate_classes):
        raise ValueError("Please provide LFs for each candidate class.")

    self.lfs = lfs

    if docs:
        # Explicit documents: apply across every split.
        target_docs = docs
        target_split = ALL_SPLITS
    else:
        # No documents given: restrict to docs that contain candidates
        # from the requested split.
        target_docs = get_docs_from_split(
            self.session, self.candidate_classes, split
        )
        target_split = split

    super(Labeler, self).apply(
        target_docs,
        split=target_split,
        train=train,
        lfs=self.lfs,
        clear=clear,
        **kwargs
    )
    # The commit is needed to sync the bulk operations.
    self.session.commit()
def apply(
    self,
    docs=None,
    split=0,
    train=False,
    lfs=None,
    clear=True,
    parallelism=None,
    progress_bar=True,
):
    """Run the given labeling functions over documents and commit the session.

    :param docs: Documents whose candidates should be labeled. When this is
        truthy, ``split`` is ignored and ``ALL_SPLITS`` is used instead.
    :param split: Used only when ``docs`` is falsy: the documents are looked
        up via ``get_docs_from_split`` for this split.
    :type split: int
    :param train: Whether or not to update the global key set of labels and
        the labels of candidates.
    :type train: bool
    :param lfs: A list of lists of labeling functions to apply. Each list
        should correspond with the candidate_classes used to initialize the
        Labeler. Stored on ``self.lfs`` once validated.
    :type lfs: list of lists
    :param clear: Whether or not to clear the labels table before applying
        these LFs.
    :type clear: bool
    :param parallelism: How many threads to use for extraction. If provided,
        this overrides the parallelism value used to initialize the Labeler.
    :type parallelism: int
    :param progress_bar: Whether or not to display a progress bar. The
        progress bar is measured per document.
    :type progress_bar: bool
    :raises ValueError: If labeling functions are not provided for each
        candidate class.
    """
    # Validate before touching any state on the instance.
    if lfs is None:
        raise ValueError("Please provide a list of labeling functions.")
    lf_group_count = len(lfs)
    if lf_group_count != len(self.candidate_classes):
        raise ValueError("Please provide LFs for each candidate class.")

    self.lfs = lfs

    if not docs:
        # No documents given: restrict to docs that contain candidates
        # from the requested split.
        docs = get_docs_from_split(self.session, self.candidate_classes, split)
    else:
        # Explicit documents: apply across every split.
        split = ALL_SPLITS

    super(Labeler, self).apply(
        docs,
        split=split,
        train=train,
        lfs=self.lfs,
        clear=clear,
        parallelism=parallelism,
        progress_bar=progress_bar,
    )
    # The commit is needed to sync the bulk operations.
    self.session.commit()