Example #1
0
def test_tokenizer():
    """Test the label tokenizer without a multilabel separator."""
    tokenizer = LabelTokenizer()

    # A single label comes back as a one-element list.
    dummy = 'LABEL1'
    assert tokenizer.tokenize(dummy) == ['LABEL1']

    # Without a separator configured, comma-joined labels stay one token.
    dummy = 'LABEL1,LABEL2'
    assert tokenizer.tokenize(dummy) == ['LABEL1,LABEL2']

    # Whitespace is never treated as a separator.
    dummy = 'LABEL1 LABEL2'
    assert tokenizer.tokenize(dummy) == ['LABEL1 LABEL2']
Example #2
0
def test_tokenizer_multilabel():
    """Test the label tokenizer with a comma multilabel separator."""
    tokenizer = LabelTokenizer(multilabel_sep=',')

    # A single label comes back as a one-element list.
    dummy = 'LABEL1'
    assert tokenizer.tokenize(dummy) == ['LABEL1']

    # The configured separator splits the input into multiple labels.
    dummy = 'LABEL1,LABEL2'
    assert tokenizer.tokenize(dummy) == ['LABEL1', 'LABEL2']

    # Whitespace is not the separator, so this stays one token.
    dummy = 'LABEL1 LABEL2'
    assert tokenizer.tokenize(dummy) == ['LABEL1 LABEL2']
Example #3
0
    def __init__(self,
                 one_hot: bool = False,
                 multilabel_sep: Optional[str] = None,
                 labels: Optional[Union[Iterable[str], str]] = None) -> None:
        """Initializes the LabelFeaturizer.

        Parameters
        ----------
        one_hot : bool, optional
            Set for one-hot encoded outputs, defaults to False
        multilabel_sep : str, optional
            If given, splits the input label into multiple labels
            using the given separator, defaults to None.
        labels: Union[Iterable[str], str], optional
            If given, sets the labels and the ordering is used to map
            the labels to indices. That means the first item in this
            list will have label id 0, the next one id 1, etc..
            When not provided, indices are assigned as labels are
            encountered during preprocessing. The list can also be
            provided as a file with a label on each line.

        Raises
        ------
        TypeError
            If ``labels`` is neither a string path nor an iterable.

        """
        self.one_hot = one_hot
        self.multilabel_sep = multilabel_sep
        self.tokenizer = LabelTokenizer(multilabel_sep=self.multilabel_sep)

        if labels is not None:
            if isinstance(labels, str):
                # A string is interpreted as a path to a file with one
                # label per line.
                with open(labels, 'r') as f:
                    label_list: List[str] = f.read().splitlines()
            elif isinstance(labels, Iterable):
                label_list = list(labels)
            else:
                # Previously this fell through and crashed later with an
                # opaque NameError on `label_list`; fail loudly instead.
                raise TypeError(
                    "labels must be a file path or an iterable of "
                    f"strings, not {type(labels).__name__}")

            self.label_given = True
            # Preserve caller ordering: the first label gets id 0, etc.
            self.vocab = odict(
                (label, i) for i, label in enumerate(label_list))
            self.label_count_dict = {label: 0 for label in self.vocab}
        else:
            self.label_given = False
            self.vocab = odict()
            self.label_count_dict = dict()

        self.register_attrs('vocab')
        self.register_attrs('label_count_dict')
Example #4
0
    def __init__(self,
                 one_hot: bool = False,
                 multilabel_sep: Optional[str] = None) -> None:
        """Initializes the LabelFeaturizer.

        Parameters
        ----------
        one_hot : bool, optional
            Set for one-hot encoded outputs, defaults to False
        multilabel_sep : str, optional
            If given, splits the input label into multiple labels using
            the given separator, defaults to None.

        """
        self.one_hot = one_hot
        self.multilabel_sep = multilabel_sep

        # Labels never map to unknown or padding tokens, so both are
        # disabled in the base featurizer.
        tokenizer = LabelTokenizer(multilabel_sep=self.multilabel_sep)
        super().__init__(tokenizer=tokenizer, unk_token=None, pad_token=None)
Example #5
0
    def __init__(self,
                 one_hot: bool = False,
                 multilabel_sep: Optional[str] = None,
                 labels: Optional[Sequence[str]] = None) -> None:
        """Initializes the LabelFeaturizer.

        Parameters
        ----------
        one_hot : bool, optional
            Set for one-hot encoded outputs, defaults to False
        multilabel_sep : str, optional
            If given, splits the input label into multiple labels
            using the given separator, defaults to None.
        labels: Sequence[str], optional
            If given, sets the labels and the ordering is used to map
            the labels to indices. That means the first item in this
            list will have label id 0, the next one id 1, etc..
            When not provided, indices are assigned as labels are
            encountered during preprocessing.

        """
        self.one_hot = one_hot
        self.multilabel_sep = multilabel_sep
        self.tokenizer = LabelTokenizer(multilabel_sep=self.multilabel_sep)

        if labels is not None:
            self.label_given = True
            # Preserve caller ordering: the first label gets id 0, etc.
            self.vocab = odict((label, i) for i, label in enumerate(labels))
            self.label_count_dict: Dict[str, int] = {
                label: 0 for label in self.vocab}
        else:
            self.label_given = False
            self.vocab = odict()
            self.label_count_dict = dict()

        self.register_attrs('vocab')
        self.register_attrs('label_count_dict')