def test_tokenizer():
    """Test the label tokenizer without a multilabel separator.

    With no separator configured, the tokenizer must treat the whole
    input string as a single label, even when it contains commas or
    spaces.
    """
    tokenizer = LabelTokenizer()
    # Plain single label
    dummy = 'LABEL1'
    assert tokenizer.tokenize(dummy) == ['LABEL1']
    # Comma is NOT a separator here — stays one token
    dummy = 'LABEL1,LABEL2'
    assert tokenizer.tokenize(dummy) == ['LABEL1,LABEL2']
    # Whitespace is never a separator
    dummy = 'LABEL1 LABEL2'
    assert tokenizer.tokenize(dummy) == ['LABEL1 LABEL2']
def test_tokenizer_multilabel():
    """Test the label tokenizer with a comma multilabel separator.

    With ``multilabel_sep=','`` the tokenizer must split on commas
    only; whitespace is still not a separator.
    """
    tokenizer = LabelTokenizer(multilabel_sep=',')
    # Single label: no comma, so one token
    dummy = 'LABEL1'
    assert tokenizer.tokenize(dummy) == ['LABEL1']
    # Comma splits into two labels
    dummy = 'LABEL1,LABEL2'
    assert tokenizer.tokenize(dummy) == ['LABEL1', 'LABEL2']
    # Space is not the separator — stays one token
    dummy = 'LABEL1 LABEL2'
    assert tokenizer.tokenize(dummy) == ['LABEL1 LABEL2']
def __init__(self,
             one_hot: bool = False,
             multilabel_sep: Optional[str] = None,
             labels: Optional[Union[Iterable[str], str]] = None) -> None:
    """Initializes the LabelFeaturizer.

    Parameters
    ----------
    one_hot : bool, optional
        Set for one-hot encoded outputs, defaults to False
    multilabel_sep : str, optional
        If given, splits the input label into multiple labels
        using the given separator, defaults to None.
    labels: Union[Iterable[str], str], optional
        If given, sets the labels and the ordering is used to map
        the labels to indices. That means the first item in this
        list will have label id 0, the next one id 1, etc.. When
        not provided, indices are assigned as labels are
        encountered during preprocessing. The list can also be
        provided as a file with a label on each line.

    Raises
    ------
    TypeError
        If ``labels`` is neither a file path (str) nor an iterable
        of label strings.

    """
    self.one_hot = one_hot
    self.multilabel_sep = multilabel_sep
    self.tokenizer = LabelTokenizer(multilabel_sep=self.multilabel_sep)
    if labels is not None:
        if isinstance(labels, str):
            # A string is interpreted as a path to a file with one
            # label per line.
            with open(labels, 'r') as f:
                label_list: List[str] = f.read().splitlines()
        elif isinstance(labels, Iterable):
            label_list = list(labels)
        else:
            # Previously this fell through and raised a confusing
            # NameError on `label_list` below; fail fast with a
            # clear message instead.
            raise TypeError(
                "labels must be a file path (str) or an iterable of "
                f"strings, got {type(labels).__name__}")
        # A fixed label set was supplied up front; indices follow
        # the given ordering.
        self.label_given = True
        self.vocab = odict(
            (label, i) for i, label in enumerate(label_list))
        self.label_count_dict = {label: 0 for label in self.vocab}
    else:
        # Labels will be discovered (and indexed) during
        # preprocessing.
        self.label_given = False
        self.vocab = odict()
        self.label_count_dict = dict()
    self.register_attrs('vocab')
    self.register_attrs('label_count_dict')
def __init__(self,
             one_hot: bool = False,
             multilabel_sep: Optional[str] = None) -> None:
    """Initializes the LabelFeaturizer.

    Parameters
    ----------
    one_hot : bool, optional
        Set for one-hot encoded outputs, defaults to False
    multilabel_sep : str, optional
        If given, splits the input label into multiple labels
        using the given separator, defaults to None.

    """
    self.one_hot = one_hot
    self.multilabel_sep = multilabel_sep
    # Pass by keyword for consistency with the sibling featurizer
    # initializers and the tokenizer tests, which all use
    # `multilabel_sep=` explicitly.
    tokenizer = LabelTokenizer(multilabel_sep=self.multilabel_sep)
    # No unk/pad tokens: labels are a closed set with no padding.
    super().__init__(tokenizer=tokenizer, unk_token=None, pad_token=None)
def __init__(self,
             one_hot: bool = False,
             multilabel_sep: Optional[str] = None,
             labels: Optional[Sequence[str]] = None) -> None:
    """Initializes the LabelFeaturizer.

    Parameters
    ----------
    one_hot : bool, optional
        Set for one-hot encoded outputs, defaults to False
    multilabel_sep : str, optional
        If given, splits the input label into multiple labels
        using the given separator, defaults to None.
    labels: Sequence[str], optional
        If given, sets the labels and the ordering is used to map
        the labels to indices. That means the first item in this
        list will have label id 0, the next one id 1, etc.. When
        not provided, indices are assigned as labels are
        encountered during preprocessing.

    """
    self.one_hot = one_hot
    self.multilabel_sep = multilabel_sep
    # Pass by keyword for consistency with the sibling featurizer
    # initializer and the tokenizer tests.
    self.tokenizer = LabelTokenizer(multilabel_sep=self.multilabel_sep)
    if labels is not None:
        # A fixed label set was supplied up front; indices follow
        # the given ordering.
        self.label_given = True
        self.vocab = odict((label, i) for i, label in enumerate(labels))
        self.label_count_dict = {label: 0 for label in self.vocab}
    else:
        # Labels will be discovered (and indexed) during
        # preprocessing.
        self.label_given = False
        self.vocab = odict()
        self.label_count_dict: Dict[str, int] = dict()
    self.register_attrs('vocab')
    self.register_attrs('label_count_dict')