Example #1
@dataclass
class InferenceKata:
    tokenizer: Union[str, Tokenizer]
    spec: KataSpec = KataSpec()
    max_length: Optional[int] = None

    def __post_init__(self):
        if type(self.tokenizer) is str:
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer,
                                                           use_fast=True)

    @overload
    def of(self, values: List[str]):
        ...

    # noinspection PyTypeChecker
    def of(self, *args):
        def preprocess_function(examples):
            assert callable(self.tokenizer), "tokenizer is not callable"
            return self.tokenizer(examples["text"],
                                  truncation=True,
                                  max_length=self.max_length)

        values = args[0]

        spec = KataSpec(remove_columns=["text"],
                        max_seq_length=self.max_length)
        collator = DataCollatorWithPadding(self.tokenizer,
                                           padding="max_length",
                                           max_length=self.max_length)
        return DictKata(spec, preprocess_function, collator, {"text": values})
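
A minimal usage sketch for InferenceKata (the checkpoint name and input strings below are illustrative, not taken from the project; how the returned DictKata is consumed depends on the rest of the aikido pipeline):

# Hypothetical usage: wrap a few raw strings into a DictKata for inference.
kata = InferenceKata("bert-base-uncased", max_length=128)
inference_kata = kata.of(["this movie was great", "this movie was terrible"])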
Example #2
@dataclass
class Cola:
    tokenizer: Union[str, Tokenizer]
    spec: KataSpec = KataSpec()

    def __post_init__(self):
        if type(self.tokenizer) is str:
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer, use_fast=True)

    # noinspection PyTypeChecker
    def __call__(self):
        dataset = load_dataset(path="glue", name="cola", cache_dir=self.spec.cache_dir)
        encoded = dataset.map(function=self.preprocess_function,
                              batched=self.spec.batch_preprocess,
                              batch_size=self.spec.batch_preprocess_size,
                              remove_columns=["sentence"])

        data_collator = DataCollatorWithPadding(self.tokenizer,
                                                padding="max_length",
                                                max_length=self.spec.max_seq_length)

        return (
            DatasetKata(self.spec, encoded["train"], data_collator),
            DatasetKata(self.spec, encoded["validation"], data_collator),
            DatasetKata(self.spec, encoded["test"], data_collator)
        )

    def preprocess_function(self, examples):
        assert callable(self.tokenizer), "tokenizer is not callable"
        return self.tokenizer(examples["sentence"], truncation=True,
                              return_token_type_ids=True  # FIXME
                              )

    @staticmethod
    def labels():
        return ["0", "1"]
Example #3
    def katas(self):
        if not os.path.exists(self.folder):
            logger.info("Downloading GermEval18 dataset")
            _download_extract_downstream_data(os.path.join(self.folder, "train.tsv"),
                                              proxies=None)

        def preprocess_function(examples):
            labels = GermEval18.labels(True)
            examples["coarse_label"] = [
                labels.index(x) for x in examples["coarse_label"]
            ]

            labels = GermEval18.labels(False)
            examples["fine_label"] = [
                labels.index(x) for x in examples["fine_label"]
            ]

            assert callable(self.tokenizer), "tokenizer is not callable"
            return self.tokenizer(
                examples["text"],
                truncation=True,
                return_token_type_ids=True,  # FIXME
                max_length=self.spec.max_seq_length)

        spec = KataSpec(remove_columns=["text"],
                        max_seq_length=self.spec.max_seq_length)
        collator = DataCollatorWithPadding(self.tokenizer,
                                           padding="max_length",
                                           max_length=self.spec.max_seq_length)
        return (TsvKata(spec, preprocess_function, collator,
                        os.path.join(self.folder, "train.tsv")),
                TsvKata(spec, preprocess_function, collator,
                        os.path.join(self.folder, "test.tsv")))
Example #4
@dataclass
class GermEval18:
    folder: str
    tokenizer: Union[str, Tokenizer]
    spec: KataSpec = KataSpec()

    def __post_init__(self):
        if type(self.tokenizer) is str:
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer,
                                                           use_fast=True)

    # noinspection PyTypeChecker
    def katas(self):
        if not os.path.exists(self.folder):
            logger.info("Downloading GermEval18 dataset")
            _download_extract_downstream_data(os.path.join(self.folder, "train.tsv"),
                                              proxies=None)

        def preprocess_function(examples):
            labels = GermEval18.labels(True)
            examples["coarse_label"] = [
                labels.index(x) for x in examples["coarse_label"]
            ]

            labels = GermEval18.labels(False)
            examples["fine_label"] = [
                labels.index(x) for x in examples["fine_label"]
            ]

            assert callable(self.tokenizer), "tokenizer is not callable"
            return self.tokenizer(
                examples["text"],
                truncation=True,
                return_token_type_ids=True,  # FIXME
                max_length=self.spec.max_seq_length)

        spec = KataSpec(remove_columns=["text"],
                        max_seq_length=self.spec.max_seq_length)
        collator = DataCollatorWithPadding(self.tokenizer,
                                           padding="max_length",
                                           max_length=self.spec.max_seq_length)
        return (TsvKata(spec, preprocess_function, collator,
                        os.path.join(self.folder, "train.tsv")),
                TsvKata(spec, preprocess_function, collator,
                        os.path.join(self.folder, "test.tsv")))

    @staticmethod
    def labels(coarse: bool):
        return ["OTHER", "OFFENSE"] if coarse else [
            "OTHER", "OFFENSE", "ABUSE", "INSULT", "PROFANITY"
        ]
Example #5
    def of(self, *args):
        def preprocess_function(examples):
            assert callable(self.tokenizer), "tokenizer is not callable"
            return self.tokenizer(examples["text"],
                                  truncation=True,
                                  max_length=self.max_length)

        values = args[0]

        spec = KataSpec(remove_columns=["text"],
                        max_seq_length=self.max_length)
        collator = DataCollatorWithPadding(self.tokenizer,
                                           padding="max_length",
                                           max_length=self.max_length)
        return DictKata(spec, preprocess_function, collator, {"text": values})
Example #6
    def __call__(self):
        if not os.path.exists(self.folder):
            logger.info("Downloading ToxicComments dataset")
            _download_extract_downstream_data(os.path.join(self.folder, "train.tsv"),
                                              proxies=None)

        def preprocess_function(examples):
            # TODO: move to a utility class
            def hot_encoding(labels: Optional[str]):
                label_ids = [0] * len(ToxicComments.labels())

                if labels is None:
                    return label_ids

                for l in labels.split(","):
                    if l != "":
                        label_ids[ToxicComments.labels().index(l)] = 1
                return label_ids

            examples["label"] = [hot_encoding(x) for x in examples["label"]]

            assert callable(self.tokenizer), "tokenizer is not callable"
            return self.tokenizer(
                examples["text"],
                truncation=True,
                return_token_type_ids=True,  # FIXME
                max_length=self.spec.max_seq_length)

        spec = KataSpec(remove_columns=["text"],
                        max_seq_length=self.spec.max_seq_length)
        collator = DataCollatorWithPadding(self.tokenizer,
                                           padding="max_length",
                                           max_length=self.spec.max_seq_length)
        return (TsvKata(spec,
                        preprocess_function,
                        collator,
                        os.path.join(self.folder, "train.tsv"),
                        quote_char='"'),
                TsvKata(spec,
                        preprocess_function,
                        collator,
                        os.path.join(self.folder, "val.tsv"),
                        quote_char='"'))
Example #7
import logging

from aikido.__api__.kata_spec import KataSpec
from aikido.aikidoka.adaptive_transformer import AdaptiveTransformer
from aikido.dojo.base_dojo import BaseDojo
from aikido.dojo.listener import LearningRateStepListener, SeedListener, CudaListener
from aikido.dojo.listener.gradient_clipping_listener import GradientClippingListener
from aikido.kata.factory.germeval18 import GermEval18
from aikido.modeling.language_model import LanguageModel
from aikido.modeling.nn.head.text_classifier_head import TextClassifierHead
from aikido.modeling.optimization import transformers_adam_w_optimizer

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # model_name = "bert-base-german-cased"
    model_name = "german-nlp-group/electra-base-german-uncased"
    kata1, kata2 = GermEval18("../data/germeval18", model_name,
                              KataSpec(max_seq_length=100)).katas()
    # kata1, kata2 = Cola()()

    language_model = LanguageModel.load(model_name)
    prediction_head = TextClassifierHead(labels=GermEval18.labels(coarse=True),
                                         label_name="coarse_label")

    model = AdaptiveTransformer(language_model, [prediction_head])

    optimizer = transformers_adam_w_optimizer()

    listeners = [
        LearningRateStepListener(),
        GradientClippingListener(),
        CudaListener(),
        # EvaluationListener(kata2, metric, evaluate_every=100),
        SeedListener(42),
    ]
Example #8
import logging

from aikido.__api__.kata_spec import KataSpec
from aikido.aikidoka.adaptive_transformer import AdaptiveTransformer
from aikido.dojo.base_dojo import BaseDojo
from aikido.dojo.listener import LearningRateStepListener, SeedListener
from aikido.dojo.listener.gradient_clipping_listener import GradientClippingListener
from aikido.kata.factory.toxic_comments import ToxicComments
from aikido.modeling.language_model import LanguageModel
from aikido.modeling.nn.head.text_classifier_head import TextClassifierHead
from aikido.modeling.optimization import transformers_adam_w_optimizer

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # model_name = "bert-base-uncased"
    model_name = "german-nlp-group/electra-base-german-uncased"
    kata1, kata2 = ToxicComments("../data/toxic-comments", model_name, KataSpec(max_seq_length=100))()

    language_model = LanguageModel.load(model_name)
    prediction_head = TextClassifierHead(ToxicComments.labels(), "labels", multiclass=True)

    model = AdaptiveTransformer(language_model, [prediction_head])
    # model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(ToxicComments.labels()))
    optimizer = transformers_adam_w_optimizer()

    listeners = [LearningRateStepListener(),
                 # AdjustLossListener(),
                 GradientClippingListener(),
                 # EvaluationListener(kata2, metrics="acc", evaluate_every=50),
                 SeedListener(42)]

    dojo = BaseDojo(DojoKun(optimizer, dans=1, batch_size=32, grad_acc_steps=5), listeners)
Example #9
@dataclass
class GermEval14:
    tokenizer: Union[str, Tokenizer]
    spec: KataSpec = KataSpec()
    lang: str = "en"
    task: str = "ner"
    label_all_tokens: bool = True

    def __post_init__(self):
        if type(self.tokenizer) is str:
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer,
                                                           use_fast=True)
            assert isinstance(self.tokenizer, PreTrainedTokenizerFast)

    # noinspection PyTypeChecker
    def __call__(self):
        dataset = load_dataset("germeval_14", cache_dir=self.spec.cache_dir)
        encoded = dataset.map(function=self.tokenize_and_align_labels,
                              batched=self.spec.batch_preprocess,
                              batch_size=self.spec.batch_preprocess_size,
                              remove_columns=[
                                  'id', 'ner_tags', 'nested_ner_tags',
                                  'source', 'tokens'
                              ])

        data_collator = DataCollatorForTokenClassification(self.tokenizer)

        return (DatasetKata(self.spec, encoded["train"], data_collator),
                DatasetKata(self.spec, encoded["validation"], data_collator),
                DatasetKata(self.spec, encoded["test"], data_collator))

    def tokenize_and_align_labels(self, examples):
        assert callable(self.tokenizer), "tokenizer is not callable"
        tokenized_inputs = self.tokenizer(examples["tokens"],
                                          truncation=True,
                                          is_split_into_words=True,
                                          return_token_type_ids=False)

        labels = []
        for i, label in enumerate(examples[f"{self.task}_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(
                        label[word_idx] if self.label_all_tokens else -100)
                previous_word_idx = word_idx

            labels.append(label_ids)

        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    @staticmethod
    def labels():
        return [
            "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
            "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
        ]

    @staticmethod
    def metrics():
        return [GermEval14Metric()]
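
To make the label alignment concrete, here is a small self-contained sketch of the same -100 masking rule on made-up values (the word ids and label ids are invented for illustration; real ones come from the fast tokenizer and the dataset):

# Word-level tags for four words; a fast tokenizer splits word 1 into two
# sub-tokens and adds special tokens, which word_ids() reports as None.
word_labels = [5, 6, 2, 9]
word_ids = [None, 0, 1, 1, 2, 3, None]

label_all_tokens = True
label_ids, previous = [], None
for word_idx in word_ids:
    if word_idx is None:                 # special tokens: ignored by the loss
        label_ids.append(-100)
    elif word_idx != previous:           # first sub-token of a word keeps its label
        label_ids.append(word_labels[word_idx])
    else:                                # continuation sub-tokens: label or -100
        label_ids.append(word_labels[word_idx] if label_all_tokens else -100)
    previous = word_idx

print(label_ids)  # [-100, 5, 6, 6, 2, 9, -100]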
Example #10
@dataclass
class ToxicComments:
    folder: str
    tokenizer: Union[str, Tokenizer]
    spec: KataSpec = KataSpec()

    def __post_init__(self):
        if type(self.tokenizer) is str:
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer,
                                                           use_fast=True)

    # noinspection PyTypeChecker
    def __call__(self):
        if not os.path.exists(self.folder):
            logger.info("Downloading ToxicComments dataset")
            _download_extract_downstream_data(os.path.join(self.folder, "train.tsv"),
                                              proxies=None)

        def preprocess_function(examples):
            # TODO: move to a utility class
            def hot_encoding(labels: Optional[str]):
                label_ids = [0] * len(ToxicComments.labels())

                if labels is None:
                    return label_ids

                for l in labels.split(","):
                    if l != "":
                        label_ids[ToxicComments.labels().index(l)] = 1
                return label_ids

            examples["label"] = [hot_encoding(x) for x in examples["label"]]

            assert callable(self.tokenizer), "tokenizer is not callable"
            return self.tokenizer(
                examples["text"],
                truncation=True,
                return_token_type_ids=True,  # FIXME
                max_length=self.spec.max_seq_length)

        spec = KataSpec(remove_columns=["text"],
                        max_seq_length=self.spec.max_seq_length)
        collator = DataCollatorWithPadding(self.tokenizer,
                                           padding="max_length",
                                           max_length=self.spec.max_seq_length)
        return (TsvKata(spec,
                        preprocess_function,
                        collator,
                        os.path.join(self.folder, "train.tsv"),
                        quote_char='"'),
                TsvKata(spec,
                        preprocess_function,
                        collator,
                        os.path.join(self.folder, "val.tsv"),
                        quote_char='"'))

    @staticmethod
    def labels():
        return [
            "toxic", "severe_toxic", "obscene", "threat", "insult",
            "identity_hate"
        ]
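
The nested hot_encoding helper maps each row's comma-separated label string onto a multi-hot vector in the order of labels(); a small standalone illustration (the input strings are made up):

# Standalone sketch of the multi-hot encoding used above.
label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

def hot_encoding(raw):
    label_ids = [0] * len(label_names)
    if raw is None:
        return label_ids
    for name in raw.split(","):
        if name != "":
            label_ids[label_names.index(name)] = 1
    return label_ids

print(hot_encoding("toxic,insult"))  # [1, 0, 0, 0, 1, 0]
print(hot_encoding(""))              # [0, 0, 0, 0, 0, 0]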
Example #11
@dataclass
class Conll03:
    tokenizer: Union[str, Tokenizer]
    spec: KataSpec = KataSpec()
    lang: str = "en"
    task: str = "ner"
    label_all_tokens: bool = True

    def __post_init__(self):
        if type(self.tokenizer) is str:
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer,
                                                           use_fast=True)
            assert isinstance(self.tokenizer, PreTrainedTokenizerFast)

    # noinspection PyTypeChecker
    def __call__(self):
        if self.lang == "de":
            # dataset = load_dataset('../config/conll2003-de/', 'conll2003-de', cache_dir=self.spec.cache_dir)
            dataset = load_dataset(
                'D:/IntellijProjects/aikido2/aikido/kata/config/conll2003-de/',
                'conll2003-de',
                cache_dir=self.spec.cache_dir)
        elif self.lang == "en":
            dataset = load_dataset("conll2003", cache_dir=self.spec.cache_dir)
        else:
            raise ValueError(
                f"cannot handle language {self.lang} for conll2003")

        encoded = dataset.map(function=self.tokenize_and_align_labels,
                              batched=self.spec.batch_preprocess,
                              batch_size=self.spec.batch_preprocess_size,
                              remove_columns=[
                                  "id", "tokens", "pos_tags", "chunk_tags",
                                  "ner_tags"
                              ])

        data_collator = DataCollatorForTokenClassification(self.tokenizer)

        return (DatasetKata(self.spec, encoded["train"], data_collator),
                DatasetKata(self.spec, encoded["validation"], data_collator),
                DatasetKata(self.spec, encoded["test"], data_collator))

    def tokenize_and_align_labels(self, examples):
        assert callable(self.tokenizer), "tokenizer is not callable"
        tokenized_inputs = self.tokenizer(examples["tokens"],
                                          truncation=True,
                                          is_split_into_words=True,
                                          return_token_type_ids=False)

        labels = []
        for i, label in enumerate(examples[f"{self.task}_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(
                        label[word_idx] if self.label_all_tokens else -100)
                previous_word_idx = word_idx

            labels.append(label_ids)

        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    @staticmethod
    def labels():
        return [
            "[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG",
            "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"
        ]

    @staticmethod
    def metrics():
        return [Conll2003Metric()]
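
And a hedged usage sketch for Conll03; like the other factories, __call__ returns train, validation and test katas (the checkpoint name is illustrative):

# Hypothetical usage: English CoNLL-2003 splits for token classification.
train_kata, val_kata, test_kata = Conll03("bert-base-cased",
                                          KataSpec(max_seq_length=128),
                                          lang="en")()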