@dataclass
class InferenceKata:
    tokenizer: Union[str, Tokenizer]
    spec: KataSpec = KataSpec()
    max_length: Optional[int] = None

    def __post_init__(self):
        if isinstance(self.tokenizer, str):
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer, use_fast=True)

    @overload
    def of(self, values: List[str]): ...

    # noinspection PyTypeChecker
    def of(self, *args):
        def preprocess_function(examples):
            assert callable(self.tokenizer), "tokenizer is not callable"
            return self.tokenizer(examples["text"], truncation=True, max_length=self.max_length)

        values = args[0]
        spec = KataSpec(remove_columns=["text"], max_seq_length=self.max_length)
        collator = DataCollatorWithPadding(self.tokenizer, padding="max_length", max_length=self.max_length)
        return DictKata(spec, preprocess_function, collator, {"text": values})
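# Usage sketch for InferenceKata: the checkpoint id and the example sentences below
# are illustrative only. `of` wraps raw strings into a DictKata that reuses the same
# tokenizer and max_length padding as the training katas.
if __name__ == "__main__":
    inference = InferenceKata("bert-base-uncased", max_length=128)
    kata = inference.of(["this movie was great", "this movie was terrible"])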
@dataclass
class Cola:
    tokenizer: Union[str, Tokenizer]
    spec: KataSpec = KataSpec()

    def __post_init__(self):
        if isinstance(self.tokenizer, str):
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer, use_fast=True)

    # noinspection PyTypeChecker
    def __call__(self):
        dataset = load_dataset(path="glue", name="cola", cache_dir=self.spec.cache_dir)
        encoded = dataset.map(function=self.preprocess_function,
                              batched=self.spec.batch_preprocess,
                              batch_size=self.spec.batch_preprocess_size,
                              remove_columns=["sentence"])
        data_collator = DataCollatorWithPadding(self.tokenizer,
                                                padding="max_length",
                                                max_length=self.spec.max_seq_length)
        return (DatasetKata(self.spec, encoded["train"], data_collator),
                DatasetKata(self.spec, encoded["validation"], data_collator),
                DatasetKata(self.spec, encoded["test"], data_collator))

    def preprocess_function(self, examples):
        assert callable(self.tokenizer), "tokenizer is not callable"
        return self.tokenizer(examples["sentence"],
                              truncation=True,
                              return_token_type_ids=True)  # FIXME

    @staticmethod
    def labels():
        return ["0", "1"]
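# Usage sketch for Cola: calling the factory loads GLUE/CoLA with datasets.load_dataset,
# tokenizes the "sentence" column, and returns train/validation/test katas. The
# checkpoint id is illustrative only.
if __name__ == "__main__":
    train_kata, val_kata, test_kata = Cola("bert-base-uncased")()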
@dataclass
class GermEval18:
    folder: str
    tokenizer: Union[str, Tokenizer]
    spec: KataSpec = KataSpec()

    def __post_init__(self):
        if isinstance(self.tokenizer, str):
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer, use_fast=True)

    # noinspection PyTypeChecker
    def katas(self):
        if not os.path.exists(self.folder):
            logger.info("Download GermEval18 dataset")
            _download_extract_downstream_data(self.folder + "/train.tsv", proxies=None)

        def preprocess_function(examples):
            labels = GermEval18.labels(True)
            examples["coarse_label"] = [labels.index(x) for x in examples["coarse_label"]]
            labels = GermEval18.labels(False)
            examples["fine_label"] = [labels.index(x) for x in examples["fine_label"]]
            assert callable(self.tokenizer), "tokenizer is not callable"
            return self.tokenizer(examples["text"],
                                  truncation=True,
                                  return_token_type_ids=True,  # FIXME
                                  max_length=self.spec.max_seq_length)

        spec = KataSpec(remove_columns=["text"], max_seq_length=self.spec.max_seq_length)
        collator = DataCollatorWithPadding(self.tokenizer,
                                           padding="max_length",
                                           max_length=self.spec.max_seq_length)
        return (TsvKata(spec, preprocess_function, collator, "../data/germeval18/train.tsv"),
                TsvKata(spec, preprocess_function, collator, "../data/germeval18/test.tsv"))

    @staticmethod
    def labels(coarse: bool):
        return ["OTHER", "OFFENSE"] if coarse else ["OTHER", "OFFENSE", "ABUSE", "INSULT", "PROFANITY"]
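# Sketch of the label mapping done in preprocess_function above: the coarse and fine
# string labels from the TSV are replaced by their index in GermEval18.labels(...).
if __name__ == "__main__":
    coarse, fine = GermEval18.labels(True), GermEval18.labels(False)
    assert [coarse.index(x) for x in ["OTHER", "OFFENSE"]] == [0, 1]
    assert [fine.index(x) for x in ["INSULT", "PROFANITY"]] == [3, 4]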
import logging

from aikido.__api__.kata_spec import KataSpec
from aikido.aikidoka.adaptive_transformer import AdaptiveTransformer
from aikido.dojo.base_dojo import BaseDojo
from aikido.dojo.listener import LearningRateStepListener, SeedListener, CudaListener
from aikido.dojo.listener.gradient_clipping_listener import GradientClippingListener
from aikido.kata.factory.germeval18 import GermEval18
from aikido.modeling.language_model import LanguageModel
from aikido.modeling.nn.head.text_classifier_head import TextClassifierHead
from aikido.modeling.optimization import transformers_adam_w_optimizer

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # model_name = "bert-base-german-cased"
    model_name = "german-nlp-group/electra-base-german-uncased"

    kata1, kata2 = GermEval18("../data/germeval18", model_name, KataSpec(max_seq_length=100)).katas()
    # kata1, kata2 = Cola()()

    language_model = LanguageModel.load(model_name)
    prediction_head = TextClassifierHead(labels=GermEval18.labels(coarse=True), label_name="coarse_label")
    model = AdaptiveTransformer(language_model, [prediction_head])

    optimizer = transformers_adam_w_optimizer()
    listeners = [
        LearningRateStepListener(),
        GradientClippingListener(),
        CudaListener(),
        # EvaluationListener(kata2, metric, evaluate_every=100),
    ]
import logging

from aikido.__api__.kata_spec import KataSpec
from aikido.aikidoka.adaptive_transformer import AdaptiveTransformer
from aikido.dojo.base_dojo import BaseDojo
from aikido.dojo.listener import LearningRateStepListener, SeedListener
from aikido.dojo.listener.gradient_clipping_listener import GradientClippingListener
from aikido.kata.factory.toxic_comments import ToxicComments
from aikido.modeling.language_model import LanguageModel
from aikido.modeling.nn.head.text_classifier_head import TextClassifierHead
from aikido.modeling.optimization import transformers_adam_w_optimizer

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # model_name = "bert-base-uncased"
    model_name = "german-nlp-group/electra-base-german-uncased"

    kata1, kata2 = ToxicComments("../data/toxic-comments", model_name, KataSpec(max_seq_length=100))()

    language_model = LanguageModel.load(model_name)
    prediction_head = TextClassifierHead(ToxicComments.labels(), "labels", multiclass=True)
    model = AdaptiveTransformer(language_model, [prediction_head])
    # model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(ToxicComments.labels()))

    optimizer = transformers_adam_w_optimizer()
    listeners = [
        LearningRateStepListener(),
        # AdjustLossListener(),
        GradientClippingListener(),
        # EvaluationListener(kata2, metrics="acc", evaluate_every=50),
        SeedListener(42),
    ]
    dojo = BaseDojo(DojoKun(optimizer, dans=1, batch_size=32, grad_acc_steps=5), listeners)
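# Note on the DojoKun configuration above (assuming grad_acc_steps denotes standard
# gradient accumulation, which this excerpt does not confirm): gradients of
# grad_acc_steps micro-batches are accumulated before each optimizer step, so the
# effective batch size is batch_size * grad_acc_steps = 32 * 5 = 160 examples.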
@dataclass
class GermEval14:
    tokenizer: Union[str, Tokenizer]
    spec: KataSpec = KataSpec()
    lang: str = "en"
    task: str = "ner"
    label_all_tokens: bool = True

    def __post_init__(self):
        if isinstance(self.tokenizer, str):
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer, use_fast=True)
        assert isinstance(self.tokenizer, PreTrainedTokenizerFast)

    # noinspection PyTypeChecker
    def __call__(self):
        dataset = load_dataset("germeval_14", cache_dir=self.spec.cache_dir)
        encoded = dataset.map(function=self.tokenize_and_align_labels,
                              batched=self.spec.batch_preprocess,
                              batch_size=self.spec.batch_preprocess_size,
                              remove_columns=['id', 'ner_tags', 'nested_ner_tags', 'source', 'tokens'])
        data_collator = DataCollatorForTokenClassification(self.tokenizer)
        return (DatasetKata(self.spec, encoded["train"], data_collator),
                DatasetKata(self.spec, encoded["validation"], data_collator),
                DatasetKata(self.spec, encoded["test"], data_collator))

    def tokenize_and_align_labels(self, examples):
        assert callable(self.tokenizer), "tokenizer is not callable"
        tokenized_inputs = self.tokenizer(examples["tokens"],
                                          truncation=True,
                                          is_split_into_words=True,
                                          return_token_type_ids=False)
        labels = []
        for i, label in enumerate(examples[f"{self.task}_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are
                # automatically ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                # For the other tokens in a word, we set the label to either the current label or -100,
                # depending on the label_all_tokens flag.
                else:
                    label_ids.append(label[word_idx] if self.label_all_tokens else -100)
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    @staticmethod
    def labels():
        return ["[PAD]", "X", "O",
                "B-MISC", "I-MISC", "B-PER", "I-PER",
                "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    @staticmethod
    def metrics():
        return [GermEval14Metric()]
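# Minimal sketch of the word_ids-based alignment used in tokenize_and_align_labels
# above, for the label_all_tokens=True case. The checkpoint id and the toy sentence
# are assumptions for illustration; any fast tokenizer behaves the same way.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("bert-base-german-cased", use_fast=True)
    enc = tok(["Angela Merkel besuchte Berlin".split()], is_split_into_words=True)
    word_ids = enc.word_ids(batch_index=0)   # e.g. [None, 0, 0, 1, 2, 3, None]
    ner_tags = [5, 6, 2, 9]                  # B-PER, I-PER, O, B-LOC (indices into labels())
    # special tokens (word id None) get -100; every sub-word piece inherits its word's label
    aligned = [-100 if w is None else ner_tags[w] for w in word_ids]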
@dataclass
class ToxicComments:
    folder: str
    tokenizer: Union[str, Tokenizer]
    spec: KataSpec = KataSpec()

    def __post_init__(self):
        if isinstance(self.tokenizer, str):
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer, use_fast=True)

    # noinspection PyTypeChecker
    def __call__(self):
        if not os.path.exists(self.folder):
            logger.info("Download ToxicComments dataset")
            _download_extract_downstream_data(self.folder + "/train.tsv", proxies=None)

        def preprocess_function(examples):
            # TODO: move to a utility class
            def hot_encoding(labels: Optional[str]):
                label_ids = [0] * len(ToxicComments.labels())
                if labels is None:
                    return label_ids
                for l in labels.split(","):
                    if l != "":
                        label_ids[ToxicComments.labels().index(l)] = 1
                return label_ids

            examples["label"] = [hot_encoding(x) for x in examples["label"]]
            assert callable(self.tokenizer), "tokenizer is not callable"
            return self.tokenizer(examples["text"],
                                  truncation=True,
                                  return_token_type_ids=True,  # FIXME
                                  max_length=self.spec.max_seq_length)

        spec = KataSpec(remove_columns=["text"], max_seq_length=self.spec.max_seq_length)
        collator = DataCollatorWithPadding(self.tokenizer,
                                           padding="max_length",
                                           max_length=self.spec.max_seq_length)
        return (TsvKata(spec, preprocess_function, collator, "../data/toxic-comments/train.tsv", quote_char='"'),
                TsvKata(spec, preprocess_function, collator, "../data/toxic-comments/val.tsv", quote_char='"'))

    @staticmethod
    def labels():
        return ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
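# Sketch of the multi-hot encoding produced by hot_encoding above: the label column
# holds a comma-separated string per comment, which becomes a 0/1 vector over
# ToxicComments.labels(). The example value is made up for illustration.
if __name__ == "__main__":
    labels = ToxicComments.labels()
    raw = "toxic,insult"
    multi_hot = [1 if l in raw.split(",") else 0 for l in labels]
    # -> [1, 0, 0, 0, 1, 0]: "toxic" is index 0 and "insult" is index 4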
@dataclass
class Conll03:
    tokenizer: Union[str, Tokenizer]
    spec: KataSpec = KataSpec()
    lang: str = "en"
    task: str = "ner"
    label_all_tokens: bool = True

    def __post_init__(self):
        if isinstance(self.tokenizer, str):
            self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer, use_fast=True)
        assert isinstance(self.tokenizer, PreTrainedTokenizerFast)

    # noinspection PyTypeChecker
    def __call__(self):
        if self.lang == "de":
            # dataset = load_dataset('../config/conll2003-de/', 'conll2003-de', cache_dir=self.spec.cache_dir)
            dataset = load_dataset('D:/IntellijProjects/aikido2/aikido/kata/config/conll2003-de/',
                                   'conll2003-de',
                                   cache_dir=self.spec.cache_dir)
        elif self.lang == "en":
            dataset = load_dataset("conll2003", cache_dir=self.spec.cache_dir)
        else:
            raise ValueError(f"cannot handle language {self.lang} for conll2003")

        encoded = dataset.map(function=self.tokenize_and_align_labels,
                              batched=self.spec.batch_preprocess,
                              batch_size=self.spec.batch_preprocess_size,
                              remove_columns=["id", "tokens", "pos_tags", "chunk_tags", "ner_tags"])
        data_collator = DataCollatorForTokenClassification(self.tokenizer)
        return (DatasetKata(self.spec, encoded["train"], data_collator),
                DatasetKata(self.spec, encoded["validation"], data_collator),
                DatasetKata(self.spec, encoded["test"], data_collator))

    def tokenize_and_align_labels(self, examples):
        assert callable(self.tokenizer), "tokenizer is not callable"
        tokenized_inputs = self.tokenizer(examples["tokens"],
                                          truncation=True,
                                          is_split_into_words=True,
                                          return_token_type_ids=False)
        labels = []
        for i, label in enumerate(examples[f"{self.task}_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are
                # automatically ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                # For the other tokens in a word, we set the label to either the current label or -100,
                # depending on the label_all_tokens flag.
                else:
                    label_ids.append(label[word_idx] if self.label_all_tokens else -100)
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    @staticmethod
    def labels():
        return ["[PAD]", "X", "O",
                "B-MISC", "I-MISC", "B-PER", "I-PER",
                "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]

    @staticmethod
    def metrics():
        return [Conll2003Metric()]
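# Minimal sketch of how DataCollatorForTokenClassification batches the encoded examples
# produced above: input_ids are padded with the tokenizer's pad token and the "labels"
# lists are padded with -100 (the collator's default label_pad_token_id), so padded
# positions are ignored by the loss. Checkpoint id and token ids are illustrative only.
if __name__ == "__main__":
    from transformers import AutoTokenizer, DataCollatorForTokenClassification

    tok = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)
    collator = DataCollatorForTokenClassification(tok)
    features = [
        {"input_ids": [101, 7592, 102], "labels": [-100, 2, -100]},
        {"input_ids": [101, 7592, 2088, 999, 102], "labels": [-100, 5, 6, 2, -100]},
    ]
    batch = collator(features)  # both sequences padded to length 5; labels padded with -100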