        # (fragment) Tail of the epoch_end closure built by this task's
        # validation_method: micro-averaged accuracy for the whole epoch,
        # i.e. total correct predictions divided by total predictions.
        # NOTE(review): assumes `outputs` is the list of per-batch dicts
        # produced by the matching step function — confirm upstream.
        acc = sum([output[f'{metric_tag}/true'] for output in outputs]) / \
            sum([output[f'{metric_tag}/all'] for output in outputs])
        # Epoch-level logging only (on_step=False); shown in the progress bar.
        self.log_dict(
            dictionary={loss_tag: loss, metric_tag: acc},
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True
        )
        if ret:
            return acc

    return step, epoch_end


# Task descriptor for part-of-speech tagging; reported metric is accuracy.
task_info = TaskInfo(
    task_name='pos',
    metric_name='acc',
    build_dataset=build_dataset,
    validation_method=validation_method
)


def add_task_specific_args(parent_parser):
    """Extend the caller's parser with POS-task CLI options.

    NOTE(review): the argument list may continue past this chunk.
    """
    parser = ArgumentParser(parents=[parent_parser], add_help=False)
    parser.add_argument('--tune', action='store_true')
    parser.add_argument('--offline', action='store_true')
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--gpus_per_trial', type=float, default=1.0)
    parser.add_argument('--cpus_per_trial', type=float, default=1.0)
    parser.add_argument('--seed', type=int, default=19980524)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--data_dir', type=str, required=True)
    # (fragment) Tail of build_method: build test-phase hooks from the shared
    # validation_method (metric tags prefixed "test_") and bind them, plus the
    # test dataloader, onto the model instance as bound methods.
    test_step, test_epoch_end = task_info.validation_method(
        multi_metric,
        loss_tag='test_loss',
        metric_tags={
            task_name: f"test_{task_module.task_info.metric_name}"
            for task_name, task_module in task_builder.items()
        },
        metric_tag=f"test_{task_info.metric_name}")

    model.test_dataloader = types.MethodType(test_dataloader, model)
    model.test_step = types.MethodType(test_step, model)
    model.test_epoch_end = types.MethodType(test_epoch_end, model)


# Task descriptor for joint multi-task training; the reported metric is the
# mean over the per-task metrics.
task_info = TaskInfo(task_name='multitask',
                     metric_name='metric_mean',
                     build_dataset=build_dataset,
                     validation_method=validation_method)


def add_task_specific_args(parent_parser):
    """Extend the caller's parser with multi-task CLI options.

    `--tau` is the task-sampling temperature used by MultiTaskDataloader.
    NOTE(review): the argument list may continue past this chunk.
    """
    parser = ArgumentParser(parents=[parent_parser], add_help=False)
    parser.add_argument('--seed', type=int, default=19980524)
    parser.add_argument('--tune', action='store_true')
    parser.add_argument('--offline', action='store_true')
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=8)
    parser.add_argument('--gpus_per_trial', type=float, default=1.0)
    parser.add_argument('--cpus_per_trial', type=float, default=5.0)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_samples', type=int, default=10)
    parser.add_argument('--tau', type=float, default=0.8)
def build_method(model: Model, task_info: TaskInfo):
    """Wire multi-task datasets, dataloaders, the optimizer and the
    train/val/test hooks onto `model` by binding closures with
    types.MethodType.

    The closures below capture `multi_dataset` / `multi_metric`, so the
    model must not outlive this wiring being redone on reload.
    """
    multi_dataset, multi_metric = task_info.build_dataset(
        model,
        seg=model.hparams.seg_data_dir,
        pos=model.hparams.pos_data_dir,
        ner=model.hparams.ner_data_dir,
        dep=model.hparams.dep_data_dir,
        sdp=model.hparams.sdp_data_dir,
        srl=model.hparams.srl_data_dir)

    def train_dataloader(self):
        # One DataLoader per task over its TRAIN split; MultiTaskDataloader
        # mixes them using sampling temperature tau.
        multi_dataloader = {
            task: torch.utils.data.DataLoader(task_dataset[datasets.Split.TRAIN],
                                              batch_size=self.hparams.batch_size,
                                              collate_fn=collate,
                                              num_workers=self.hparams.num_workers,
                                              pin_memory=True)
            for task, task_dataset in multi_dataset.items()
        }
        res = MultiTaskDataloader(tau=self.hparams.tau, **multi_dataloader)
        return res

    def training_step(self, batch, batch_idx):
        # Forward the whole batch dict; log the scalar loss, return the tensor.
        result = self(**batch)
        self.log("loss", result.loss.item())
        return {"loss": result.loss}

    def val_dataloader(self):
        # List of per-task VALIDATION loaders (Lightning multi-dataloader).
        return [
            torch.utils.data.DataLoader(
                task_dataset[datasets.Split.VALIDATION],
                batch_size=self.hparams.batch_size,
                collate_fn=collate,
                num_workers=self.hparams.num_workers,
                pin_memory=True)
            for task, task_dataset in multi_dataset.items()
        ]

    def test_dataloader(self):
        # Same shape as val_dataloader, over the TEST split.
        return [
            torch.utils.data.DataLoader(task_dataset[datasets.Split.TEST],
                                        batch_size=self.hparams.batch_size,
                                        collate_fn=collate,
                                        num_workers=self.hparams.num_workers,
                                        pin_memory=True)
            for task, task_dataset in multi_dataset.items()
        ]

    # AdamW + LR scheduler
    def configure_optimizers(self: Model):
        # Steps per epoch = sum over tasks of ceil(len(train) / batch_size).
        num_epoch_steps = sum(
            (len(dataset[datasets.Split.TRAIN]) + self.hparams.batch_size - 1)
            // self.hparams.batch_size for dataset in multi_dataset.values())
        num_train_steps = num_epoch_steps * self.hparams.max_epochs
        optimizer, scheduler = optimization.from_argparse_args(
            self.hparams,
            model=self,
            num_train_steps=num_train_steps,
            n_transformer_layers=self.transformer.config.num_hidden_layers)
        # Scheduler advances every optimizer step, not every epoch.
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

    model.configure_optimizers = types.MethodType(configure_optimizers, model)
    model.train_dataloader = types.MethodType(train_dataloader, model)
    model.training_step = types.MethodType(training_step, model)

    # Validation hooks: per-task metric tags like "val_<metric>".
    validation_step, validation_epoch_end = task_info.validation_method(
        multi_metric,
        loss_tag='val_loss',
        metric_tags={
            task_name: f"val_{task_module.task_info.metric_name}"
            for task_name, task_module in task_builder.items()
        },
        metric_tag=f"val_{task_info.metric_name}")

    model.val_dataloader = types.MethodType(val_dataloader, model)
    model.validation_step = types.MethodType(validation_step, model)
    model.validation_epoch_end = types.MethodType(validation_epoch_end, model)

    # Test hooks: identical machinery with "test_" tags.
    test_step, test_epoch_end = task_info.validation_method(
        multi_metric,
        loss_tag='test_loss',
        metric_tags={
            task_name: f"test_{task_module.task_info.metric_name}"
            for task_name, task_module in task_builder.items()
        },
        metric_tag=f"test_{task_info.metric_name}")

    model.test_dataloader = types.MethodType(test_dataloader, model)
    model.test_step = types.MethodType(test_step, model)
    model.test_epoch_end = types.MethodType(test_epoch_end, model)
                # (fragment) Tail of the epoch_end closure for dependency
                # parsing: log epoch-level loss and LAS (labelled attachment
                # score).
                loss_tag: loss,
                metric_tag: las
            },
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True)
        if ret:
            return las

    return step, epoch_end


# Task descriptor for dependency parsing; reported metric is LAS.
task_info = TaskInfo(task_name='dep',
                     metric_name='las',
                     build_dataset=build_dataset,
                     validation_method=validation_method)


def add_task_specific_args(parent_parser):
    """Extend the caller's parser with dependency-parsing CLI options.

    NOTE(review): `--cpus_per_trial` defaults to the int 1 here but to the
    float 1.0 in the sibling task files — harmless (type=float coerces CLI
    input) but inconsistent; confirm intent.
    NOTE(review): the argument list may continue past this chunk.
    """
    parser = ArgumentParser(parents=[parent_parser], add_help=False)
    parser.add_argument('--tune', action='store_true')
    parser.add_argument('--offline', action='store_true')
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--seed', type=int, default=19980524)
    parser.add_argument('--gpus_per_trial', type=float, default=1.0)
    parser.add_argument('--cpus_per_trial', type=float, default=1)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_samples', type=int, default=10)
    parser.add_argument('--data_dir', type=str, required=True)
from argparse import ArgumentParser

import numpy as np
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from transformers import AutoTokenizer

from ltp import optimization
from ltp.data import dataset as datasets
from ltp.data.utils import collate
from ltp.transformer_biaffine import TransformerBiaffine as Model, sdp_loss
from ltp.utils import TaskInfo, common_train, map2device, convert2npy

# Keep HF fast-tokenizer parallelism enabled in DataLoader workers.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Task descriptor for semantic dependency parsing; reported metric is F1.
task_info = TaskInfo(task_name='sdp', metric_name='f1')

# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/task_segmention.py --data_dir=data/seg --num_labels=2 --max_epochs=10 --batch_size=16 --gpus=1 --precision=16 --auto_lr_find=lr


def get_graph_entities(arcs, labels):
    """Return the set of labelled edges encoded by `arcs` and `labels`.

    Every nonzero position of `arcs` becomes one tuple: the position's
    indices followed by the value of `labels` at that same position.
    """
    arc_index = torch.nonzero(arcs, as_tuple=False).cpu().detach().numpy()
    label_array = labels.cpu().detach().numpy()
    # Tuple-index label_array so each multi-dim position selects one element.
    return {(*position, label_array[position])
            for position in map(tuple, arc_index)}
                # (fragment) Tail of the epoch_end closure for semantic role
                # labeling: log epoch-level loss and F1.
                loss_tag: loss,
                metric_tag: f1
            },
            on_step=False,
            on_epoch=True,
            prog_bar=True,
            logger=True)
        if ret:
            return f1

    return step, epoch_end


# Task descriptor for semantic role labeling; reported metric is F1.
task_info = TaskInfo(task_name='srl',
                     metric_name='f1',
                     build_dataset=build_dataset,
                     validation_method=validation_method)


def add_task_specific_args(parent_parser):
    """Extend the caller's parser with SRL CLI options.

    NOTE(review): the argument list may continue past this chunk.
    """
    parser = ArgumentParser(parents=[parent_parser], add_help=False)
    parser.add_argument('--tune', action='store_true')
    parser.add_argument('--offline', action='store_true')
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--gpus_per_trial', type=float, default=1.0)
    parser.add_argument('--cpus_per_trial', type=float, default=1.0)
    parser.add_argument('--seed', type=int, default=19980524)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_samples', type=int, default=10)
    parser.add_argument('--data_dir', type=str, required=True)
def build_method(model: Model, task_info: TaskInfo):
    """Wire knowledge-distillation training onto `model`: pre-batched
    teacher-logit datasets, per-task distillation losses, a temperature
    schedule, and the Lightning train/val/test hooks (bound via
    types.MethodType).
    """
    (multi_dataset, distill_datasets, distill_datasets_extra), multi_metric = build_dataset(
        model,
        seg=model.hparams.seg_data_dir,
        pos=model.hparams.pos_data_dir,
        ner=model.hparams.ner_data_dir,
        dep=model.hparams.dep_data_dir,
        sdp=model.hparams.sdp_data_dir,
        srl=model.hparams.srl_data_dir)

    # Tasks whose distillation is switched off via --disable_<task> flags.
    disable_distill = {
        'seg': model.hparams.disable_seg,
        'pos': model.hparams.disable_pos,
        'ner': model.hparams.disable_ner,
        'dep': model.hparams.disable_dep,
        'sdp': model.hparams.disable_sdp,
        'srl': model.hparams.disable_srl,
    }
    # Reduce the flag dict to the set of disabled task names.
    disable_distill = {
        task
        for task, disable in disable_distill.items() if disable
    }

    # FLSW distillation-temperature schedule around a base temperature.
    temperature_scheduler = flsw_temperature_scheduler_builder(
        beta=model.hparams.distill_beta,
        gamma=model.hparams.distill_gamma,
        base_temperature=model.hparams.temperature)

    def train_dataloader(self):
        # distill_datasets are pre-batched (hence batch_size=None); mix the
        # per-task loaders with MultiTaskDataloader at temperature tau.
        multi_dataloader = {
            task: torch.utils.data.DataLoader(task_dataset,
                                              batch_size=None,
                                              num_workers=self.hparams.num_workers,
                                              pin_memory=True,
                                              shuffle=True)
            for task, task_dataset in distill_datasets.items()
        }
        res = MultiTaskDataloader(tau=self.hparams.tau, **multi_dataloader)
        return res

    def training_step(self: Model, batch, batch_idx):
        task = batch['task']
        # Teacher logits ride along in the batch; pop before the forward pass.
        target_logits = batch.pop('logits')
        result = self(**batch)
        norm_loss = result.loss
        if task not in disable_distill:
            distill_loss = distill_loss_map[task](
                batch,
                result,
                target_logits,
                temperature_scheduler,
                model,
                extra=distill_datasets_extra[task])
            # Linearly shift weight from the distillation loss to the normal
            # task loss as training progresses (fraction of steps completed).
            # NOTE(review): self.num_train_steps is set in
            # configure_optimizers below — confirm it runs before stepping.
            distill_loss_weight = self.global_step / self.num_train_steps
            loss = distill_loss_weight * norm_loss + (
                1 - distill_loss_weight) * distill_loss
            self.log("distill_loss", distill_loss.item())
            self.log("norm_loss", norm_loss.item())
            self.log("loss", loss.item())
            return {"loss": loss}
        else:
            self.log("loss", norm_loss.item())
            return {"loss": norm_loss}

    def val_dataloader(self):
        # Per-task batch-size override (<task>_batch_size) falls back to the
        # global batch size when unset/falsy.
        return [
            torch.utils.data.DataLoader(
                task_dataset[datasets.Split.VALIDATION],
                batch_size=getattr(self.hparams, f'{task}_batch_size')
                or self.hparams.batch_size,
                collate_fn=collate,
                num_workers=self.hparams.num_workers,
                pin_memory=True)
            for task, task_dataset in multi_dataset.items()
        ]

    def test_dataloader(self):
        # Same shape as val_dataloader, over the TEST split.
        return [
            torch.utils.data.DataLoader(
                task_dataset[datasets.Split.TEST],
                batch_size=getattr(self.hparams, f'{task}_batch_size')
                or self.hparams.batch_size,
                collate_fn=collate,
                num_workers=self.hparams.num_workers,
                pin_memory=True)
            for task, task_dataset in multi_dataset.items()
        ]

    # AdamW + LR scheduler
    def configure_optimizers(self: Model):
        # Datasets are pre-batched, so len(dataset) is already a step count.
        num_epoch_steps = sum(
            len(dataset) for dataset in distill_datasets.values())
        num_train_steps = num_epoch_steps * self.hparams.max_epochs
        # Stash for the distill-weight schedule used in training_step.
        setattr(self, 'num_train_steps', num_train_steps)
        optimizer, scheduler = optimization.from_argparse_args(
            self.hparams,
            model=self,
            num_train_steps=num_train_steps,
            n_transformer_layers=self.transformer.config.num_hidden_layers)
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

    model.configure_optimizers = types.MethodType(configure_optimizers, model)
    model.train_dataloader = types.MethodType(train_dataloader, model)
    model.training_step = types.MethodType(training_step, model)

    # Validation hooks ("preffix" [sic] is this validation_method's keyword).
    validation_step, validation_epoch_end = task_info.validation_method(
        multi_metric, task=task_info.task_name, preffix='val')

    model.val_dataloader = types.MethodType(val_dataloader, model)
    model.validation_step = types.MethodType(validation_step, model)
    model.validation_epoch_end = types.MethodType(validation_epoch_end, model)

    test_step, test_epoch_end = task_info.validation_method(
        multi_metric, task=task_info.task_name, preffix='test')

    model.test_dataloader = types.MethodType(test_dataloader, model)
    model.test_step = types.MethodType(test_step, model)
    model.test_epoch_end = types.MethodType(test_epoch_end, model)
# (file-header fragment) Named-entity-recognition task setup.
from ltp.data import dataset as datasets
from ltp.data.utils import collate
from ltp import optimization
from seqeval.metrics import f1_score
from ltp.transformer_rel_linear import TransformerRelLinear as Model
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from transformers import AutoTokenizer
from ltp.utils import TaskInfo, common_train, map2device, convert2npy

# Keep HF fast-tokenizer parallelism enabled in DataLoader workers.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Task descriptor for NER; reported metric is (seqeval) F1.
task_info = TaskInfo(task_name='ner', metric_name='f1')

# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/task_named_entity_recognition.py --data_dir=data/ner --num_labels=13 --max_epochs=10 --batch_size=16 --gpus=1 --precision=16


def build_dataset(model, data_dir):
    # Load BIO-tagged data; label vocabulary comes from ner_labels.txt in the
    # same directory, which also serves as the dataset cache dir.
    dataset = datasets.load_dataset(
        datasets.Bio,
        data_dir=data_dir,
        cache_dir=data_dir,
        bio=os.path.join(data_dir, "ner_labels.txt")
    )
    dataset.rename_column_('bio', 'labels')
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)

    # (fragment) tokenize() body continues past this chunk.
    def tokenize(examples):
# (file-header fragment) Dependency-parsing task setup.
from ltp.algorithms import eisner
from ltp.data import dataset as datasets
from ltp import optimization
from ltp.data.utils import collate
from ltp.transformer_biaffine import TransformerBiaffine as Model
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from transformers import AutoTokenizer
from ltp.utils import TaskInfo, common_train, map2device, convert2npy

# Keep HF fast-tokenizer parallelism enabled in DataLoader workers.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Task descriptor for dependency parsing; reported metric is LAS.
task_info = TaskInfo(task_name='dep', metric_name='las')

# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/task_segmention.py --data_dir=data/seg --num_labels=2 --max_epochs=10 --batch_size=16 --gpus=1 --precision=16 --auto_lr_find=lr


def build_dataset(model, data_dir):
    # Load CoNLL-U data; dependency-relation labels come from dep_labels.txt.
    dataset = datasets.load_dataset(
        datasets.Conllu,
        data_dir=data_dir,
        cache_dir=data_dir,
        deprel=os.path.join(data_dir, "dep_labels.txt")
    )
    # Keep only the columns the parser consumes (form, head, deprel).
    dataset.remove_columns_(["id", "lemma", "upos", "xpos", "feats", "deps", "misc"])
    dataset.rename_column_('deprel', 'labels')
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)
    # (fragment) build_dataset continues past this chunk.
# (file-header fragment) Part-of-speech-tagging task setup.
import torch.utils.data
import os
from tqdm import tqdm
from argparse import ArgumentParser
from ltp.data import dataset as datasets
from ltp.data.utils import collate
from ltp.transformer_linear import TransformerLinear as Model
from pytorch_lightning import Trainer
from transformers import AutoTokenizer
from ltp import optimization
from ltp.utils import TaskInfo, common_train, map2device, convert2npy

# Keep HF fast-tokenizer parallelism enabled in DataLoader workers.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Task descriptor for POS tagging; reported metric is accuracy.
task_info = TaskInfo(task_name='pos', metric_name='acc')

# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/task_part_of_speech.py --data_dir=data/pos --num_labels=27 --max_epochs=10 --batch_size=16 --gpus=1 --precision=16 --auto_lr_find=lr


def build_dataset(model, data_dir):
    # Load CoNLL-U data; the POS tag set comes from pos_labels.txt (xpos).
    dataset = datasets.load_dataset(
        datasets.Conllu,
        data_dir=data_dir,
        cache_dir=data_dir,
        xpos=os.path.join(data_dir, "pos_labels.txt")
    )
    # Keep only form + xpos; everything else is unused by the tagger.
    dataset.remove_columns_(["id", "lemma", "upos", "feats", "head", "deprel", "deps", "misc"])
    dataset.rename_column_('xpos', 'labels')
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer,
                                              use_fast=True)
    # (fragment) build_dataset continues past this chunk.
import torch.utils.data
from pytorch_lightning import Trainer

import ltp
from ltp import (optimization, task_segmention, task_part_of_speech,
                 task_named_entity_recognition, task_dependency_parsing,
                 task_semantic_dependency_parsing, task_semantic_role_labeling)
from ltp.data import dataset as datasets
from ltp.data.utils import collate, MultiTaskDataloader
from ltp.transformer_multitask import TransformerMultiTask as Model
from ltp.utils import TaskInfo, common_train, deploy_model

# Keep HF fast-tokenizer parallelism enabled in DataLoader workers.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Task descriptor for joint multi-task training; the reported metric is the
# mean over the per-task metrics.
task_info = TaskInfo(task_name='multitask', metric_name='metric_mean')

# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/multitask.py --max_epochs=10 --batch_size=16 --gpus=1 --precision=16 --seg_data_dir=data/seg --pos_data_dir=data/pos --ner_data_dir=data/ner

# Registry of every sub-task module, keyed by its declared task name.
task_builder = {
    module.task_info.task_name: module
    for module in (
        task_segmention,
        task_part_of_speech,
        task_named_entity_recognition,
        task_dependency_parsing,
        task_semantic_dependency_parsing,
        task_semantic_role_labeling,
    )
}
# (file-header fragment) Knowledge-distillation task setup.
import torch.utils.data
import torch.nn.functional as F
from pytorch_lightning import Trainer

import ltp
from ltp import optimization, multitask
from ltp.data import dataset as datasets
from ltp.data.utils import collate, MultiTaskDataloader
from ltp.transformer_multitask import TransformerMultiTask as Model
from ltp.utils import TaskInfo, common_train, deploy_model
from ltp.multitask import validation_method

# Keep HF fast-tokenizer parallelism enabled in DataLoader workers.
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

# Task descriptor for distillation; the reported metric is the per-task mean.
task_info = TaskInfo(task_name='distill', metric_name='metric_mean')


def kd_ce_loss(logits_S, logits_T, temperature=1):
    '''
    Calculate the cross entropy between logits_S and logits_T

    :param logits_S: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
    :param logits_T: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
    :param temperature: A float or a tensor of shape (batch_size, length) or (batch_size,)
    '''
    # A per-position temperature tensor needs a trailing axis so it
    # broadcasts over the label dimension.
    if isinstance(temperature, torch.Tensor) and temperature.dim() > 0:
        temperature = temperature.unsqueeze(-1)
    # Soften both distributions by the (possibly per-position) temperature.
    beta_logits_T = logits_T / temperature
    beta_logits_S = logits_S / temperature
    p_T = F.softmax(beta_logits_T, dim=-1)
    # (fragment) kd_ce_loss continues past this chunk.