Example #1
        acc = sum([output[f'{metric_tag}/true'] for output in outputs]) / \
              sum([output[f'{metric_tag}/all'] for output in outputs])

        self.log_dict(
            dictionary={loss_tag: loss, metric_tag: acc},
            on_step=False, on_epoch=True, prog_bar=True, logger=True
        )
        if ret:
            return acc

    return step, epoch_end


task_info = TaskInfo(
    task_name='pos',
    metric_name='acc',
    build_dataset=build_dataset,
    validation_method=validation_method
)


def add_task_specific_args(parent_parser):
    parser = ArgumentParser(parents=[parent_parser], add_help=False)
    parser.add_argument('--tune', action='store_true')
    parser.add_argument('--offline', action='store_true')
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--gpus_per_trial', type=float, default=1.0)
    parser.add_argument('--cpus_per_trial', type=float, default=1.0)
    parser.add_argument('--seed', type=int, default=19980524)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--data_dir', type=str, required=True)
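
A minimal sketch (not taken from the LTP sources) of how a parser produced by add_task_specific_args is typically combined with PyTorch Lightning's argparse helpers before training; the main() wrapper and the omitted model construction are illustrative assumptions:

import pytorch_lightning as pl
from argparse import ArgumentParser
from pytorch_lightning import Trainer


def main():
    parser = ArgumentParser()
    parser = add_task_specific_args(parser)     # task flags defined above
    parser = Trainer.add_argparse_args(parser)  # adds --gpus, --max_epochs, ...
    args = parser.parse_args()

    pl.seed_everything(args.seed)
    trainer = Trainer.from_argparse_args(args)
    # building the task Model and calling trainer.fit(model) would follow here
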
Example #2
    test_step, test_epoch_end = task_info.validation_method(
        multi_metric,
        loss_tag='test_loss',
        metric_tags={
            task_name: f"test_{task_module.task_info.metric_name}"
            for task_name, task_module in task_builder.items()
        },
        metric_tag=f"test_{task_info.metric_name}")

    model.test_dataloader = types.MethodType(test_dataloader, model)
    model.test_step = types.MethodType(test_step, model)
    model.test_epoch_end = types.MethodType(test_epoch_end, model)


task_info = TaskInfo(task_name='multitask',
                     metric_name='metric_mean',
                     build_dataset=build_dataset,
                     validation_method=validation_method)


def add_task_specific_args(parent_parser):
    parser = ArgumentParser(parents=[parent_parser], add_help=False)
    parser.add_argument('--seed', type=int, default=19980524)
    parser.add_argument('--tune', action='store_true')
    parser.add_argument('--offline', action='store_true')
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=8)
    parser.add_argument('--gpus_per_trial', type=float, default=1.0)
    parser.add_argument('--cpus_per_trial', type=float, default=5.0)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_samples', type=int, default=10)
    parser.add_argument('--tau', type=float, default=0.8)
Example #3
def build_method(model: Model, task_info: TaskInfo):
    multi_dataset, multi_metric = task_info.build_dataset(
        model,
        seg=model.hparams.seg_data_dir,
        pos=model.hparams.pos_data_dir,
        ner=model.hparams.ner_data_dir,
        dep=model.hparams.dep_data_dir,
        sdp=model.hparams.sdp_data_dir,
        srl=model.hparams.srl_data_dir)

    def train_dataloader(self):
        multi_dataloader = {
            task:
            torch.utils.data.DataLoader(task_dataset[datasets.Split.TRAIN],
                                        batch_size=self.hparams.batch_size,
                                        collate_fn=collate,
                                        num_workers=self.hparams.num_workers,
                                        pin_memory=True)
            for task, task_dataset in multi_dataset.items()
        }
        res = MultiTaskDataloader(tau=self.hparams.tau, **multi_dataloader)
        return res

    def training_step(self, batch, batch_idx):
        result = self(**batch)
        self.log("loss", result.loss.item())
        return {"loss": result.loss}

    def val_dataloader(self):
        return [
            torch.utils.data.DataLoader(
                task_dataset[datasets.Split.VALIDATION],
                batch_size=self.hparams.batch_size,
                collate_fn=collate,
                num_workers=self.hparams.num_workers,
                pin_memory=True)
            for task, task_dataset in multi_dataset.items()
        ]

    def test_dataloader(self):
        return [
            torch.utils.data.DataLoader(task_dataset[datasets.Split.TEST],
                                        batch_size=self.hparams.batch_size,
                                        collate_fn=collate,
                                        num_workers=self.hparams.num_workers,
                                        pin_memory=True)
            for task, task_dataset in multi_dataset.items()
        ]

    # AdamW + LR scheduler
    def configure_optimizers(self: Model):
        num_epoch_steps = sum(
            (len(dataset[datasets.Split.TRAIN]) + self.hparams.batch_size -
             1) // self.hparams.batch_size
            for dataset in multi_dataset.values())
        num_train_steps = num_epoch_steps * self.hparams.max_epochs
        optimizer, scheduler = optimization.from_argparse_args(
            self.hparams,
            model=self,
            num_train_steps=num_train_steps,
            n_transformer_layers=self.transformer.config.num_hidden_layers)
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

    model.configure_optimizers = types.MethodType(configure_optimizers, model)

    model.train_dataloader = types.MethodType(train_dataloader, model)
    model.training_step = types.MethodType(training_step, model)

    validation_step, validation_epoch_end = task_info.validation_method(
        multi_metric,
        loss_tag='val_loss',
        metric_tags={
            task_name: f"val_{task_module.task_info.metric_name}"
            for task_name, task_module in task_builder.items()
        },
        metric_tag=f"val_{task_info.metric_name}")

    model.val_dataloader = types.MethodType(val_dataloader, model)
    model.validation_step = types.MethodType(validation_step, model)
    model.validation_epoch_end = types.MethodType(validation_epoch_end, model)

    test_step, test_epoch_end = task_info.validation_method(
        multi_metric,
        loss_tag='test_loss',
        metric_tags={
            task_name: f"test_{task_module.task_info.metric_name}"
            for task_name, task_module in task_builder.items()
        },
        metric_tag=f"test_{task_info.metric_name}")

    model.test_dataloader = types.MethodType(test_dataloader, model)
    model.test_step = types.MethodType(test_step, model)
    model.test_epoch_end = types.MethodType(test_epoch_end, model)
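
The tau value passed to MultiTaskDataloader above is a sampling temperature. A common convention, and a reasonable guess at what the class does (its implementation is not shown here), is to draw each training batch from task i with probability proportional to |D_i| ** tau, so tau=1 gives size-proportional sampling and tau=0 samples tasks uniformly. A minimal sketch under that assumption:

import numpy as np


def make_task_sampler(sizes, tau, rng=None):
    # sizes: mapping from task name to number of training batches for that task
    rng = rng or np.random.default_rng()
    tasks = list(sizes)
    weights = np.array([sizes[t] ** tau for t in tasks], dtype=float)
    probs = weights / weights.sum()

    def next_task():
        # pick which task the next batch comes from
        return rng.choice(tasks, p=probs)

    return next_task


# e.g. make_task_sampler({'pos': 10000, 'ner': 2000}, tau=0.8) still favours 'pos',
# but less strongly than pure size-proportional sampling would.
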
Example #4
            loss_tag: loss,
            metric_tag: las
        },
                      on_step=False,
                      on_epoch=True,
                      prog_bar=True,
                      logger=True)

        if ret:
            return las

    return step, epoch_end


task_info = TaskInfo(task_name='dep',
                     metric_name='las',
                     build_dataset=build_dataset,
                     validation_method=validation_method)


def add_task_specific_args(parent_parser):
    parser = ArgumentParser(parents=[parent_parser], add_help=False)
    parser.add_argument('--tune', action='store_true')
    parser.add_argument('--offline', action='store_true')
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--seed', type=int, default=19980524)
    parser.add_argument('--gpus_per_trial', type=float, default=1.0)
    parser.add_argument('--cpus_per_trial', type=float, default=1)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_samples', type=int, default=10)
    parser.add_argument('--data_dir', type=str, required=True)
Example #5
import os

import torch
from argparse import ArgumentParser
from ltp.data import dataset as datasets
from ltp import optimization
from ltp.data.utils import collate
from ltp.transformer_biaffine import TransformerBiaffine as Model, sdp_loss

import numpy as np
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from transformers import AutoTokenizer

from ltp.utils import TaskInfo, common_train, map2device, convert2npy

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

task_info = TaskInfo(task_name='sdp', metric_name='f1')

# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/task_segmention.py --data_dir=data/seg --num_labels=2 --max_epochs=10 --batch_size=16 --gpus=1 --precision=16 --auto_lr_find=lr


def get_graph_entities(arcs, labels):
    arcs = torch.nonzero(arcs, as_tuple=False).cpu().detach().numpy()
    labels = labels.cpu().detach().numpy()

    res = []
    for arc in arcs:
        arc = tuple(arc)
        label = labels[arc]
        res.append((*arc, label))

    return set(res)
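
get_graph_entities turns predicted and gold graphs into sets of (head, dependent, label) triples, so the sdp 'f1' metric reduces to set overlap. A small self-contained illustration (the helper name is ours, not LTP's):

def set_f1(pred, gold):
    # labelled F1 over (head, dependent, label) triples
    if not pred or not gold:
        return 0.0
    correct = len(pred & gold)
    if correct == 0:
        return 0.0
    precision = correct / len(pred)
    recall = correct / len(gold)
    return 2 * precision * recall / (precision + recall)


# e.g. set_f1({(0, 1, 3), (1, 2, 5)}, {(0, 1, 3)}) == 2 * 0.5 * 1.0 / 1.5 ≈ 0.667
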
Example #6
            loss_tag: loss,
            metric_tag: f1
        },
                      on_step=False,
                      on_epoch=True,
                      prog_bar=True,
                      logger=True)

        if ret:
            return f1

    return step, epoch_end


task_info = TaskInfo(task_name='srl',
                     metric_name='f1',
                     build_dataset=build_dataset,
                     validation_method=validation_method)


def add_task_specific_args(parent_parser):
    parser = ArgumentParser(parents=[parent_parser], add_help=False)
    parser.add_argument('--tune', action='store_true')
    parser.add_argument('--offline', action='store_true')
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--gpus_per_trial', type=float, default=1.0)
    parser.add_argument('--cpus_per_trial', type=float, default=1.0)
    parser.add_argument('--seed', type=int, default=19980524)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_samples', type=int, default=10)
    parser.add_argument('--data_dir', type=str, required=True)
Example #7
def build_method(model: Model, task_info: TaskInfo):
    (multi_dataset, distill_datasets,
     distill_datasets_extra), multi_metric = build_dataset(
         model,
         seg=model.hparams.seg_data_dir,
         pos=model.hparams.pos_data_dir,
         ner=model.hparams.ner_data_dir,
         dep=model.hparams.dep_data_dir,
         sdp=model.hparams.sdp_data_dir,
         srl=model.hparams.srl_data_dir)

    disable_distill = {
        'seg': model.hparams.disable_seg,
        'pos': model.hparams.disable_pos,
        'ner': model.hparams.disable_ner,
        'dep': model.hparams.disable_dep,
        'sdp': model.hparams.disable_sdp,
        'srl': model.hparams.disable_srl,
    }

    disable_distill = {
        task
        for task, disable in disable_distill.items() if disable
    }

    temperature_scheduler = flsw_temperature_scheduler_builder(
        beta=model.hparams.distill_beta,
        gamma=model.hparams.distill_gamma,
        base_temperature=model.hparams.temperature)

    def train_dataloader(self):
        multi_dataloader = {
            task:
            torch.utils.data.DataLoader(task_dataset,
                                        batch_size=None,
                                        num_workers=self.hparams.num_workers,
                                        pin_memory=True,
                                        shuffle=True)
            for task, task_dataset in distill_datasets.items()
        }
        res = MultiTaskDataloader(tau=self.hparams.tau, **multi_dataloader)
        return res

    def training_step(self: Model, batch, batch_idx):
        task = batch['task']
        target_logits = batch.pop('logits')
        result = self(**batch)
        norm_loss = result.loss

        if task not in disable_distill:
            distill_loss = distill_loss_map[task](
                batch,
                result,
                target_logits,
                temperature_scheduler,
                model,
                extra=distill_datasets_extra[task])
            distill_loss_weight = self.global_step / self.num_train_steps
            loss = distill_loss_weight * norm_loss + (
                1 - distill_loss_weight) * distill_loss

            self.log("distill_loss", distill_loss.item())
            self.log("norm_loss", norm_loss.item())
            self.log("loss", loss.item())
            return {"loss": loss}
        else:
            self.log("loss", norm_loss.item())
            return {"loss": norm_loss}

    def val_dataloader(self):
        return [
            torch.utils.data.DataLoader(
                task_dataset[datasets.Split.VALIDATION],
                batch_size=getattr(self.hparams, f'{task}_batch_size')
                or self.hparams.batch_size,
                collate_fn=collate,
                num_workers=self.hparams.num_workers,
                pin_memory=True)
            for task, task_dataset in multi_dataset.items()
        ]

    def test_dataloader(self):
        return [
            torch.utils.data.DataLoader(
                task_dataset[datasets.Split.TEST],
                batch_size=getattr(self.hparams, f'{task}_batch_size')
                or self.hparams.batch_size,
                collate_fn=collate,
                num_workers=self.hparams.num_workers,
                pin_memory=True)
            for task, task_dataset in multi_dataset.items()
        ]

    # AdamW + LR scheduler
    def configure_optimizers(self: Model):
        num_epoch_steps = sum(
            len(dataset) for dataset in distill_datasets.values())
        num_train_steps = num_epoch_steps * self.hparams.max_epochs
        setattr(self, 'num_train_steps', num_train_steps)
        optimizer, scheduler = optimization.from_argparse_args(
            self.hparams,
            model=self,
            num_train_steps=num_train_steps,
            n_transformer_layers=self.transformer.config.num_hidden_layers)
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

    model.configure_optimizers = types.MethodType(configure_optimizers, model)

    model.train_dataloader = types.MethodType(train_dataloader, model)
    model.training_step = types.MethodType(training_step, model)

    validation_step, validation_epoch_end = task_info.validation_method(
        multi_metric, task=task_info.task_name, preffix='val')

    model.val_dataloader = types.MethodType(val_dataloader, model)
    model.validation_step = types.MethodType(validation_step, model)
    model.validation_epoch_end = types.MethodType(validation_epoch_end, model)

    test_step, test_epoch_end = task_info.validation_method(
        multi_metric, task=task_info.task_name, preffix='test')

    model.test_dataloader = types.MethodType(test_dataloader, model)
    model.test_step = types.MethodType(test_step, model)
    model.test_epoch_end = types.MethodType(test_epoch_end, model)
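
In training_step above, the supervised loss and the distillation loss are blended with a weight that grows linearly over training: at global step s out of num_train_steps N the total loss is (s / N) * norm_loss + (1 - s / N) * distill_loss, so the teacher signal dominates early and the hard-label loss takes over towards the end. A tiny standalone illustration of that schedule:

def blended_loss(norm_loss, distill_loss, step, total):
    # linear interpolation between the distillation and supervised objectives
    w = step / total
    return w * norm_loss + (1 - w) * distill_loss


# e.g. blended_loss(1.0, 2.0, step=2500, total=10000) == 0.25 * 1.0 + 0.75 * 2.0 == 1.75
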
Example #8
import os

from ltp.data import dataset as datasets
from ltp.data.utils import collate
from ltp import optimization
from seqeval.metrics import f1_score
from ltp.transformer_rel_linear import TransformerRelLinear as Model

import pytorch_lightning as pl
from pytorch_lightning import Trainer

from transformers import AutoTokenizer

from ltp.utils import TaskInfo, common_train, map2device, convert2npy

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

task_info = TaskInfo(task_name='ner', metric_name='f1')


# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/task_named_entity_recognition.py --data_dir=data/ner --num_labels=13 --max_epochs=10 --batch_size=16 --gpus=1 --precision=16

def build_dataset(model, data_dir):
    dataset = datasets.load_dataset(
        datasets.Bio,
        data_dir=data_dir,
        cache_dir=data_dir,
        bio=os.path.join(data_dir, "ner_labels.txt")
    )
    dataset.rename_column_('bio', 'labels')
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer, use_fast=True)

    def tokenize(examples):
Example #9
import os

from ltp.algorithms import eisner
from ltp.data import dataset as datasets
from ltp import optimization
from ltp.data.utils import collate
from ltp.transformer_biaffine import TransformerBiaffine as Model

import pytorch_lightning as pl
from pytorch_lightning import Trainer

from transformers import AutoTokenizer

from ltp.utils import TaskInfo, common_train, map2device, convert2npy

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

task_info = TaskInfo(task_name='dep', metric_name='las')


# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/task_segmention.py --data_dir=data/seg --num_labels=2 --max_epochs=10 --batch_size=16 --gpus=1 --precision=16 --auto_lr_find=lr

def build_dataset(model, data_dir):
    dataset = datasets.load_dataset(
        datasets.Conllu,
        data_dir=data_dir,
        cache_dir=data_dir,
        deprel=os.path.join(data_dir, "dep_labels.txt")
    )
    dataset.remove_columns_(["id", "lemma", "upos", "xpos", "feats", "deps", "misc"])
    dataset.rename_column_('deprel', 'labels')

    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer, use_fast=True)
Example #10
import torch.utils.data
import os
from tqdm import tqdm
from argparse import ArgumentParser
from ltp.data import dataset as datasets
from ltp.data.utils import collate
from ltp.transformer_linear import TransformerLinear as Model
from pytorch_lightning import Trainer

from transformers import AutoTokenizer
from ltp import optimization
from ltp.utils import TaskInfo, common_train, map2device, convert2npy

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

task_info = TaskInfo(task_name='pos', metric_name='acc')


# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/task_part_of_speech.py --data_dir=data/pos --num_labels=27 --max_epochs=10 --batch_size=16 --gpus=1 --precision=16 --auto_lr_find=lr

def build_dataset(model, data_dir):
    dataset = datasets.load_dataset(
        datasets.Conllu,
        data_dir=data_dir,
        cache_dir=data_dir,
        xpos=os.path.join(data_dir, "pos_labels.txt")
    )
    dataset.remove_columns_(["id", "lemma", "upos", "feats", "head", "deprel", "deps", "misc"])
    dataset.rename_column_('xpos', 'labels')

    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer, use_fast=True)
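
The excerpt stops right after the tokenizer is created. For reference, a generic tokenize-and-align step for word-level labels, a common pattern with fast Hugging Face tokenizers rather than LTP's exact implementation; the function name and the 'form' column are illustrative assumptions:

def tokenize_and_align(examples, tokenizer, max_length=512):
    # Encode pre-split words and align word-level labels to sub-tokens: only the
    # first sub-token of each word keeps its label, the rest get -100 so they
    # are ignored by the loss.
    encoded = tokenizer(examples['form'],
                        is_split_into_words=True,
                        truncation=True,
                        max_length=max_length)
    aligned = []
    for i, word_labels in enumerate(examples['labels']):
        word_ids = encoded.word_ids(batch_index=i)
        previous, ids = None, []
        for word_id in word_ids:
            if word_id is None or word_id == previous:
                ids.append(-100)
            else:
                ids.append(word_labels[word_id])
            previous = word_id
        aligned.append(ids)
    encoded['labels'] = aligned
    return encoded
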
Example #11
import os

import torch.utils.data
from pytorch_lightning import Trainer

import ltp
from ltp import (optimization, task_segmention, task_part_of_speech,
                 task_named_entity_recognition, task_dependency_parsing,
                 task_semantic_dependency_parsing, task_semantic_role_labeling)
from ltp.data import dataset as datasets
from ltp.data.utils import collate, MultiTaskDataloader
from ltp.transformer_multitask import TransformerMultiTask as Model
from ltp.utils import TaskInfo, common_train
from ltp.utils import deploy_model

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

task_info = TaskInfo(task_name='multitask', metric_name='metric_mean')

# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/multitask.py --max_epochs=10 --batch_size=16 --gpus=1 --precision=16 --seg_data_dir=data/seg --pos_data_dir=data/pos --ner_data_dir=data/ner

task_builder = {
    task_segmention.task_info.task_name: task_segmention,
    task_part_of_speech.task_info.task_name: task_part_of_speech,
    task_named_entity_recognition.task_info.task_name:
    task_named_entity_recognition,
    task_dependency_parsing.task_info.task_name: task_dependency_parsing,
    task_semantic_dependency_parsing.task_info.task_name:
    task_semantic_dependency_parsing,
    task_semantic_role_labeling.task_info.task_name:
    task_semantic_role_labeling,
}
Example #12
import os

import torch.utils.data
import torch.nn.functional as F

from pytorch_lightning import Trainer

import ltp
from ltp import optimization, multitask
from ltp.data import dataset as datasets
from ltp.data.utils import collate, MultiTaskDataloader
from ltp.transformer_multitask import TransformerMultiTask as Model
from ltp.utils import TaskInfo, common_train, deploy_model
from ltp.multitask import validation_method

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

task_info = TaskInfo(task_name='distill', metric_name='metric_mean')


def kd_ce_loss(logits_S, logits_T, temperature=1):
    '''
    Calculate the cross entropy between logits_S and logits_T

    :param logits_S: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
    :param logits_T: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
    :param temperature: A float or a tensor of shape (batch_size, length) or (batch_size,)
    '''
    if isinstance(temperature, torch.Tensor) and temperature.dim() > 0:
        temperature = temperature.unsqueeze(-1)
    beta_logits_T = logits_T / temperature
    beta_logits_S = logits_S / temperature
    p_T = F.softmax(beta_logits_T, dim=-1)
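    # (The excerpt ends here; the lines below are a standard soft-target
    # cross-entropy completion, shown as a sketch rather than LTP's exact code.)
    loss = -(p_T * F.log_softmax(beta_logits_S, dim=-1)).sum(dim=-1).mean()
    return loss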