Example No. 1
        acc = sum([output[f'{metric_tag}/true'] for output in outputs]) / \
              sum([output[f'{metric_tag}/all'] for output in outputs])

        self.log_dict(
            dictionary={loss_tag: loss, metric_tag: acc},
            on_step=False, on_epoch=True, prog_bar=True, logger=True
        )
        if ret:
            return acc

    return step, epoch_end
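
The epoch_end above computes micro accuracy by dividing the summed per-batch correct counts by the summed totals, so the paired step has to emit the {metric_tag}/true and {metric_tag}/all values for every batch. A minimal sketch of such a step, assuming the model returns a loss and token-level logits (the interface, the helper name make_acc_step, and the ignore index are assumptions, not taken from the source):

import torch


def make_acc_step(loss_tag, metric_tag):
    # Illustrative counterpart to the epoch_end above (not the source's exact code).
    def step(self, batch, batch_idx):
        loss, logits = self(**batch)              # assumed model interface
        preds = logits.argmax(dim=-1)
        mask = batch['labels'] != -1              # assumed padding/ignore index
        correct = ((preds == batch['labels']) & mask).sum()
        return {
            loss_tag: loss.item(),
            f'{metric_tag}/true': correct.item(),
            f'{metric_tag}/all': mask.sum().item(),
        }

    return step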


task_info = TaskInfo(
    task_name='pos',
    metric_name='acc',
    build_dataset=build_dataset,
    validation_method=validation_method
)


def add_task_specific_args(parent_parser):
    parser = ArgumentParser(parents=[parent_parser], add_help=False)
    parser.add_argument('--tune', action='store_true')
    parser.add_argument('--offline', action='store_true')
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--gpus_per_trial', type=float, default=1.0)
    parser.add_argument('--cpus_per_trial', type=float, default=1.0)
    parser.add_argument('--seed', type=int, default=19980524)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--data_dir', type=str, required=True)
Example No. 2
import os
from argparse import ArgumentParser
from ltp.data import dataset as datasets
from ltp import optimization
from ltp.data.utils import collate
from ltp.transformer_biaffine import TransformerBiaffine as Model, sdp_loss

import numpy as np
import torch
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from transformers import AutoTokenizer

from ltp.utils import TaskInfo, common_train, map2device, convert2npy

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

task_info = TaskInfo(task_name='sdp', metric_name='f1')

# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/task_segmention.py --data_dir=data/seg --num_labels=2 --max_epochs=10 --batch_size=16 --gpus=1 --precision=16 --auto_lr_find=lr


def get_graph_entities(arcs, labels):
    arcs = torch.nonzero(arcs, as_tuple=False).cpu().detach().numpy()
    labels = labels.cpu().detach().numpy()

    res = []
    for arc in arcs:
        arc = tuple(arc)
        label = labels[arc]
        res.append((*arc, label))

    return set(res)
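
get_graph_entities collects the index tuple of every nonzero arc together with its label into a set, which reduces graph-level F1 to a set comparison between predicted and gold entities. A minimal sketch of that comparison (the helper name graph_f1 is illustrative, not from the source):

def graph_f1(predicted, gold):
    # Illustrative helper: precision/recall over labelled arcs compared as sets.
    correct = len(predicted & gold)
    if correct == 0:
        return 0.0
    precision = correct / len(predicted)
    recall = correct / len(gold)
    return 2 * precision * recall / (precision + recall)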
Example No. 3
    test_step, test_epoch_end = task_info.validation_method(
        multi_metric,
        loss_tag='test_loss',
        metric_tags={
            task_name: f"test_{task_module.task_info.metric_name}"
            for task_name, task_module in task_builder.items()
        },
        metric_tag=f"test_{task_info.metric_name}")

    # Bind the test hooks onto the model instance so the Lightning Trainer calls them during testing.
    model.test_dataloader = types.MethodType(test_dataloader, model)
    model.test_step = types.MethodType(test_step, model)
    model.test_epoch_end = types.MethodType(test_epoch_end, model)


task_info = TaskInfo(task_name='multitask',
                     metric_name='metric_mean',
                     build_dataset=build_dataset,
                     validation_method=validation_method)


def add_task_specific_args(parent_parser):
    parser = ArgumentParser(parents=[parent_parser], add_help=False)
    parser.add_argument('--seed', type=int, default=19980524)
    parser.add_argument('--tune', action='store_true')
    parser.add_argument('--offline', action='store_true')
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=8)
    parser.add_argument('--gpus_per_trial', type=float, default=1.0)
    parser.add_argument('--cpus_per_trial', type=float, default=5.0)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_samples', type=int, default=10)
    parser.add_argument('--tau', type=float, default=0.8)
Example No. 4
            loss_tag: loss,
            metric_tag: f1
        },
                      on_step=False,
                      on_epoch=True,
                      prog_bar=True,
                      logger=True)

        if ret:
            return f1

    return step, epoch_end


task_info = TaskInfo(task_name='srl',
                     metric_name='f1',
                     build_dataset=build_dataset,
                     validation_method=validation_method)


def add_task_specific_args(parent_parser):
    parser = ArgumentParser(parents=[parent_parser], add_help=False)
    parser.add_argument('--tune', action='store_true')
    parser.add_argument('--offline', action='store_true')
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--gpus_per_trial', type=float, default=1.0)
    parser.add_argument('--cpus_per_trial', type=float, default=1.0)
    parser.add_argument('--seed', type=int, default=19980524)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_samples', type=int, default=10)
    parser.add_argument('--data_dir', type=str, required=True)
Example No. 5
            loss_tag: loss,
            metric_tag: las
        },
                      on_step=False,
                      on_epoch=True,
                      prog_bar=True,
                      logger=True)

        if ret:
            return las

    return step, epoch_end


task_info = TaskInfo(task_name='dep',
                     metric_name='las',
                     build_dataset=build_dataset,
                     validation_method=validation_method)


def add_task_specific_args(parent_parser):
    parser = ArgumentParser(parents=[parent_parser], add_help=False)
    parser.add_argument('--tune', action='store_true')
    parser.add_argument('--offline', action='store_true')
    parser.add_argument('--patience', type=int, default=5)
    parser.add_argument('--seed', type=int, default=19980524)
    parser.add_argument('--gpus_per_trial', type=float, default=1.0)
    parser.add_argument('--cpus_per_trial', type=float, default=1.0)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_samples', type=int, default=10)
    parser.add_argument('--data_dir', type=str, required=True)
Example No. 6
import os

from ltp.data import dataset as datasets
from ltp.data.utils import collate
from ltp import optimization
from seqeval.metrics import f1_score
from ltp.transformer_rel_linear import TransformerRelLinear as Model

import pytorch_lightning as pl
from pytorch_lightning import Trainer

from transformers import AutoTokenizer

from ltp.utils import TaskInfo, common_train, map2device, convert2npy

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

task_info = TaskInfo(task_name='ner', metric_name='f1')


# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/task_named_entity_recognition.py --data_dir=data/ner --num_labels=13 --max_epochs=10 --batch_size=16 --gpus=1 --precision=16

def build_dataset(model, data_dir):
    dataset = datasets.load_dataset(
        datasets.Bio,
        data_dir=data_dir,
        cache_dir=data_dir,
        bio=os.path.join(data_dir, "ner_labels.txt")
    )
    dataset.rename_column_('bio', 'labels')
    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer, use_fast=True)

    def tokenize(examples):
Example No. 7
import os

from ltp.algorithms import eisner
from ltp.data import dataset as datasets
from ltp import optimization
from ltp.data.utils import collate
from ltp.transformer_biaffine import TransformerBiaffine as Model

import pytorch_lightning as pl
from pytorch_lightning import Trainer

from transformers import AutoTokenizer

from ltp.utils import TaskInfo, common_train, map2device, convert2npy

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

task_info = TaskInfo(task_name='dep', metric_name='las')


# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/task_segmention.py --data_dir=data/seg --num_labels=2 --max_epochs=10 --batch_size=16 --gpus=1 --precision=16 --auto_lr_find=lr

def build_dataset(model, data_dir):
    dataset = datasets.load_dataset(
        datasets.Conllu,
        data_dir=data_dir,
        cache_dir=data_dir,
        deprel=os.path.join(data_dir, "dep_labels.txt")
    )
    dataset.remove_columns_(["id", "lemma", "upos", "xpos", "feats", "deps", "misc"])
    dataset.rename_column_('deprel', 'labels')

    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer, use_fast=True)
Example No. 8
import torch.utils.data
import os
from tqdm import tqdm
from argparse import ArgumentParser
from ltp.data import dataset as datasets
from ltp.data.utils import collate
from ltp.transformer_linear import TransformerLinear as Model
from pytorch_lightning import Trainer

from transformers import AutoTokenizer
from ltp import optimization
from ltp.utils import TaskInfo, common_train, map2device, convert2npy

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

task_info = TaskInfo(task_name='pos', metric_name='acc')


# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/task_part_of_speech.py --data_dir=data/pos --num_labels=27 --max_epochs=10 --batch_size=16 --gpus=1 --precision=16 --auto_lr_find=lr

def build_dataset(model, data_dir):
    dataset = datasets.load_dataset(
        datasets.Conllu,
        data_dir=data_dir,
        cache_dir=data_dir,
        xpos=os.path.join(data_dir, "pos_labels.txt")
    )
    dataset.remove_columns_(["id", "lemma", "upos", "feats", "head", "deprel", "deps", "misc"])
    dataset.rename_column_('xpos', 'labels')

    tokenizer = AutoTokenizer.from_pretrained(model.hparams.transformer, use_fast=True)
Example No. 9
import os

import torch.utils.data
from pytorch_lightning import Trainer

import ltp
from ltp import (optimization, task_segmention, task_part_of_speech,
                 task_named_entity_recognition, task_dependency_parsing,
                 task_semantic_dependency_parsing, task_semantic_role_labeling)
from ltp.data import dataset as datasets
from ltp.data.utils import collate, MultiTaskDataloader
from ltp.transformer_multitask import TransformerMultiTask as Model
from ltp.utils import TaskInfo, common_train
from ltp.utils import deploy_model

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

task_info = TaskInfo(task_name='multitask', metric_name='metric_mean')

# CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. python ltp/multitask.py --max_epochs=10 --batch_size=16 --gpus=1 --precision=16 --seg_data_dir=data/seg --pos_data_dir=data/pos --ner_data_dir=data/ner

task_builder = {
    task_segmention.task_info.task_name: task_segmention,
    task_part_of_speech.task_info.task_name: task_part_of_speech,
    task_named_entity_recognition.task_info.task_name:
    task_named_entity_recognition,
    task_dependency_parsing.task_info.task_name: task_dependency_parsing,
    task_semantic_dependency_parsing.task_info.task_name:
    task_semantic_dependency_parsing,
    task_semantic_role_labeling.task_info.task_name:
    task_semantic_role_labeling,
}
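
task_builder maps each task name to the module that owns its TaskInfo, dataset builder, and validation method. A hedged sketch of how such a registry might be consumed to assemble the per-task datasets (the argument names follow the seg_data_dir/pos_data_dir/ner_data_dir pattern from the command above; the helper build_all_datasets is illustrative, not the source's code):

def build_all_datasets(model, args, task_builder):
    # Illustrative only: look up each task's data directory (e.g. args.seg_data_dir)
    # and delegate to that task module's build_dataset(model, data_dir).
    return {
        name: module.build_dataset(model, getattr(args, f'{name}_data_dir'))
        for name, module in task_builder.items()
    }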
Example No. 10
import os

import torch
import torch.utils.data
import torch.nn.functional as F

from pytorch_lightning import Trainer

import ltp
from ltp import optimization, multitask
from ltp.data import dataset as datasets
from ltp.data.utils import collate, MultiTaskDataloader
from ltp.transformer_multitask import TransformerMultiTask as Model
from ltp.utils import TaskInfo, common_train, deploy_model
from ltp.multitask import validation_method

os.environ['TOKENIZERS_PARALLELISM'] = 'true'

task_info = TaskInfo(task_name='distill', metric_name='metric_mean')


def kd_ce_loss(logits_S, logits_T, temperature=1):
    '''
    Calculate the cross entropy between logits_S and logits_T

    :param logits_S: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
    :param logits_T: Tensor of shape (batch_size, length, num_labels) or (batch_size, num_labels)
    :param temperature: A float or a tensor of shape (batch_size, length) or (batch_size,)
    '''
    if isinstance(temperature, torch.Tensor) and temperature.dim() > 0:
        temperature = temperature.unsqueeze(-1)
    beta_logits_T = logits_T / temperature
    beta_logits_S = logits_S / temperature
    p_T = F.softmax(beta_logits_T, dim=-1)
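    # Hedged completion: the example is truncated here. The standard soft-target
    # cross entropy described in the docstring would finish roughly as below
    # (not verbatim from the source): average -sum_i p_T(i) * log p_S(i).
    loss = -(p_T * F.log_softmax(beta_logits_S, dim=-1)).sum(dim=-1).mean()
    return loss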