Example #1
def define_model(name, config=None, location=None):
    # If config is given, this is the initial training run; otherwise load from the checkpoint at `location`
    if name in [
            "bert-base-multilingual-cased",
            "sangrimlee/bert-base-multilingual-cased-korquad",
            "kykim/bert-kor-base", "monologg/kobert"
    ]:
        return BertForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else BertForSequenceClassification.from_pretrained(
            location)
    elif name in [
            "monologg/koelectra-base-v3-discriminator",
            "kykim/electra-kor-base"
    ]:
        return ElectraForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else ElectraForSequenceClassification.from_pretrained(
            location)
    elif name in ["xlm-roberta-large"]:
        return XLMRobertaForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else XLMRobertaForSequenceClassification.from_pretrained(
            location)
    elif name in ["kykim/funnel-kor-base"]:
        return FunnelForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else FunnelForSequenceClassification.from_pretrained(
            location)
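A hedged usage sketch for the helper above; the label count and the checkpoint directory are placeholder values, and the config class comes from the same transformers package as the model classes.

from transformers import BertConfig

# First training run: build a config so the classification head gets the right label count.
config = BertConfig.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model = define_model("bert-base-multilingual-cased", config=config)

# Resuming: omit config and point `location` at a saved checkpoint directory (hypothetical path).
model = define_model("bert-base-multilingual-cased", location="./checkpoints/best")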
Example #2
def load_model(device):
    checkpoint = os.path.dirname(os.path.realpath(__file__))+'/ckpt/koelectra-base-v3-ckpt1/s2_checkpoint-7300'
    model = ElectraForSequenceClassification.from_pretrained(checkpoint)
    model.to(device)
    
    print('model loaded')
    return model
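A minimal call-site sketch, assuming torch is imported and the hard-coded checkpoint directory inside load_model exists.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = load_model(device)
model.eval()  # the helper only loads weights; switch to eval mode before inference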
Example #3
def get_text_reader(reader_name, task, num_labels):
    # The AILAW corpus is a Korean dataset, so the reader is restricted to Korean-capable
    # models such as multilingual BERT, KoBERT, and KoELECTRA.

    if reader_name == "bert":
        if task == "classification":
            model_name = "bert-base-multilingual-cased"
            text_reader = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        else: # ner
            model_name = "bert-base-multilingual-cased"
            text_reader = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

    elif reader_name == "kobert":
        if task == "classification":
            model_name = "monologg/kobert"
            text_reader = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        else: # ner
            model_name = "monologg/kobert"
            text_reader = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

    elif reader_name == "koelectra":
        if task == "classification":
            model_name = "monologg/koelectra-base-discriminator"
            text_reader = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        else: # ner
            model_name = "monologg/koelectra-base-discriminator"
            text_reader = ElectraForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

    else:
        raise KeyError(reader_name)

    return text_reader
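Hedged examples of picking a reader with the helper above; the label counts are placeholders and depend on the dataset at hand.

# binary sentence classification with multilingual BERT
clf_reader = get_text_reader("bert", "classification", num_labels=2)

# NER with KoELECTRA (any non-"classification" task string selects the token classifier)
ner_reader = get_text_reader("koelectra", "ner", num_labels=10)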
Example #4
 def _init_deep_model(self, model_type, model_path, num_labels, num_regs=None):
     if 'roberta' in model_type:
         tokenizer = RobertaTokenizer.from_pretrained(model_path)
         config = RobertaConfig.from_pretrained(model_path)
         config.num_labels = num_labels
         model = RobertaForSequenceClassification.from_pretrained(model_path, config=config)
         model.eval()
         model.to(self.device)
     elif 'electra_multitask' in model_type:
         tokenizer = ElectraTokenizer.from_pretrained(model_path)
         tokenizer.add_special_tokens({'additional_special_tokens': ['[VALUES]']})
         config = ElectraConfig.from_pretrained(model_path)
         config.num_labels = num_labels
         config.num_regs = num_regs
         config.vocab_size = len(tokenizer)
         model = ElectraForSequenceClassificationMultiTask.from_pretrained(model_path, config=config)
         model.eval()
         model.to(self.device)
     elif 'electra' in model_type:
         tokenizer = ElectraTokenizer.from_pretrained(model_path)
         config = ElectraConfig.from_pretrained(model_path)
         config.num_labels = num_labels
         model = ElectraForSequenceClassification.from_pretrained(model_path, config=config)
         model.eval()
         model.to(self.device)
     else:
         raise NotImplementedError()
     return config, tokenizer, model
Example #5
def main(args):
  """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
  """
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  # load tokenizer
  TOK_NAME = "monologg/koelectra-base-v3-discriminator" 
  #tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)
  tokenizer = ElectraTokenizer.from_pretrained(TOK_NAME)

  # load my model
  MODEL_NAME = args.model_dir # model dir.
  model = ElectraForSequenceClassification.from_pretrained(args.model_dir)
  model.to(device)

  # load test datset
  test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
  test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
  test_dataset = RE_Dataset(test_dataset, test_label)

  # predict answer
  logits, predictions = inference(model, test_dataset, device)
  # make a csv file with the predicted answers
  # (keep the output directory and column layout below unchanged)

  output = pd.DataFrame(predictions, columns=['pred'])
  output.to_csv('./prediction/koelectra-submission6.csv', index=False)
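A hedged entry-point sketch for the script above; only --model_dir is read inside main, and the default value shown here is a placeholder.

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir', type=str, default='./results/checkpoint-500')
    args = parser.parse_args()
    main(args)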
Example #6
def predict_pair(model_args, data_args, training_args):
    # Set seed
    set_seed(training_args.seed)

    if 'roberta' in model_args.model_type:
        tokenizer = RobertaTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = RobertaConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = RobertaForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    elif 'electra' in model_args.model_type:
        tokenizer = ElectraTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = ElectraConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = ElectraForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    else:
        # default -> bert
        tokenizer = BertTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = BertConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = BertForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)

    model.to(training_args.device)

    test_df = pickle.load(open(data_args.test_data_file, 'rb'))
    test_dataset = get_dataset(data_args, tokenizer, test_df, model_args.model_type)
    data_collator = MyDataCollator()
    if training_args.local_rank != -1:
        sampler = SequentialDistributedSampler(test_dataset)
        model = torch.nn.DataParallel(model)
    else:
        n_gpu = torch.cuda.device_count()
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)
        sampler = SequentialSampler(test_dataset)
    print(len(test_dataset))
    dataloader = DataLoader(
        test_dataset,
        sampler=sampler,
        batch_size=training_args.eval_batch_size,
        collate_fn=data_collator,
    )

    model.eval()
    all_probs = []
    for inputs in tqdm(dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(training_args.device)
        inputs.pop('labels')
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs[0]
            probs = torch.softmax(logits, dim=-1)
            maxp, maxi = torch.max(probs, dim=-1)
            result = [(_i, _p) for _p, _i in zip(maxp, maxi)]
            all_probs.extend(result)

    with open('./{}_{}.answer_classify.result'.format(data_args.data_type, model_args.model_type), 'w', encoding='utf-8') as fout:
        for i in range(len(test_df)):
            fout.write('{} | {} | {} | {} | {}\n'.format(test_df[i][0], test_df[i][1], test_df[i][2], all_probs[i][0], all_probs[i][1]))
Example #7
def main(task='mrpc',
         base_train_cfg='config/QDElectra_pretrain.json',
         train_cfg='config/train_mrpc.json',
         model_cfg='config/QDElectra_base.json',
         data_file='../glue/MRPC/train.tsv',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         log_dir='../exp/electra/pretrain/runs',
         save_dir='../exp/bert/mrpc',
         mode='train',
         pred_distill=True):
    train_cfg_dict = json.load(open(base_train_cfg, "r"))
    train_cfg_dict.update(json.load(open(train_cfg, "r")))
    train_cfg = ElectraConfig().from_dict(train_cfg_dict)
    # train_cfg = ElectraConfig().from_json_file(train_cfg)
    model_cfg = ElectraConfig().from_json_file(model_cfg)
    output_mode, train_cfg.n_epochs, max_len = get_task_params(task)
    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    TaskDataset = dataset_class(task) # task dataset class according to the task
    num_labels = len(TaskDataset.labels)
    pipeline = [
        Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
        AddSpecialTokensWithTruncation(max_len),
        TokenIndexing(tokenizer.convert_tokens_to_ids, TaskDataset.labels, output_mode, max_len)
    ]
    data_set = TaskDataset(data_file, pipeline)
    data_iter = DataLoader(data_set, batch_size=train_cfg.batch_size, shuffle=True)

    t_discriminator = ElectraForSequenceClassification.from_pretrained(
        'google/electra-base-discriminator'
    )
    s_discriminator = QuantizedElectraForSequenceClassification.from_pretrained(
        'google/electra-small-discriminator', config=model_cfg
    )
    model = DistillElectraForSequenceClassification(t_discriminator, s_discriminator, model_cfg)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir) # for tensorboardX

    base_trainer_args = (train_cfg, model_cfg, model, data_iter, optimizer, save_dir, get_device())
    trainer = QuantizedDistillElectraTrainer(writer, *base_trainer_args)

    if mode == 'train':
        trainer.train(model_file, None, data_parallel)
    elif mode == 'eval':
        input_ids, attention_mask, token_type_ids, label_ids = TokenIndexing(tokenizer.convert_tokens_to_ids,
                                                                            TaskDataset.labels,
                                                                            output_mode,
                                                                            max_len)
        _, eval_labels = get_tensor_data(output_mode, input_ids, attention_mask, token_type_ids, label_ids)
        results = trainer.eval(model_file, output_mode, eval_labels, num_labels, data_parallel)
        total_accuracy = torch.cat(results).mean().item()
        print('Accuracy:', total_accuracy)
Example #8
def index():

    if request.values.get("txt"):
        from transformers import AutoTokenizer, AutoModel, ElectraForSequenceClassification

        tokenizer = AutoTokenizer.from_pretrained(
            "/srv/electra-ka-fake-news-tagging/")
        model = ElectraForSequenceClassification.from_pretrained(
            "/srv/electra-ka-fake-news-tagging/")
        inputs = tokenizer(request.values.get("txt"), return_tensors="pt")
        return str(model(**inputs)[0].tolist())
    return 'no text was sent'
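The route above returns raw logits as a string; a hedged post-processing sketch showing how the same model output could be mapped to a probability and class index (the model path mirrors the one in the handler).

import torch
from transformers import AutoTokenizer, ElectraForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("/srv/electra-ka-fake-news-tagging/")
model = ElectraForSequenceClassification.from_pretrained("/srv/electra-ka-fake-news-tagging/")

inputs = tokenizer("some text to classify", return_tensors="pt")
logits = model(**inputs)[0]                    # shape: (1, num_labels)
probs = torch.softmax(logits, dim=-1)
label_id = int(torch.argmax(probs, dim=-1))    # predicted class index
confidence = float(probs[0, label_id])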
Example #9
    def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
        from transformers import ElectraTokenizer, ElectraForSequenceClassification
        # download the model or load the model path
        model_path = download_model('electra.offensive',
                                    cache_dir,
                                    process_func=_unzip_process_func,
                                    verbose=verbose)

        self.classes = ['NOT', 'OFF']

        self.tokenizer = ElectraTokenizer.from_pretrained(model_path)
        self.model = ElectraForSequenceClassification.from_pretrained(
            model_path, num_labels=len(self.classes))

        self.max_length = self.model.electra.embeddings.position_embeddings.num_embeddings
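    # Hedged sketch of a companion predict() method for this wrapper (not part of the
    # original snippet); it only uses attributes set in __init__ above and assumes
    # torch is imported.
    def predict(self, sentence: str) -> str:
        inputs = self.tokenizer(
            sentence,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        with torch.no_grad():
            logits = self.model(**inputs)[0]
        return self.classes[int(logits.argmax(dim=-1))]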
Example #10
    def __init__(self, batch_size, output_size, hidden_size):
        super(KEA_ELECTRA, self).__init__()

        options_name = "google/electra-base-discriminator"
        self.encoder = ElectraForSequenceClassification.from_pretrained(
            options_name, num_labels=output_size)

        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size

        self.a = nn.Linear(512, hidden_size)  #512 is the size of lexicon_vec
        self.v = nn.Linear(512, hidden_size)  #512 is the size of lexicon_vec
        self.d = nn.Linear(512, hidden_size)  #512 is the size of lexicon_vec

        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(hidden_size, 384)
        self.label = nn.Linear(384, output_size)
Example #11
    def __init__(self, batch_size, output_size, hidden_size):
        super(KEA_Electra_Word_level, self).__init__()

        options_name = "google/electra-base-discriminator"
        self.encoder = ElectraForSequenceClassification.from_pretrained(
            options_name, num_labels=output_size)

        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size

        self.bilstm = nn.LSTM(hidden_size + 3,
                              int(hidden_size / 2),
                              dropout=0.2,
                              bidirectional=True)
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(hidden_size, 384)
        self.label = nn.Linear(384, output_size)
Example #12
    def __init__(self,
                 input_ids_batch,
                 attention_mask,
                 hidden_size=768,
                 num_classes=6,
                 dr_rate=None,
                 params=None):
        super(ElectraClassifier, self).__init__()
        self.dr_rate = dr_rate
        self.device = torch.device(
            "cuda:0") if torch.cuda.is_available() else torch.device("cpu")
        self.electramodel = ElectraForSequenceClassification.from_pretrained(
            "monologg/koelectra-small-v2-discriminator")
        self.attention_mask = attention_mask
        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:

            self.dropout = nn.Dropout(p=dr_rate)
Example #13
 def __call_model_torch(self):
     if self.model_to_use.lower() == 'bert':
         self.config = BertConfig(num_labels=2)
         self.model = BertForSequenceClassification.from_pretrained(
             'bert-base-uncased', config=self.config)
     elif self.model_to_use.lower() == 'albert':
         self.config = AlbertConfig(num_labels=2)
         self.model = AlbertForSequenceClassification.from_pretrained(
             'albert-base-v1', config=self.config)
     elif self.model_to_use.lower() == 'electra':
         self.config = ElectraConfig(num_labels=2)
         self.model = ElectraForSequenceClassification.from_pretrained(
             'google/electra-small-discriminator', config=self.config)
     elif self.model_to_use.lower() == 'distilbert':
         self.config = DistilBertConfig(num_labels=2)
         self.model = DistilBertForSequenceClassification.from_pretrained(
             'distilbert-base-uncased', config=self.config)
     else:
         print('Model not available yet.')
Example #14
 def __init__(self,
              config: ElectraConfig,
              embeddings,
              discriminator=None,
              embed_layer=None):
     super().__init__()
     self.embed_layer = nn.Embedding(num_embeddings=config.vocab_size,
                                     embedding_dim=config.embedding_size,
                                     padding_idx=config.vocab_size - 1)
     if embed_layer:
         self.embed_layer.load_state_dict(torch.load(embed_layer))
     else:
         self.embed_layer.weight = nn.Parameter(embeddings)
     if discriminator:
         self.discriminator = ElectraForSequenceClassification.from_pretrained(
             discriminator, config=config)
     else:
         self.discriminator = ElectraForSequenceClassification(config)
     self.softmax = nn.Softmax(1)
Example #15
 def __init__(
     self,
     model_dir,
     vocab_dir="skplanet/dialog-koelectra-small-discriminator",
     label_list=None,
     cuda=False,
 ):
     if cuda:
         device = "cuda" if torch.cuda.is_available() else "cpu"
     else:
         device = "cpu"
     self.device = torch.device(device)
     self.model = ElectraForSequenceClassification.from_pretrained(model_dir)
     self.model.to(self.device)
     self.model.eval()
     self.tokenizer = ElectraTokenizer.from_pretrained(
         vocab_dir, do_lower_case=False
     )
     self.label_list = None
     if label_list:
         self.label_list = label_list
Example #16
 def call(self):
     if self.model_to_use.lower() == 'bert':
         self.model = BertForSequenceClassification.from_pretrained(
             'bert-base-uncased',
             num_labels=2,
             output_attentions=False,
             output_hidden_states=False)
         print('Bert Cargado.')
         print(self.model)
     elif self.model_to_use.lower() == 'albert':
         self.model = AlbertForSequenceClassification.from_pretrained(
             'albert-base-v1',
             num_labels=2,
             output_attentions=False,
             output_hidden_states=False)
     elif self.model_to_use.lower() == 'electra':
         self.model = ElectraForSequenceClassification.from_pretrained(
             'google/electra-small-discriminator',
             num_labels=2,
             output_attentions=False,
             output_hidden_states=False)
     elif self.model_to_use.lower() == 'distilbert':
         self.model = DistilBertForSequenceClassification.from_pretrained(
             'distilbert-base-uncased',
             num_labels=2,
             output_attentions=False,
             output_hidden_states=False)
     else:
         print('Model not available right now.')
     self.model.to(self.device)
     self.optimizer = AdamW(self.model.parameters(),
                            lr=self.learning_rate,
                            eps=self.epsilon)
     self.total_steps = len(self.train_dataloader) * self.epochs
     self.scheduler = get_linear_schedule_with_warmup(
         self.optimizer,
         num_warmup_steps=0,
         num_training_steps=self.total_steps)
Example #17
def main(args):
    """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
  """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # load tokenizer
    TOK_NAME = args.model_name
    tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)

    # load my model
    model_dir = f'./results/{args.id}/checkpoint-{args.checkpoint}'
    if args.model_type == 'bert':
        model = BertForSequenceClassification.from_pretrained(model_dir)
    elif args.model_type == 'electra':
        model = ElectraForSequenceClassification.from_pretrained(model_dir)
    elif args.model_type == 'roberta':
        model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)
    model.to(device)

    # load test datset
    # root = "/opt/ml"
    # root = "/content/drive/MyDrive/Boostcamp/Stage2_KLUE"
    root = args.root
    test_dataset, test_label = load_test_dataset(root, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)
    # logits, predictions = inference(model, test_dataset, device)

    # make a csv file with the predicted answers
    # (keep the output directory and column layout below unchanged)
    output = pd.DataFrame(pred_answer, columns=['pred'])
    # output = pd.DataFrame(predictions, columns=['pred'])
    output.to_csv(f'./results/{args.id}/submission{args.id}.csv', index=False)
    # np.save(f'./results/{args.id}/logits{args.id}.npy', logits)
    print('File saved')
Example #18
 def pretrained_tokenizer_and_model(self):
     print(f'Model Class : {self.model_type}')
     if self.model_type == 'bert':
         pretrained_model = 'bert-base-uncased'
         self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
         self.model = BertForSequenceClassification.from_pretrained(
             pretrained_model, num_labels=self.labels_count)
     elif self.model_type == 'roberta':
         pretrained_model = 'roberta-base'
         self.tokenizer = RobertaTokenizer.from_pretrained(pretrained_model)
         self.model = RobertaForSequenceClassification.from_pretrained(
             pretrained_model, num_labels=self.labels_count)
     elif self.model_type == 'distilbert':
         pretrained_model = 'distilbert-base-uncased'
         self.tokenizer = DistilBertTokenizer.from_pretrained(
             pretrained_model)
         self.model = DistilBertForSequenceClassification.from_pretrained(
             pretrained_model, num_labels=self.labels_count)
     elif self.model_type == 'electra':
         pretrained_model = 'google/electra-small-discriminator'
         self.tokenizer = ElectraTokenizer.from_pretrained(pretrained_model)
         self.model = ElectraForSequenceClassification.from_pretrained(
             pretrained_model, num_labels=self.labels_count)
     if self.device.type == 'cuda': self.model.cuda()
Example #19
from transformers import ElectraForSequenceClassification, ElectraTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np

model = ElectraForSequenceClassification.from_pretrained(
    'google/electra-small-discriminator')
tokenizer = ElectraTokenizerFast.from_pretrained(
    'google/electra-small-discriminator')

import random


def tokenize(batch):
    return tokenizer(batch['text'],
                     truncation=True,
                     max_length=256,
                     add_special_tokens=True,
                     padding='max_length',
                     return_attention_mask=True)


train_dataset = load_dataset(
    'json',
    data_files={'train': 'dataset_full_question/quanta_train.json'},
    field='questions')['train']
train_dataset = train_dataset.map(
    lambda example: {'label': [0 if example['difficulty'] == 'School' else 1]})
train_dataset = train_dataset.map(tokenize,
                                  batched=True,
                                  batch_size=len(train_dataset))
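The snippet stops after tokenizing; a hedged continuation (the TrainingArguments values are placeholders) would set the tensor format and wire up the already-imported Trainer.

train_dataset.set_format('torch',
                         columns=['input_ids', 'attention_mask', 'label'])

training_args = TrainingArguments(
    output_dir='./results',              # placeholder output path
    num_train_epochs=1,
    per_device_train_batch_size=16,
    logging_steps=50,
)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()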
Example #20
def main(args):

    nsmc = h5py.File(f'{args.data_root}/nsmc.h5', 'r')

    train_dataset = nsmc['train']
    test_dataset = nsmc['test']

    print('\n====================== Dataset Summary ======================\n')
    print(f"Train Label : {train_dataset['label']}")
    print(f"Train Input Ids : {train_dataset['input_ids']}")
    print(f"Train Attention Mask : {train_dataset['attention_mask']}")
    print(f"Test Label : {test_dataset['label']}")
    print(f"Test Input Ids : {test_dataset['input_ids']}")
    print(f"Test Attention Mask : {test_dataset['attention_mask']}")
    print('\n=============================================================\n')

    train_label = np.array(train_dataset['label'])
    train_input_ids = np.array(train_dataset['input_ids'])
    train_attention_mask = np.array(train_dataset['attention_mask'])

    test_label = np.array(test_dataset['label'])
    test_input_ids = np.array(test_dataset['input_ids'])
    test_attention_mask = np.array(test_dataset['attention_mask'])

    nsmc.close()

    train_dataset = NSMCDataset(train_label, train_input_ids,
                                train_attention_mask)
    test_dataset = NSMCDataset(test_label, test_input_ids, test_attention_mask)

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.n_workers)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.n_workers)

    if torch.cuda.is_available() and args.cuda:
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    model = ElectraForSequenceClassification.from_pretrained(
        "monologg/koelectra-base-v3-discriminator")
    model = nn.parallel.DataParallel(model)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=args.lr)

    # Plot Loss and Images in Tensorboard
    experiment_dir = 'logs/{}@{}'.format(
        'NSMC',
        datetime.now().strftime("%d.%m.%Y-%H:%M:%S"))
    os.makedirs(f"{experiment_dir}/checkpoints", exist_ok=True)
    writer = SummaryWriter(os.path.join(experiment_dir, "tb"))

    metric_dict = defaultdict(list)
    metric_dict_epoch_train = defaultdict(list)
    metric_dict_epoch_test = defaultdict(list)

    ##########################################
    ################ Training ################
    ##########################################

    n_iters_total = 0

    for epoch in range(args.n_epochs):

        total_loss_train = 0.0
        correct = 0
        total = 0
        model.train()

        for idx, (label, input_ids,
                  attention_masks) in tqdm(enumerate(train_loader),
                                           total=len(train_loader)):

            optimizer.zero_grad()

            label = label.to(device)
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)

            output = model(input_ids, attention_masks)[0]  # (batch_size, 2)
            _, pred = torch.max(output, 1)  # (batch_size)

            loss = F.cross_entropy(output, label)
            loss.backward()
            optimizer.step()

            total_loss_train += loss.item()
            correct += (pred == label).sum()
            total += len(label)
            train_accuracy = correct.float() / total

            if n_iters_total % 300 == 0:
                print(f"Batch Loss : {loss} / Accuracy : {train_accuracy}")

            metric_dict['train_loss'].append(loss.item())
            metric_dict['train_accuracy'].append(train_accuracy.item())
            n_iters_total += 1

            for title, value in metric_dict.items():
                writer.add_scalar('train/{}'.format(title), value[-1],
                                  n_iters_total)

        train_accuracy = correct.float() / total
        metric_dict_epoch_train['train_total_loss_epoch'].append(
            total_loss_train)
        metric_dict_epoch_train['train_accuracy_epoch'].append(train_accuracy)

        for title, value in metric_dict_epoch_train.items():
            writer.add_scalar('train/{}'.format(title), value[-1], epoch)

        print(
            f"Epoch : {epoch} / Train Loss : {total_loss_train} / Accuracy : {train_accuracy}"
        )

        ##########################################
        ################## Test ##################
        ##########################################

        test_correct = 0
        test_total = 0
        total_loss_test = 0.0
        model.eval()

        with torch.no_grad():
            for idx, (label, input_ids,
                      attention_masks) in tqdm(enumerate(test_loader),
                                               total=len(test_loader)):

                label = label.to(device)
                input_ids = input_ids.to(device)
                attention_masks = attention_masks.to(device)

                output = model(input_ids, attention_masks)[0]
                _, pred = torch.max(output, 1)  # values, indices

                loss = F.cross_entropy(output, label)
                total_loss_test += loss
                test_correct += (pred == label).sum()
                test_total += len(label)

            test_accuracy = test_correct.float() / test_total
            metric_dict_epoch_test['test_total_loss_epoch'].append(
                total_loss_test)
            metric_dict_epoch_test['test_accuracy_epoch'].append(test_accuracy)

            for title, value in metric_dict_epoch_test.items():
                writer.add_scalar('test/{}'.format(title), value[-1], epoch)

            print(f"Test Accuracy : {test_accuracy}")

            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': model.module.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'train_accuracy': train_accuracy,
                    'test_accuracy': test_accuracy,
                }, os.path.join(experiment_dir, "checkpoints", str(epoch)))
Example #21
def model_setting():
    model = ElectraForSequenceClassification.from_pretrained(MODEL_NAME).to(
        device)
    #model.load_state_dict(torch.load(pre_MODEL_NAME))
    model.load_state_dict(torch.load(pre_MODEL_NAME, map_location=device))
    return model
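model_setting relies on module-level globals; a hedged sketch of the values it expects (the checkpoint path is hypothetical, and the saved state_dict is assumed to match the default two-label head) followed by a call site.

import torch

MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
pre_MODEL_NAME = "./checkpoints/best_state_dict.pt"   # hypothetical path to a fine-tuned state_dict
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model_setting()
model.eval()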
Example #22
def main():

    nsmc = h5py.File('../data/nsmc.h5', 'r')

    train_dataset = nsmc['train']
    test_dataset = nsmc['test']

    print('\n====================== Dataset Summary ======================\n')
    print(f"Train Label : {train_dataset['label']}")
    print(f"Train Input Ids : {train_dataset['input_ids']}")
    print(f"Train Attention Mask : {train_dataset['attention_mask']}")

    print(f"Test Label : {test_dataset['label']}")
    print(f"Test Input Ids : {test_dataset['input_ids']}")
    print(f"Test Attention Mask : {test_dataset['attention_mask']}")
    print('\n=============================================================\n')

    train_label = np.array(train_dataset['label'])
    train_input_ids = np.array(train_dataset['input_ids'])
    train_attention_mask = np.array(train_dataset['attention_mask'])

    test_label = np.array(test_dataset['label'])
    test_input_ids = np.array(test_dataset['input_ids'])
    test_attention_mask = np.array(test_dataset['attention_mask'])

    nsmc.close()

    train_dataset = NSMCDataset(train_label, train_input_ids,
                                train_attention_mask)
    test_dataset = NSMCDataset(test_label, test_input_ids, test_attention_mask)

    train_loader = DataLoader(train_dataset,
                              batch_size=55,
                              shuffle=True,
                              num_workers=8)
    test_loader = DataLoader(test_dataset,
                             batch_size=55,
                             shuffle=False,
                             num_workers=8)

    model = ElectraForSequenceClassification.from_pretrained(
        "monologg/koelectra-base-v3-discriminator")
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    args = add_argument()

    model_engine, _, _, _ = deepspeed.initialize(args=args,
                                                 model=model,
                                                 model_parameters=parameters)

    losses = []
    accuracies = []

    for epoch in range(args.epochs):

        total_loss = 0.0
        correct = 0
        total = 0
        batches = 0

        for idx, (label, input_ids,
                  attention_masks) in tqdm(enumerate(train_loader),
                                           total=len(train_loader)):

            label = label.to(model_engine.local_rank)
            input_ids = input_ids.to(model_engine.local_rank)
            attention_masks = attention_masks.to(model_engine.local_rank)

            # Model Inference
            output = model_engine(input_ids, attention_masks)[0]
            _, pred = torch.max(output, 1)
            loss = F.cross_entropy(output, label)

            model_engine.backward(loss)
            model_engine.step()

            total_loss += loss.item()
            correct += (pred == label).sum()
            total += len(label)

            batches += 1

            if batches % 100 == 0:
                print(
                    f"Batch Loss : {total_loss} / Accuracy : {correct.float() / total}"
                )

        losses.append(total_loss)
        accuracies.append(correct.float() / total)
        print(
            f"Epoch : {epoch} / Train Loss : {total_loss} / Accuracy : {correct.float() / total}"
        )

        test_correct = 0
        test_total = 0

        with torch.no_grad():

            for idx, (label, input_ids,
                      attention_masks) in tqdm(enumerate(test_loader),
                                               total=len(test_loader)):

                label = label.to(model_engine.local_rank)
                input_ids = input_ids.to(model_engine.local_rank)
                attention_masks = attention_masks.to(model_engine.local_rank)

                # Model Inference
                output = model_engine(input_ids, attention_masks)[0]
                _, pred = torch.max(output, 1)

                test_correct += (pred == label).sum()
                test_total += len(label)

        print(f"Test Accuracy : {test_correct.float() / test_total}")
        model_engine.save_checkpoint('../weights', f"KoELECTRA_{epoch}")
Example #23
def main():
    args = parse_args()
    os.makedirs(args.output_dir, exist_ok=True)
    set_seed(args.seed)
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s :: %(levelname)s :: %(message)s')

    if args.numnet_model is not None:
        config = BertConfig.from_pretrained(
            args.model_name, num_labels=1)  # 1 label for regression
        # if args.contrastive:
        #     model = ContrastiveElectra.from_pretrained(args.model_name, config=config)
        # else:
        model = BertForSequenceClassification.from_pretrained(args.model_name,
                                                              config=config)
        state_dicts = torch.load(args.numnet_model)
        if "model" in state_dicts:
            logging.info("Loading in mutual electra format state_dicts.")
            model.load_state_dict(state_dicts["model"], strict=False)
        else:
            logging.info("Loading model weights only.")
            model.load_state_dict(state_dicts, strict=False)
    else:
        config = ElectraConfig.from_pretrained(
            args.model_name, num_labels=1)  # 1 label for regression
        model = ElectraForSequenceClassification.from_pretrained(
            args.model_name, config=config)
        if args.local_model_path is not None:
            state_dicts = torch.load(args.local_model_path)
            model.load_state_dict(state_dicts["model"])

    tokenizer = ElectraTokenizer.from_pretrained(args.model_name,
                                                 do_lower_case=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # TODO enable multi-gpu training if necessary
    pretrain_train_dataset = DapoDataset(args.data_dir, "train",
                                         tokenizer) if args.pretrain else None
    pretrain_dev_dataset = DapoDataset(args.data_dir, "dev",
                                       tokenizer) if args.pretrain else None

    if args.train:
        if args.contrastive:
            train_dataset = ContrastiveDataset(args.data_dir, "train",
                                               tokenizer)
            train_dataloader = DataLoader(train_dataset,
                                          batch_size=args.train_batch_size,
                                          shuffle=False,
                                          num_workers=8,
                                          collate_fn=mutual_contrast_collate)
            dev_dataset = ContrastiveDataset(
                args.data_dir, "dev",
                tokenizer) if args.eval or args.test else None
            dev_dataloader = DataLoader(dev_dataset,
                                        batch_size=args.train_batch_size,
                                        shuffle=False,
                                        num_workers=8,
                                        collate_fn=mutual_contrast_collate
                                        ) if dev_dataset is not None else None
        else:
            train_dataset = MutualDataset(args.data_dir, "train", tokenizer)
            train_dataloader = DataLoader(train_dataset,
                                          batch_size=args.train_batch_size,
                                          shuffle=True,
                                          num_workers=8,
                                          collate_fn=mutual_collate)
            dev_dataset = MutualDataset(
                args.data_dir, "dev",
                tokenizer) if args.eval or args.test else None
            dev_dataloader = DataLoader(
                dev_dataset,
                batch_size=args.train_batch_size,
                shuffle=False,
                num_workers=8,
                collate_fn=mutual_collate) if dev_dataset is not None else None

    else:
        train_dataset, train_dataloader = None, None

    # TODO: add test_dataset if we want to submit to leaderboard

    pretrain_train_dataloader = DataLoader(
        pretrain_train_dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        num_workers=8,
        collate_fn=dapo_collate
    ) if pretrain_train_dataset is not None else None
    pretrain_dev_dataloader = DataLoader(
        pretrain_dev_dataset,
        batch_size=args.train_batch_size,
        shuffle=False,
        num_workers=8,
        collate_fn=dapo_collate) if pretrain_dev_dataset is not None else None

    # currently eval_batch_size = train_batch_size

    if args.pretrain:
        logging.info("Start pretraining...")
        args.eval = True
        trainer = Trainer(args, model, device, pretrain_train_dataloader,
                          pretrain_dev_dataloader)
        trainer.train()
        return  # fine-tuning should be done separately

    if args.train:
        logging.info("Start training...")
        trainer = Trainer(args, model, device, train_dataloader,
                          dev_dataloader)
        trainer.train()

    # TODO: currently testing is on the dev set
    if args.test:
        logging.info("Start testing...")
        tester = Tester(args, model, device, dev_dataset, dev_dataloader)
        tester.test()
Example #24
def main(args):
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/train.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Set the seed value all over the place to make this reproducible.
    seed_val = args.seed
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    # Choose device
    device = get_default_device()

    prompts_train_idxs = np.loadtxt(args.train_prompts_idxs_path, dtype=np.int64)
    topics_dist = np.loadtxt(args.unique_prompts_distribution_path, dtype=np.int32)

    # Normalise
    topics_dist = topics_dist / np.linalg.norm(topics_dist, 1)

    # Load the BERT tokenizer.
    print('Loading BERT tokenizer...')
    tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator', do_lower_case=True)

    with open(args.unique_prompts_path) as f:
        topics = f.readlines()
    # Remove whitespaces and convert to lowercase
    topics = [x.strip().lower() for x in topics]

    with open(args.train_resps_path) as f:
        responses = f.readlines()
    # Remove whitespaces and convert to lower case
    responses = [x.strip().lower() for x in responses]

    # Tokenize all the prompts and the responses and then map the tokens to their word IDs
    topic_ids = []
    for sent in topics:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        topic_ids.append(encoded_sent)

    resp_ids = []
    for sent in responses:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        resp_ids.append(encoded_sent)
    
    MAX_LEN_topic = max([len(sen) for sen in topic_ids])
    MAX_LEN_resp = max([len(sen) for sen in resp_ids])
    print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

    # Pad our input tokens with value 0.
    # "post" indicates that we want to pad and truncate at the end of the sequence,
    # as opposed to the beginning.
    topic_ids = pad_sequences(topic_ids, maxlen=MAX_LEN_topic, dtype="long", 
                            value=0, truncating="post", padding="post")

    resp_ids = pad_sequences(resp_ids, maxlen=MAX_LEN_resp, dtype="long", 
                            value=0, truncating="post", padding="post")

    # The attention mask simply makes it explicit which tokens are actual words versus which are padding.
    attention_masks_topic = []
    # For each sentence...
    for sent in topic_ids:
        # Create the attention mask.
        #   - If a token ID is 0, then it's padding, set the mask to 0.
        #   - If a token ID is > 0, then it's a real token, set the mask to 1.
        att_mask = [int(token_id > 0) for token_id in sent]
        # Store the attention mask for this sentence.
        attention_masks_topic.append(att_mask)
    attention_masks_resp = []
    for sent in resp_ids:
        # Create the attention mask.
        #   - If a token ID is 0, then it's padding, set the mask to 0.
        #   - If a token ID is > 0, then it's a real token, set the mask to 1.
        att_mask = [int(token_id > 0) for token_id in sent]
        # Store the attention mask for this sentence.
        attention_masks_resp.append(att_mask)

    # Convert to torch tensors

    prompts_train_idxs = torch.from_numpy(prompts_train_idxs)
    prompts_train_idxs = prompts_train_idxs.long()

    topic_ids = torch.tensor(topic_ids)
    topic_ids = topic_ids.long()
    topic_ids = topic_ids.to(device)

    attention_masks_topic = torch.tensor(attention_masks_topic)
    attention_masks_topic = attention_masks_topic.long()
    attention_masks_topic = attention_masks_topic.to(device)

    resp_ids = torch.tensor(resp_ids)
    resp_ids = resp_ids.long()
    resp_ids = resp_ids.to(device)

    attention_masks_resp = torch.tensor(attention_masks_resp)
    attention_masks_resp = attention_masks_resp.long()
    attention_masks_resp = attention_masks_resp.to(device)

    # Create the DataLoader for our training set.
    print(prompts_train_idxs.size(0))
    print(resp_ids.size(0))
    print(attention_masks_resp.size(0))
    train_data = TensorDataset(prompts_train_idxs, resp_ids, attention_masks_resp)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size)

    # Load ElectraForSequenceClassification, the pretrained ELECTRA discriminator with a
    # single linear classification layer on top.
    model = ElectraForSequenceClassification.from_pretrained(
        "google/electra-base-discriminator", # the 12-layer ELECTRA base discriminator
        num_labels = 2, # The number of output labels--2 for binary classification.
                        # You can increase this for multi-class tasks.   
        output_attentions = False, # Whether the model returns attentions weights.
        output_hidden_states = False, # Whether the model returns all hidden-states.
    )
    model.to(device)

    # Note: AdamW is a class from the huggingface library (as opposed to pytorch);
    # the 'W' stands for 'Weight Decay fix'.
    optimizer = AdamW(model.parameters(),
                    lr = args.learning_rate,
                    eps = args.adam_epsilon
                    )

    loss_values = []

    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * args.n_epochs
    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps = 0, # Default value in run_glue.py
                                                num_training_steps = total_steps)


    for epoch in range(args.n_epochs):
        # Perform one full pass over the training set.
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, args.n_epochs))
        print('Training...')
        # Measure how long the training epoch takes.
        t0 = time.time()
        # Reset the total loss for this epoch.
        total_loss = 0
        model.train()      
    # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            p_id = batch[0].to(device)
            r = batch[1].to(device)
            r_msk = batch[2].to(device)
            # Perform dynamic shuffling
            p_id, r, r_msk, y_true, batch_size = _shuffle(p_id, r, r_msk, topics_dist, args.num_topics, device)           
            # Get the prompts from the topics
            p, p_msk = _get_prompts(p_id, topic_ids, attention_masks_topic)
            p, p_msk = p.to(device), p_msk.to(device)
            # Concatenate prompts and responses
            pr_resp, pr_resp_msk = _join_pr_resp(p, p_msk, r, r_msk, args.reverse)
            pr_resp, pr_resp_msk = pr_resp.to(device), pr_resp_msk.to(device)
            model.zero_grad()

            # Perform a forward pass (evaluate the model on this training batch).
            # This will return the loss (rather than the model output) because we
            # have provided the `labels`.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(pr_resp, token_type_ids=None, attention_mask=pr_resp_msk, labels=y_true)
            
            # The call to `model` always returns a tuple, so we need to pull the 
            # loss value out of the tuple.
            loss = outputs[0]
            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value 
            # from the tensor.
            total_loss += loss.item()
            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()
            # Update the learning rate.
            scheduler.step()
        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)            
        
        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

        # NEED TO DO THE VALIDATION CODE NOW - see the rest of the tutorial at
        # https://medium.com/@aniruddha.choudhury94/part-2-bert-fine-tuning-tutorial-with-pytorch-for-text-classification-on-the-corpus-of-linguistic-18057ce330e1

    # Save the model to a file
    file_path = args.save_path+'electra_classifier_seed'+str(args.seed)+'.pt'
    torch.save(model, file_path)
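Since the script serializes the whole module with torch.save(model, file_path), a hedged reload sketch for later scoring would mirror that path; the filename below is a placeholder following the naming scheme above, and transformers must be installed so the pickled class can be resolved.

import torch

model = torch.load('electra_classifier_seed1.pt', map_location='cpu')   # placeholder filename
model.eval()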
Example #25
import os

import torch
import numpy as np
from flask import Flask, request, jsonify
from transformers import (ElectraTokenizer, ElectraForSequenceClassification)

app = Flask(__name__)
device = "cuda" if torch.cuda.is_available() else "cpu"

max_seq_length = int(os.getenv("PODOLI_MAX_LENGTH", 128))
model = ElectraForSequenceClassification.from_pretrained('model')
model.to(device)

tokenizer = ElectraTokenizer.from_pretrained(
    "monologg/koelectra-small-v3-discriminator",
    do_lower_case=False
)

def featurize(comments):
    tokens_a = tokenizer.tokenize(comments)
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
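    # Hedged completion (the scraped snippet is cut off above): close the sequence,
    # convert tokens to ids, and pad the ids and attention mask to max_seq_length.
    tokens.append("[SEP]")
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    return (torch.tensor([input_ids], dtype=torch.long).to(device),
            torch.tensor([input_mask], dtype=torch.long).to(device))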
Example #26
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

# Only the train split is used for training; using the test split for training is penalized
train_dataset = NSMC_Dataset("ratings_train.txt","train")
test_dataset = NSMC_Dataset("ratings_test.txt","train")
sample_dataset = NSMC_Dataset("ko_data.csv","sample")

tmpstr = '훌륭하다. 초한지 얼른 읽어보고 다시 봐야겠다. 연출 훌륭하다 껄껄 한신의 토사구팽은 슬펐다'
print( train_dataset.clean_text( txt = tmpstr) )

"""# 모델 생성 (Create Model)"""

model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator").to(device)
model.cuda()

# Quick sanity check: run the model once
#text, attention_mask, y = train_dataset[0]
#model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

try:
  model.load_state_dict(torch.load("model.pt"))
except:
  print("error - model.load_state_dict(torch.load('model.pt'))")
else:
  print("success - model.load_state_dict(torch.load('model.pt'))")

# Inspect the model layers
model
Example #27
def train():
    # load model and tokenizer
    #MODEL_NAME = "bert-base-multilingual-cased"
    MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
    tokenizer = ElectraTokenizer.from_pretrained(MODEL_NAME)
    print(tokenizer.tokenize("이순신은 조선 중기의 무신이다."))
    print(tokenizer.tokenize("아버지가방에들어가신다."))
    tokenized_str = tokenizer.tokenize("이순신은 조선 중기의 무신이다." +
                                       tokenizer.sep_token + "아버지가방에들어가신다.")
    print(tokenized_str)

    # load dataset
    train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
    train_label = train_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    train_dataset, dev_dataset = torch.utils.data.random_split(
        RE_train_dataset, [7000, 2001])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setting model hyperparameter
    bert_config = ElectraConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = ElectraForSequenceClassification.from_pretrained(
        MODEL_NAME, config=bert_config)
    #model.parameters
    model.to(device)

    # Many more options are available beyond the ones used here; see
    # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
    training_args = TrainingArguments(
        output_dir='./results',  # output directory
        save_total_limit=4,  # number of total save model.
        load_best_model_at_end=True,
        save_steps=100,  # model saving step.
        num_train_epochs=10,  # total number of training epochs
        learning_rate=5e-5,  # learning_rate
        per_device_train_batch_size=8,  # batch size per device during training
        per_device_eval_batch_size=8,  # batch size for evaluation
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        logging_steps=100,  # log saving step.
        evaluation_strategy='steps',  # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate every end of epoch.
        eval_steps=100,  # evaluation step.
        dataloader_num_workers=3,
        label_smoothing_factor=0.5)
    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=dev_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,  # define metrics function
    )

    # train model
    trainer.train()
Example #28
from transformers import ElectraTokenizer, ElectraForSequenceClassification, pipeline
from pprint import pprint

tokenizer = ElectraTokenizer.from_pretrained(
    "monologg/koelectra-small-finetuned-nsmc")
model = ElectraForSequenceClassification.from_pretrained(
    "monologg/koelectra-small-finetuned-nsmc")

nsmc = pipeline("sentiment-analysis", tokenizer=tokenizer, model=model)

texts = [
    "이 영화는 미쳤다. 넷플릭스가 일상화된 시대에 극장이 존재해야하는 이유를 증명해준다.",
    "촬영감독의 영혼까지 갈아넣은 마스터피스",
    "보면서 화가날수있습니다.",
    "아니 그래서 무슨말이 하고싶은거야 ㅋㅋㅋ",
]

pprint(nsmc(texts))
Example #29
from transformers import ElectraForSequenceClassification, ElectraTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model = ElectraForSequenceClassification.from_pretrained(
    'models/ELECTRA_last_line')
tokenizer = ElectraTokenizerFast.from_pretrained(
    'google/electra-small-discriminator')


def tokenize(batch):
    return tokenizer(batch['text'],
                     truncation=True,
                     max_length=128,
                     add_special_tokens=True,
                     padding='max_length',
                     return_attention_mask=True)


test_dataset = load_dataset(
    'json',
    data_files={'test': 'dataset_last_line/quanta_test.json'},
    field='questions')['test']
test_dataset = test_dataset.map(
    lambda example: {'label': [0 if example['difficulty'] == 'School' else 1]})
test_dataset = test_dataset.map(tokenize,
                                batched=True,
                                batch_size=len(test_dataset))
test_dataset.set_format('torch',
Example #30
# tmpstr = "Come on.  Hello?  I'm sorry you have the wrong number.   Okay, I'll call you later dad. I love you."
# print( train_dataset.clean_text( txt = tmpstr) )

# test_emotion = 'joy'
# if test_emotion in train_dataset.emotion_dic.keys() :
#   print( train_dataset.emotion_dic[test_emotion] )
# else :
#   print(0)

# print( train_dataset.__getitem__(11790) )

# print( sample_dataset.__getitem__(10) )

"""# 모델 생성 (Create Model)"""

model = ElectraForSequenceClassification.from_pretrained("google/electra-base-discriminator", num_labels=8).to(device)
#model.cuda()

# Quick sanity check: run the model once
#text, attention_mask, y = train_dataset[0]
#model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

try:
  model.load_state_dict(torch.load("model.pt"))
  #model.load_state_dict(torch.load("/content/drive/MyDrive/Colab Notebooks/model_daniel021_friends_electra_base_epoch4.pt"))
  #model.load_state_dict(torch.load("/content/drive/MyDrive/Colab Notebooks/model_daniel021_friends_electra_large_epoch8.pt"))
except:
  print("error - model.load_state_dict(torch.load(...))")
else:
  print("success - model.load_state_dict(torch.load(...))")