Example 1
def get_bpe_tokenizer(vocab, uppercase=False):
    return tokenizers.ByteLevelBPETokenizer(
        vocab=vocab,
        add_prefix_space=True,
        lowercase=not uppercase,
        trim_offsets=True,
    )
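For orientation, here is a hedged usage sketch of a tokenizer built this way: encode a string and read the ids, tokens, and character offsets from the returned Encoding. The vocab/merges paths are placeholders, and they are passed positionally because the keyword names differ slightly across tokenizers versions.

# Usage sketch (not part of the example above): encode text and inspect the result.
import tokenizers

tok = tokenizers.ByteLevelBPETokenizer(
    "vocab.json",          # placeholder vocab path
    "merges.txt",          # placeholder merges path
    add_prefix_space=True,
    lowercase=True,
)
enc = tok.encode("hello world")
print(enc.ids)      # token ids
print(enc.tokens)   # byte-level tokens; a leading space shows up as the Ġ character
print(enc.offsets)  # (start, end) character spans per token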
 def __init__(self, data_path, max_length=64, qa=False, model='bert'):
     super(InferDataset, self).__init__()
     assert model in ['bert', 'roberta']
     self.data_path = data_path
     self.id_list = list()
     self.text_list = list()
     self.label = list()
     self.max_length = max_length
     self.model = model
     self.label_map = {'neutral': 0, 'negative': 1, 'positive': 2}
     if self.model == 'bert':
         self.sent_id = {0: 8699, 1: 4997, 2: 3893}
         self.tokenizer = tokenizers.BertWordPieceTokenizer(
             '/home/liu/DL_workstation/tweet-sent/tweet-pytorch/tools/vocab.txt',
             lowercase=True)
     elif self.model == 'roberta':
         self.sent_id = {2: 1313, 1: 2430, 0: 7974}
         self.tokenizer = tokenizers.ByteLevelBPETokenizer(
             vocab_file=
             '/home/liu/DL_workstation/tweet-sent/tweet-pytorch/tools/vocab.json',
             merges_file=
             '/home/liu/DL_workstation/tweet-sent/tweet-pytorch/tools/merges.txt',
             lowercase=True,
             add_prefix_space=True)
     self.qa = qa
     if self.qa:
         print('\n\n Inferring with Question Answering Mode...')
     self.parse(self.data_path)
     self.invalid_cnt = 0
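The hard-coded sent_id values above are just the ids each tokenizer assigns to the words 'neutral', 'negative' and 'positive'. Below is a hedged sketch of deriving them for the RoBERTa branch, where tokenizer stands for the ByteLevelBPETokenizer constructed above; with add_prefix_space=True each word maps to a single Ġ-prefixed token (the BERT WordPiece tokenizer prepends [CLS], so ids[1] would be needed there).

# Sketch: recover the sentiment token ids instead of hard-coding them.
label_map = {'neutral': 0, 'negative': 1, 'positive': 2}
sent_id = {idx: tokenizer.encode(word).ids[0] for word, idx in label_map.items()}
# expected for roberta-base: {0: 7974, 1: 2430, 2: 1313}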
Example 3
 def create_tokenizer(self):
     tokenizer = tokenizers.ByteLevelBPETokenizer(
         vocab_file='/Users/ccw/PycharmProjects/Wilfred/tf-roberta/roberta-base-vocab.json',
         merges_file='/Users/ccw/PycharmProjects/Wilfred/tf-roberta/roberta-base-merges.txt',
         lowercase=True,
         add_prefix_space=True)
     return tokenizer
Example 4
 def __init__(self, df, max_len=96):
     self.df = df
     self.max_len = max_len
     self.labeled = 'selected_text' in df
     self.tokenizer = tokenizers.ByteLevelBPETokenizer(
         vocab_file=config.vocab_file, 
         merges_file=config.merges_file, 
         lowercase=True,
         add_prefix_space=True)
 def __init__(self, df, max_len=96):
     self.df = df
     self.max_len = max_len
     self.labeled = 'selected_text' in df
     self.tokenizer = tokenizers.ByteLevelBPETokenizer(
         vocab_file='roberta/vocab.json',
         merges_file='roberta/merges.txt',
         lowercase=True,
         add_prefix_space=True)
Example 6
 def get_tokenizer(self):
     tokenizer = tokenizers.ByteLevelBPETokenizer(
         vocab_file=self.config.data.roberta.path +
         self.config.data.roberta.vocab,
         merges_file=self.config.data.roberta.path +
         self.config.data.roberta.merges,
         lowercase=self.config.data.roberta.lowercase,
         add_prefix_space=self.config.data.roberta.add_prefix_space)
     return tokenizer
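For reference, a minimal sketch of the nested config object this helper assumes; only the attribute names come from the code above, the values are placeholders.

# Hypothetical config shape implied by the attribute accesses in get_tokenizer.
from types import SimpleNamespace

config = SimpleNamespace(data=SimpleNamespace(roberta=SimpleNamespace(
    path='/path/to/roberta-base/',  # concatenated with the file names, so keep the trailing slash
    vocab='vocab.json',
    merges='merges.txt',
    lowercase=True,
    add_prefix_space=True,
)))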
 def __init__(self, data_df, config):
     self.data_df, self.config = data_df, config
     self.maxlen = self.config['maxlen']
     self.labeled = 'selected_text' in data_df
     self.tokenizer = tokenizers.ByteLevelBPETokenizer(
         vocab_file=self.config['vocab_file_path'],
         merges_file=self.config['merge_file_path'],
         lowercase=True,
         add_prefix_space=True)
 def __init__(self, df, max_len=96, use_fifth=True):
     self.df = df
     self.max_len = max_len
     self.labeled = 'selected_text' in df
     self.tokenizer = tokenizers.ByteLevelBPETokenizer(
         vocab_file=ROOT_PATH + '/input/roberta-base/vocab.json',
         merges_file=ROOT_PATH + '/input/roberta-base/merges.txt',
         lowercase=True,
         add_prefix_space=True)
     self.use_fifth = use_fifth
Example 9
 def __init__(self, df, max_len=int(config['MODEL']['MAXLEN'])):
     self.df = df
     self.max_len = max_len
     self.labeled = 'selected_text' in df
     self.tokenizer = tokenizers.ByteLevelBPETokenizer(
         vocab_file=os.path.join(config['PATHS']['ROBERTA_PATH'],
                                 'vocab.json'),
         merges_file=os.path.join(config['PATHS']['ROBERTA_PATH'],
                                  'merges.txt'),
         lowercase=True,
         add_prefix_space=True)
Example 10
 def __init__(self, save_tokenizer_path: str, training_files,
              special_tokens, min_frequency: int, lowercase: bool,
              vocab_size: int):
     super(CustomTokenizerTrainer, self).__init__()
     self.save_tokenizer_path = save_tokenizer_path
     self.training_files = training_files
     self.special_tokens = special_tokens
     self.min_frequency = min_frequency
     self.lowercase = lowercase
     self.VOCAB_SIZE = vocab_size
     self.tokenizer = tokenizers.ByteLevelBPETokenizer(
         lowercase=self.lowercase)
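The constructor above only stores its settings; below is a hedged sketch of the training step such a class would typically add, reusing the train / save_model calls that appear in the other examples (the method itself is an assumption, not part of the original class; older tokenizers releases expose save() instead of save_model()).

 def train(self):
     # Hypothetical companion method: fit the byte-level BPE on the raw text
     # files and write vocab.json / merges.txt to the save path.
     self.tokenizer.train(
         files=self.training_files,
         vocab_size=self.VOCAB_SIZE,
         min_frequency=self.min_frequency,
         special_tokens=self.special_tokens,
     )
     self.tokenizer.save_model(self.save_tokenizer_path)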
Example 11
class config:
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 2
    EPOCH = 5
    BERT_PATH = './roberta_input'
    SAVE_PATH = './output'
    TOKENIZER = tokenizers.ByteLevelBPETokenizer(
        vocab_file=os.path.join(BERT_PATH, 'vocab.json'),
        merges_file=os.path.join(BERT_PATH, 'merges.txt'),
        lowercase=True,
        # add_prefix_space=True
    )
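Note that add_prefix_space is commented out in this config. Whether a prefix space is added changes how the very first word of a string is segmented; a small hedged sketch of the difference (the vocab/merges paths are placeholders):

# Sketch: effect of add_prefix_space on the first token.
import tokenizers

tok_plain = tokenizers.ByteLevelBPETokenizer("vocab.json", "merges.txt", lowercase=True)
tok_space = tokenizers.ByteLevelBPETokenizer("vocab.json", "merges.txt", lowercase=True,
                                             add_prefix_space=True)
print(tok_plain.encode("positive day").tokens)  # first token has no leading Ġ
print(tok_space.encode("positive day").tokens)  # first token becomes 'Ġpositive'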
Example 12
def main(text_path, out_directory):
    Path(out_directory).mkdir(exist_ok=True, parents=True)

    english_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    german_tokenizer = tokenizers.ByteLevelBPETokenizer()

    german_tokenizer.train(
        [text_path],
        vocab_size=english_tokenizer.vocab_size,
        special_tokens=["<|endoftext|>"],
        show_progress=True,
    )
    german_tokenizer.save_model(out_directory)
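save_model writes vocab.json and merges.txt into out_directory; below is a hedged sketch of reloading the trained tokenizer from those files later, in the same style as the other examples here (the directory name is a placeholder).

# Sketch: reload the trained tokenizer from the files written by save_model.
import os
import tokenizers

out_directory = "german-bpe"  # placeholder: wherever main() saved the model
reloaded = tokenizers.ByteLevelBPETokenizer(
    os.path.join(out_directory, "vocab.json"),
    os.path.join(out_directory, "merges.txt"),
)
print(reloaded.encode("Guten Morgen").tokens)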
class config:
    TRAINING_FILE = "../input/twe-myfolds/train_folds_20200425.csv"
    TRAIN_BATCH_SIZE = 96
    VALID_BATCH_SIZE = 96
    MAX_LEN = 128
    EPOCHS = 6
    conf_file = '/kaggle/input/all-weights/config.json'
    MODEL_PATH = "/kaggle/input/all-weights/"
    TOKENIZER = tokenizers.ByteLevelBPETokenizer(
        vocab_file=f"{MODEL_PATH}/vocab.json",
        merges_file=f"/kaggle/input/tweeter-offline-eval/merges-1.txt",
        lowercase=True,
        add_prefix_space=True)
Example 14
class config:
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 8
    EPOCHS = 5
    ROBERTA_PATH = "../input/roberta-base"
    MODEL_PATH = "model.bin"
    TRAINING_FILE = "../input/tweet-train-folds-v2/train_folds.csv"
    TOKENIZER = tokenizers.ByteLevelBPETokenizer(
        vocab_file=f"{ROBERTA_PATH}/vocab.json", 
        merges_file=f"{ROBERTA_PATH}/merges.txt", 
        lowercase=True,
        add_prefix_space=True
    )
 def __init__(self, df, max_len=128):
     # input DataFrame
     self.df = df
     # maximum sequence length
     self.max_len = max_len
     # whether ground-truth labels (selected_text) are present
     self.labeled = 'selected_text' in df
     # initialize the byte-level BPE tokenizer
     self.tokenizer = tokenizers.ByteLevelBPETokenizer(
         vocab_file='./roberta.base.torch/vocab.json',
         merges_file='./roberta.base.torch/merges.txt',
         lowercase=True,
         add_prefix_space=True
     )
 def __init__(self):
     self.MAX_LEN = 192
     self.TRAIN_BATCH_SIZE = 32
     self.VALID_BATCH_SIZE = 8
     self.EPOCHS = 5
     self.WEIGHTS_DIR = 'weights'
     self.BERT_PATH = "/data/tweet-sentiment-extraction/roberta-base"
     self.MODEL_PATH = "model.bin"
     self.TRAINING_FILE = "/data/tweet-sentiment-extraction/train_folds.csv"
     self.TEST_FILE = "/data/tweet-sentiment-extraction/test.csv"
     self.TRAIN_FILE = "/data/tweet-sentiment-extraction/train.csv"
     self.SAMPLE_FILE = "/data/tweet-sentiment-extraction/sample_submission.csv"
     self.TOKENIZER = tokenizers.ByteLevelBPETokenizer(
         vocab_file=f"{self.BERT_PATH}/vocab.json",
         merges_file=f"{self.BERT_PATH}/merges.txt",
         lowercase=True,
         add_prefix_space=True)
Example 17
    def get_tokenizer(self, model_path):
        tokenizer = None
        if 'roberta' in self.transformer_type:
            tokenizer = tokenizers.ByteLevelBPETokenizer(
                vocab_file=model_path+'vocab.json',
                merges_file=model_path+'merges.txt',
                lowercase=True,
                add_prefix_space=True)
        elif 'bert' in self.transformer_type:
            vocab_path = os.path.join(model_path, 'vocab.txt')
            tokenizer = tokenizers.BertWordPieceTokenizer(
                vocab_path,
                lowercase=True
            )
        else:
            raise RuntimeError(f'{self.transformer_type} is not supported')

        return tokenizer
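One detail worth flagging: the RoBERTa branch builds its paths by plain string concatenation, so model_path must end with a path separator, whereas the BERT branch uses os.path.join and does not. A small self-contained sketch of the pitfall (the path is a placeholder):

# Sketch: with concatenation, the trailing slash on model_path matters.
import os

model_path = '/path/to/roberta-base'             # placeholder, no trailing slash
print(model_path + 'vocab.json')                 # '/path/to/roberta-basevocab.json' (broken)
print(os.path.join(model_path, 'vocab.json'))    # '/path/to/roberta-base/vocab.json'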
Example 18
def main():
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    tokenizer.save_vocabulary('.')

    MAX_LEN = 96
    tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab_file='vocab.json',
            merges_file='merges.txt',
            lowercase=True,
            add_prefix_space=True
            )
    sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}

    twitter_train = pd.read_csv('./kaggle/input/tweet-sentiment-extraction/train.csv', delimiter=',')
    twitter_test = pd.read_csv('./kaggle/input/tweet-sentiment-extraction/test.csv', delimiter=',')
    twitter_train = twitter_train.dropna()

    sentimentExtract = Sentiment(twitter_train[0:21984])
    sentimentExtract.train()
    def __init__(self, df, max_len=96, 
                 vocab_file='../input/roberta-base/vocab.json',
                 merges_file='../input/roberta-base/merges.txt',
                 change_sentiment_p=0.0,
                 premake_dataset=False,
                 ):
        self.df = df
        self.max_len = max_len
        self.labeled = 'selected_text' in df
        self.tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab_file=vocab_file, 
            merges_file=merges_file, 
            lowercase=True,
            add_prefix_space=True)
        self.change_sentiment_p = change_sentiment_p
        self.uniq_sentiment = np.unique(self.df['sentiment'].values)

        self.premake_dataset = premake_dataset
        if self.premake_dataset:
            self.dataset = [self.make_data(i) for i in range(len(self.df))]
class config:
    LEARNING_RATE = 4e-5
    MAX_LEN = 128
    TRAIN_BATCH_SIZE = 50
    VALID_BATCH_SIZE = 32
    EPOCHS = 3
    INPUT_PATH = "/content/drive/My Drive/Tweet Sentiment Extraction/input/"
    TRAINING_FILE = f"{INPUT_PATH}tweet-sentiment-extraction/train_8folds.csv"
    ROBERTA_PATH = f"{INPUT_PATH}roberta-base/"
    TOKENIZER_N = transformers.RobertaTokenizer(
        vocab_file =  f'{ROBERTA_PATH}vocab.json',
        merges_file = f'{ROBERTA_PATH}merges.txt',
        lowercase = True,
        add_prefix_space = True
    )
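    # Note: transformers' RobertaTokenizer has no lowercase option, so that
    # keyword above has no effect; only the tokenizers classes used elsewhere
    # in these examples accept it.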
    # OLD
    TOKENIZER = tokenizers.ByteLevelBPETokenizer(
        vocab_file=f"{ROBERTA_PATH}/vocab.json", 
        merges_file=f"{ROBERTA_PATH}/merges.txt", 
        lowercase=True,
        add_prefix_space=True
    )
Example 21
    def __init__(
        self,
        args,
        dataset,
        source_dictionary,
        dropout=0.1,
        seed=1,
    ):
        super().__init__(dataset)
        self.source_dictionary = source_dictionary
        self.epoch = 0
        self.seed = seed
        self.dropout = dropout

        import tokenizers

        self.hf_tokenizer = tokenizers.ByteLevelBPETokenizer(
            args.gpt2_encoder_json,
            args.gpt2_vocab_bpe,
            add_prefix_space=True,
            dropout=self.dropout,
        )
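Here the tokenizer is built from fairseq-style GPT-2 encoder.json / vocab.bpe files and, unusually, with BPE dropout enabled, which makes segmentation stochastic. A hedged sketch of the effect (the file paths are placeholders):

# Sketch: with dropout > 0, repeated encodings of the same text can differ,
# which is the point of BPE-dropout as a subword regularizer.
import tokenizers

tok = tokenizers.ByteLevelBPETokenizer(
    "encoder.json",        # placeholder for args.gpt2_encoder_json
    "vocab.bpe",           # placeholder for args.gpt2_vocab_bpe
    add_prefix_space=True,
    dropout=0.1,
)
print(tok.encode("unbelievable").tokens)  # one possible segmentation
print(tok.encode("unbelievable").tokens)  # may split differently on another call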
def custom_bpe_tokenizer(corpus,
                         text_filepath,
                         tokenizer_save_path,
                         vocab_size=10000,
                         min_frequency=3):
    if isinstance(corpus[0], list):
        corpus = [" ".join(i) for i in corpus]

    try:
        os.makedirs(text_filepath)
    except OSError:
        pass

    tokenizer = tokenizers.ByteLevelBPETokenizer(
        vocab_file=None,
        merges_file=None,
    )  #SentencePieceBPETokenizer()

    df = pd.DataFrame()
    df['text'] = corpus
    df.to_csv(os.path.join(text_filepath, 'file.txt'),
              header=False,
              index=False)

    try:
        os.makedirs(tokenizer_save_path)
    except OSError:
        pass

    tokenizer.train(
        files=os.path.join(text_filepath, 'file.txt'),
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=['[PAD]', '[UNK]', '[CLS]', '[MASK]', '[SEP]'])

    tokenizer.save(directory=tokenizer_save_path, name='bpe')

    os.remove(os.path.join(text_filepath, 'file.txt'))
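A hedged usage sketch for the helper above, on a toy in-memory corpus (paths and sizes are placeholders; it assumes the older tokenizers API that provides tokenizer.save(directory=..., name=...), which the helper calls):

# Hypothetical call: train a tiny BPE and write the files under ./bpe_tokenizer;
# the helper creates and removes its temporary text file itself.
toy_corpus = ["the quick brown fox", "jumps over the lazy dog"]
custom_bpe_tokenizer(
    toy_corpus,
    text_filepath="./tmp_text",
    tokenizer_save_path="./bpe_tokenizer",
    vocab_size=500,
    min_frequency=1,
)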
Example 23
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

import tokenizers

DIR = "./inputs/datasets/tkm/"

MAX_LEN = 120
# PATH = '../input/tf-roberta/'
PATH = './inputs/datasets/roberta/tokenizer/'
tokenizer = tokenizers.ByteLevelBPETokenizer(
    # vocab_file=PATH + 'vocab-roberta-base.json',
    # merges_file=PATH + 'merges-roberta-base.txt',
    vocab_file=PATH + 'vocab.json',
    merges_file=PATH + 'merges.txt',
    lowercase=True,
    add_prefix_space=True)
SEED = 88888
np.random.seed(SEED)
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}


def proc(train):
    # preprocessing
    ct = train.shape[0]
    input_ids = np.ones((ct, MAX_LEN), dtype='int32')
    attention_mask = np.zeros((ct, MAX_LEN), dtype='int32')
    token_type_ids = np.zeros((ct, MAX_LEN), dtype='int32')
    start_tokens = np.zeros((ct, MAX_LEN), dtype='int32')
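    # (The example is truncated here. What follows is a hedged sketch of how
    #  such rows are typically filled, not the original author's code: encode
    #  each text, lay out <s> text </s></s> sentiment </s> using RoBERTa's
    #  special ids 0 and 2, and mark the attended positions.)
    for k in range(ct):
        text = " " + " ".join(train.loc[k, 'text'].split())
        enc = tokenizer.encode(text)
        s_tok = sentiment_id[train.loc[k, 'sentiment']]
        ids = [0] + enc.ids + [2, 2, s_tok, 2]
        input_ids[k, :len(ids)] = ids
        attention_mask[k, :len(ids)] = 1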
Example 24
def run(args):

    train_df = load_data.load_custom_text_as_pd(args.train_data,sep='\t',header=True, \
                              text_column=['Text'],target_column=['Label'])
    val_df = load_data.load_custom_text_as_pd(args.val_data,sep='\t', header=True, \
                          text_column=['Text'],target_column=['Label'])

    train_df = pd.DataFrame(train_df, copy=False)
    val_df = pd.DataFrame(val_df, copy=False)

    model_save_dir = args.model_save_path
    try:
        os.makedirs(model_save_dir)
    except OSError:
        pass

    train_df.labels, label2idx = data_utils.convert_categorical_label_to_int(train_df.labels, \
                                                             save_path=os.path.join(model_save_dir,'label2idx.pkl'))

    val_df.labels, _ = data_utils.convert_categorical_label_to_int(val_df.labels, \
                                                             save_path=os.path.join(model_save_dir,'label2idx.pkl'))

    if args.berttweettokenizer_path:
        tokenizer = BERTweetTokenizer(args.berttweettokenizer_path)
    else:
        tokenizer = AutoTokenizer.from_pretrained(
            args.transformer_model_pretrained_path)

    if not args.berttweettokenizer_path:
        bpetokenizer = tokenizers.ByteLevelBPETokenizer(args.bpe_vocab_path, \
                                                args.bpe_merges_path)
    else:
        bpetokenizer = None

    if bpetokenizer:
        train_corpus = data_utils.Corpus(train_df.copy(),
                                         tokenizer=bpetokenizer.encode)
        val_corpus = data_utils.Corpus(val_df.copy(),
                                       tokenizer=bpetokenizer.encode)
    else:
        train_corpus = data_utils.Corpus(train_df.copy(),
                                         tokenizer=tokenizer.tokenize)
        val_corpus = data_utils.Corpus(val_df.copy(),
                                       tokenizer=tokenizer.tokenize)

    train_dataset = data_utils.TransformerDataset(train_corpus.data.words, bpetokenizer=bpetokenizer, tokenizer=tokenizer, MAX_LEN=args.max_text_len, \
                  target_label=train_corpus.data.labels, sequence_target=False, target_text=None, conditional_label=None, conditional_all_labels=None)

    val_dataset = data_utils.TransformerDataset(val_corpus.data.words, bpetokenizer=bpetokenizer, tokenizer=tokenizer, MAX_LEN=args.max_text_len, \
                  target_label=val_corpus.data.labels, sequence_target=False, target_text=None, conditional_label=None, conditional_all_labels=None)

    if _torch_tpu_available and args.use_TPU:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=True)

        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=False)

    if _torch_tpu_available and args.use_TPU:
        train_data_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=args.train_batch_size,
            sampler=train_sampler,
            drop_last=True,
            num_workers=2)

        val_data_loader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=args.eval_batch_size,
            sampler=val_sampler,
            drop_last=False,
            num_workers=1)
    else:
        train_data_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=args.train_batch_size)

        val_data_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=args.eval_batch_size)

    config = AutoConfig.from_pretrained(args.transformer_config_path,
                                        output_hidden_states=True,
                                        output_attentions=True)
    basemodel = AutoModel.from_pretrained(
        args.transformer_model_pretrained_path, config=config)
    model = transformer_models.TransformerWithCLS(basemodel)

    if args.use_torch_trainer:
        device = torch.device(
            "cuda" if _torch_gpu_available and args.use_gpu else "cpu")
        if _torch_tpu_available and args.use_TPU:
            device = xm.xla_device()

        if args.use_TPU and _torch_tpu_available and args.num_tpus > 1:
            train_data_loader = torch_xla.distributed.parallel_loader.ParallelLoader(
                train_data_loader, [device])
            train_data_loader = train_data_loader.per_device_loader(device)

        trainer = BasicTrainer(model,
                               train_data_loader,
                               val_data_loader,
                               device,
                               args.transformer_model_pretrained_path,
                               test_data_loader=val_data_loader)

        param_optimizer = list(trainer.model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.001,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]

        num_train_steps = int(len(train_data_loader) * args.epochs)

        if _torch_tpu_available and args.use_TPU:
            optimizer = AdamW(optimizer_parameters,
                              lr=args.lr * xm.xrt_world_size())
        else:
            optimizer = AdamW(optimizer_parameters, lr=args.lr)

        if args.use_apex and _has_apex:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

        def _mp_fn(rank, flags, trainer, epochs, lr, metric, loss_function, optimizer, scheduler, model_save_path, num_gpus, num_tpus,  \
                    max_grad_norm, early_stopping_rounds, snapshot_ensemble, is_amp, seed):
            torch.set_default_tensor_type('torch.FloatTensor')
            a = trainer.train(epochs, lr, metric, loss_function, optimizer, scheduler, model_save_path, num_gpus, num_tpus,  \
                    max_grad_norm, early_stopping_rounds, snapshot_ensemble, is_amp, seed)

        FLAGS = {}
        if _torch_tpu_available and args.use_TPU:
            xmp.spawn(_mp_fn, args=(FLAGS, trainer, args.epochs, args.lr, args.metric, args.loss_function, optimizer, scheduler, args.model_save_path, args.num_gpus, args.num_tpus, \
                     1, 3, False, args.use_apex, args.seed), nprocs=8, start_method='fork')
        else:
            trainer.train(args.epochs, args.lr, args.metric, args.loss_function, optimizer, scheduler, args.model_save_path, args.num_gpus, args.num_tpus,  \
                    max_grad_norm=1, early_stopping_rounds=3, snapshot_ensemble=False, is_amp=args.use_apex, seed=args.seed)

        test_output = trainer.test_output

    elif args.use_lightning_trainer and _torch_lightning_available:
        from pytorch_lightning import Trainer, seed_everything
        seed_everything(args.seed)

        log_args = {
            'description': args.transformer_model_pretrained_path,
            'loss': args.loss_function,
            'epochs': args.epochs,
            'learning_rate': args.lr
        }

        if _has_wandb and not _torch_tpu_available and args.wandb_logging:
            wandb.init(project="WNUT-Task-2", config=log_args)
            wandb_logger = WandbLogger()

        checkpoint_callback = ModelCheckpoint(filepath=args.model_save_path,
                                              save_top_k=1,
                                              verbose=True,
                                              monitor='val_loss',
                                              mode='min')
        earlystop = EarlyStopping(monitor='val_loss',
                                  patience=3,
                                  verbose=False,
                                  mode='min')

        if args.use_gpu and _torch_gpu_available:
            print("using GPU")
            if args.wandb_logging:
                if _has_apex:
                    trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, logger=wandb_logger, precision=16, \
                                checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
                else:
                    trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, logger=wandb_logger, \
                                checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                if _has_apex:
                    trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, precision=16, \
                                checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
                else:
                    trainer = Trainer(gpus=args.num_gpus, max_epochs=args.epochs, \
                                checkpoint_callback=checkpoint_callback, callbacks=[earlystop])

        elif args.use_TPU and _torch_tpu_available:
            print("using TPU")
            if _has_apex:
                trainer = Trainer(num_tpu_cores=args.num_tpus, max_epochs=args.epochs, precision=16, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                trainer = Trainer(num_tpu_cores=args.num_tpus, max_epochs=args.epochs, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])

        else:
            print("using CPU")
            if args.wandb_logging:
                if _has_apex:
                    trainer = Trainer(max_epochs=args.epochs, logger=wandb_logger, precision=16, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
                else:
                    trainer = Trainer(max_epochs=args.epochs, logger=wandb_logger, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
            else:
                if _has_apex:
                    trainer = Trainer(max_epochs=args.epochs, precision=16, \
                            checkpoint_callback=checkpoint_callback, callbacks=[earlystop])
                else:
                    trainer = Trainer(max_epochs=args.epochs,
                                      checkpoint_callback=checkpoint_callback,
                                      callbacks=[earlystop])

        num_train_steps = int(len(train_data_loader) * args.epochs)

        pltrainer = PLTrainer(num_train_steps,
                              model,
                              args.metric,
                              args.loss_function,
                              args.lr,
                              seed=42)

        #try:
        #    print ("Loaded model from previous checkpoint")
        #    pltrainer = PLTrainer.load_from_checkpoint(args.model_save_path)
        #except:
        #    pass

        trainer.fit(pltrainer, train_data_loader, val_data_loader)

        test_output = (pltrainer, val_data_loader)

    idx2label = {value: key for key, value in label2idx.items()}

    test_output = [idx2label[i] for i in test_output]

    return test_output
import tokenizers

DEVICE = 'cpu'  # or 'cuda'
MAX_LEN = 128
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 6
BERT_PATH = "../pretrained_models/roberta-base/"
MODEL_PATH = "pytorch_model.bin"
TRAINING_FILE = "../data/train.csv"
TEST_FILE = "../data/test.csv"
TOKENIZER = tokenizers.ByteLevelBPETokenizer(
    vocab_file="../pretrained_models/roberta-base/vocab.json",
    merges_file="../pretrained_models/roberta-base/merges.txt",
    lowercase=True,
    add_prefix_space=True)
Example 26
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False  # keep cuDNN deterministic; benchmark=True trades reproducibility for speed


seed = 42
seed_everything(seed)
data_save = {}
# the tokenizer is needed again elsewhere; it could probably be cached, but this has not been tried
bptokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file='roberta/vocab.json',
    merges_file='roberta/merges.txt',
    lowercase=True,
    add_prefix_space=True)


# fw = open("testfin.txt", "w", encoding='utf-8')
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, df, max_len=96):
        self.df = df
        self.max_len = max_len
        self.labeled = 'selected_text' in df
        self.tokenizer = tokenizers.ByteLevelBPETokenizer(
            vocab_file='roberta/vocab.json',
            merges_file='roberta/merges.txt',
            lowercase=True,
            add_prefix_space=True)
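The comment above the module-level bptokenizer asks whether the tokenizer can be cached; since the dataset builds an identical ByteLevelBPETokenizer, one option is simply to reuse that module-level instance. A hedged sketch:

# Sketch: reuse the module-level tokenizer instead of rebuilding an identical
# one per dataset (behaviour is unchanged as long as the files and options match).
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, df, max_len=96):
        self.df = df
        self.max_len = max_len
        self.labeled = 'selected_text' in df
        self.tokenizer = bptokenizer  # the instance created at module level above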
Example 27
# Reading the data from the csv files
train_data = pd.read_csv(training_path)
test_data = pd.read_csv(test_path)

sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}

# Preprocessing the data
import os

TOKENIZE_PATH = './RoBERTA Files/'
MAX_LEN = 100

# Initializing the tokenizer
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=TOKENIZE_PATH + 'vocab.json',
    merges_file=TOKENIZE_PATH + 'merges.txt',
    lowercase=True,
    add_prefix_space=True)

train_data = train_data.dropna(axis=0)
# The model expects tokenized input, so the text must be converted to token ids.
# We use the pretrained RoBERTa byte-level BPE tokenizer from Hugging Face.

# Assume the maximum length of a tweet is 100 tokens
MAX_LEN = 100
instances = train_data.shape[0]

# Initializing the tokenization arrays
input_ids = np.ones((instances, MAX_LEN), dtype='int32')
attention_mask = np.zeros((instances, MAX_LEN), dtype='int32')
token_type_ids = np.zeros((instances, MAX_LEN), dtype='int32')
Example 28
    import gc
    gc.collect()
    torch.cuda.empty_cache()

# max_len = 160
# train_batch_size = 16
# valid_batch_size = 8
# epochs = 3

# roberta_path = "./roberta-base"
roberta_path = "./roberta-base"
training_file = "./train-kfolds/train_5folds.csv"

tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file = "./roberta-base/vocab.json",
    merges_file = "./roberta-base/merges.txt",
    lowercase = True,
    add_prefix_space = True
)

# train(0, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path)
# train(1, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path)
# train(2, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path)
# train(3, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path)
# train(4, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path)
print("fold: ", args.fold)
train(args.fold, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path, args.lr, args.patience, args.num_warmup_steps)
#train(1, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path, args.lr, args.patience, args.num_warmup_steps)
#train(2, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path, args.lr, args.patience, args.num_warmup_steps)
#train(3, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path, args.lr, args.patience, args.num_warmup_steps)
#train(4, epochs, training_file, tokenizer, max_len, train_batch_size, valid_batch_size, roberta_path, args.lr, args.patience, args.num_warmup_steps)
Example 29
import os
import tokenizers

MAX_LEN = 192
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
EPOCHS = 10
SEED = 43
ROBERTA_PATH = '/home/mikhail/workspace/roberta-base'
MODEL_PATH = 'model.bin'
TRAINING_FILE_WITHOUT_FOLDS = '../input/train.csv'
TRAINING_FILE = '../input/train_folds.csv'
VALID_FILE = '../input/valid.csv'
TOKENIZER = tokenizers.ByteLevelBPETokenizer(
    vocab_file=f"{ROBERTA_PATH}/vocab.json", 
    merges_file=f"{ROBERTA_PATH}/merges.txt", 
    lowercase=True,
    add_prefix_space=True
)
Example 30
    print('model_path: %s' % model_path)
    """ model config 
    """

    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)
    sub_df = pd.read_csv(sub_file)
    train_df.dropna(inplace=True)
    train_df = train_df.reset_index(drop=True)
    print(train_df.shape, test_df.shape, sub_df.shape)
    """ load data
    """

    tokenizer = tokenizers.ByteLevelBPETokenizer(
        vocab_file=os.path.join(roberta_path, 'vocab.json'),
        merges_file=os.path.join(roberta_path, 'merges.txt'),
        lowercase=True,
        add_prefix_space=True)
    roberta_config = transformers.RobertaConfig.from_pretrained(roberta_path)
    roberta_config.output_hidden_states = True
    sentiment_d = {'positive': 1313, 'negative': 2430, 'neutral': 7974}
    """ roberta config
    """
    """ training
    """
    n_splits = 5
    max_epochs = 5
    initial_lr = 3e-5
    is_gpu = torch.cuda.is_available()
    device = torch.device('cuda' if is_gpu else 'cpu')