Example #1
async def setup_learner():
    await download_file(pretrained_link, path / modelname)
    #await download_file(vocablink, path / vocab),
    #await download_file(sptokenlink, path / sptoken),
    #await download_file(tokenlink, path / token),
    #await download_file(configlink, path / config),
    #await download_file(l2link, path / l2)
    try:
        data_bunch = BertDataBunch(path,
                                   path,
                                   tokenizer=path,
                                   train_file=None,
                                   val_file=None,
                                   label_file='l2.csv',
                                   batch_size_per_gpu=120,
                                   max_seq_length=40,
                                   multi_gpu=False,
                                   multi_label=False,
                                   model_type='bert')

        learner = BertLearner.from_pretrained_model(data_bunch,
                                                    pretrained_path=path,
                                                    metrics=[],
                                                    device='cpu',
                                                    logger=None,
                                                    output_dir=None,
                                                    is_fp16=False)
        return learner
    except RuntimeError as e:
        if len(e.args) > 0 and 'CPU-only machine' in e.args[0]:
            print(e)
            message = "\n\nThis model was trained with an old version of fastai and will not work in a CPU environment.\n\nPlease update the fastai library in your training environment and export your model again.\n\nSee instructions for 'Returning to work' at https://course.fast.ai."
            raise RuntimeError(message)
        else:
            raise
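
setup_learner is a coroutine, so it has to be driven by an event loop. A minimal sketch of invoking it and querying the returned learner (the sample text is a placeholder):

import asyncio

learner = asyncio.run(setup_learner())
predictions = learner.predict_batch(["some input text"])
print(predictions)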

def train(path_to_directory, model):
    DATA_PATH = BASE / path_to_directory
    OUTPUT_DIR = DATA_PATH / 'output' / model
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    if (model == "biobert"):
        tokenizer = BertTokenizer.from_pretrained(BIOBERT_PATH,
                                                  do_lower_case=True)
        pretrained_path = BIOBERT_PATH
    elif (model == "bert"):
        tokenizer = "bert-base-uncased"
        pretrained_path = "bert-base-uncased"
    else:
        print("Model parameter must be either 'bert' or 'biobert'")
        return

    databunch = BertDataBunch(DATA_PATH,
                              LABEL_PATH,
                              tokenizer=tokenizer,
                              train_file='train.csv',
                              val_file='val.csv',
                              text_col='text',
                              label_file='labels.csv',
                              label_col=labels,
                              batch_size_per_gpu=10,
                              max_seq_length=512,
                              multi_gpu=multi_gpu,
                              multi_label=True,
                              model_type='bert',
                              clear_cache=True)

    learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path=pretrained_path,
        metrics=metrics,
        device=device_cuda,
        logger=logger,
        output_dir=OUTPUT_DIR,
        finetuned_wgts_path=None,
        warmup_steps=500,
        multi_gpu=multi_gpu,
        is_fp16=True,
        multi_label=True,
        logging_steps=20)

    if path_to_directory.split('/', 1)[1] in ['original', 'synthetic']:
        epochs = 20
    else:
        epochs = 10

    learner.fit(
        epochs=epochs,
        lr=6e-5,
        validate=True,  # Evaluate the model after each epoch
        schedule_type="warmup_cosine")

    learner.save_model()

    return
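
A hypothetical call site, assuming the module-level names the function relies on (BASE, LABEL_PATH, labels, metrics, logger, device_cuda, multi_gpu) are defined elsewhere:

# Hypothetical usage; 'experiments/original' selects the 20-epoch schedule
# because the segment after the first '/' is 'original'.
train('experiments/original', 'biobert')
train('experiments/baseline', 'bert')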
Example #3
    def __init__(self):
        databunch = BertDataBunch('train',
                                  'train',
                                  tokenizer='distilbert-base-uncased',
                                  train_file='train.csv',
                                  val_file='val.csv',
                                  label_file='labels.csv',
                                  text_col='text',
                                  label_col='label',
                                  batch_size_per_gpu=8,
                                  max_seq_length=512,
                                  multi_gpu=False,
                                  multi_label=False,
                                  model_type='distilbert')

        device_cuda = torch.device("cuda")
        metrics = [{'name': 'accuracy', 'function': accuracy}]
        logger = logging.getLogger()

        self.learner = BertLearner.from_pretrained_model(databunch,
                                                         pretrained_path='distilbert-base-uncased',
                                                         metrics=metrics,
                                                         device=device_cuda,
                                                         output_dir='models',
                                                         warmup_steps=100,
                                                         logger=logger,
                                                         multi_gpu=False,
                                                         is_fp16=False,  # install apex to use fp16 training
                                                         multi_label=False,
                                                         logging_steps=0)
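
The constructor only builds the learner; a companion inference method is a natural next step. A minimal sketch (the method name and return shape are assumptions, not part of the original):

    def predict(self, texts):
        # predict_batch returns, per input text, a list of
        # (label, confidence) pairs sorted by confidence;
        # keep only the top label for this single-label model.
        predictions = self.learner.predict_batch(texts)
        return [preds[0][0] for preds in predictions]
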
    def train(self):
        databunch = BertDataBunch(
            self._args.data_dir,
            self._args.data_dir,
            tokenizer=self._model_path,
            train_file=self._args.train_file,
            val_file=self._args.eval_file,
            label_file=self._args.labels_file,
            text_col=self._args.text_col,
            label_col=self._args.label_col,
            batch_size_per_gpu=self._args.batch_size_per_gpu,
            max_seq_length=self._args.max_seq_length,
            multi_gpu=True,
            multi_label=self._args.multi_label,
            model_type='bert')
        device = torch.device('cuda')
        learner = BertLearner.from_pretrained_model(
            databunch,
            self._model_path,
            metrics=self.metrics(),
            device=device,
            logger=log,
            output_dir=self._output_dir,
            finetuned_wgts_path=None,
            warmup_steps=5,
            multi_gpu=True,
            is_fp16=self._args.fp16,
            multi_label=self._args.multi_label,
            logging_steps=0)
        learner.fit(self._args.num_train_epochs,
                    self._args.learning_rate,
                    validate=True)
        learner.validate()
def train(args):
	if args.is_onepanel:
		args.out_dir = os.path.join("/onepanel/output/",args.out_dir)
	if not os.path.exists(args.out_dir):
		os.mkdir(args.out_dir)

	logger = logging.getLogger()
	labels = ["anger", "anticipation","disgust","fear","joy","love","optimism","pessimism","sadness","surprise","trust","neutral"]
	databunch = BertDataBunch(".", ".",
								tokenizer=args.pretrained_model,
								train_file='nlp_train.csv',
								label_file='labels.csv',
								val_file="nlp_valid.csv",
								text_col='text',
								label_col=labels,
								batch_size_per_gpu=args.batch_size,
								max_seq_length=512,
								multi_gpu=False,
								multi_label=True,
								model_type='bert')

	device_cuda = torch.device("cuda")
	metrics = [{'name': 'accuracy', 'function': accuracy}]

	learner = BertLearner.from_pretrained_model(
							databunch,
							pretrained_path=args.pretrained_model,
							metrics=metrics,
							device=device_cuda,
							logger=logger,
							output_dir=args.out_dir,
							finetuned_wgts_path=None,
							warmup_steps=200,
							multi_gpu=False,
							is_fp16=False,
							multi_label=True,
							logging_steps=10)

	learner.fit(epochs=args.epochs,
				lr=2e-3,
				schedule_type="warmup_cosine_hard_restarts",
				optimizer_type="lamb")
				# validate=True)
	learner.save_model()
Example #6
                          model_type='bert')

device_cuda = torch.device('cuda')

metrics = []
metrics.append({'name': 'accuracy', 'function': accuracy})
metrics.append({'name': 'F1_macro', 'function': F1_macro})
metrics.append({'name': 'F1_micro', 'function': F1_micro})

learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='bert-base-german-dbmdz-cased',
    metrics=metrics,
    device=device_cuda,
    logger=logger,
    output_dir='./output/',
    finetuned_wgts_path=None,
    warmup_steps=500,
    multi_gpu=False,
    is_fp16=True,
    multi_label=False,
    logging_steps=500)

learner.fit(
    epochs=3,
    lr=6e-4,
    validate=True,  # Evaluate the model after each epoch
    schedule_type="warmup_cosine",
    optimizer_type="lamb")

learner.validate()
Example #7
                          multi_label=False,
                          model_type='bert')

# Choose the metrics used for the error function in training
metrics = []
metrics.append({'name': 'accuracy', 'function': accuracy})

import logging

logger = logging.getLogger()
OUTPUT_DIR = "/content/drive/My Drive/Colab Notebooks/output"

# The learner contains the logic for training loop, validation loop,
# optimiser strategies and key metrics calculation
learner = BertLearner.from_pretrained_model(databunch,
                                            bert_model,
                                            metrics=metrics,
                                            device=device,
                                            logger=logger,
                                            output_dir=OUTPUT_DIR,
                                            finetuned_wgts_path=None,
                                            is_fp16=args['fp16'],
                                            loss_scale=args['loss_scale'],
                                            multi_gpu=multi_gpu,
                                            multi_label=False)

# Train the model
learner.fit(6, lr=args['learning_rate'], schedule_type="warmup_cosine")

# Save the model into a file
learner.save_and_reload(MODEL_PATH, "trained_model_name")
Example #8
    train_file='train.csv',
    val_file='valid.csv',  # val.csv
    label_file='labels.csv',
    text_col='content',
    label_col=label_cols,
    batch_size_per_gpu=2,
    max_seq_length=512,
    multi_gpu=True,
    multi_label=True,
    model_type='bert')

from fast_bert.metrics import accuracy_multilabel
from fast_bert.learner_cls import BertLearner
metrics = [{'name': 'accuracy', 'function': accuracy_multilabel}]
learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='bert-base-uncased',
    metrics=metrics,
    device=device_cuda,
    logger=logger,
    output_dir=r"./output_dir",
    is_fp16=True,
    multi_gpu=True,
    multi_label=True)

learner.fit(6,
            lr=6e-5,
            validate=True,
            schedule_type="warmup_linear",
            optimizer_type="lamb")
learner.save_model()
Example #9
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="Path specifying the location of the dataset")

    parser.add_argument(
        "--label_dir",
        default=None,
        type=str,
        required=True,
        help="Path specifying the location of the labels.csv file")

    parser.add_argument(
        "--output_dir",
        default=None,
        required=True,
        type=str,
        help="Path specifying the location to save the results")

    parser.add_argument("--text_col",
                        default=None,
                        required=True,
                        type=str,
                        help="The column name of the text")

    parser.add_argument("--batch_size",
                        default=16,
                        required=False,
                        type=int,
                        help="Batch size per GPU")

    parser.add_argument(
        "--max_seq_len",
        default=320,
        required=False,
        type=int,
        help="Maximum length of the token sequence to input to BERT")

    parser.add_argument("--multi_gpu",
                        default=False,
                        required=False,
                        type=bool,
                        help="Whether to use multi-gpu for training")

    parser.add_argument("--epochs",
                        default=6,
                        type=int,
                        required=False,
                        help="Number of epochs to train")

    parser.add_argument("--lr",
                        default=6e-5,
                        type=float,
                        required=False,
                        help="Initial learning rate for training")

    parser.add_argument("--save_model",
                        required=False,
                        default=None,
                        help="Whether to save the model or not")

    parser.add_argument("--eval",
                        required=False,
                        type=bool,
                        default=True,
                        help="Whether to run evaluation after each epoch")

    args = parser.parse_args()

    DATA_PATH = args.data_dir

    LABEL_PATH = args.label_dir

    OUTPUT_PATH = args.output_dir

    EPOCHS = args.epochs

    LR = args.lr

    EVAL = args.eval

    TEXT_COL = args.text_col

    BATCH_SIZE = args.batch_size

    MAX_SEQ_LEN = args.max_seq_len

    MULTI_GPU = args.multi_gpu

    labels = pd.read_csv(os.path.join(DATA_PATH, 'labels.csv'),
                         header=None).values

    LABEL_LIST = [val[0] for val in labels]

    databunch = BertDataBunch(DATA_PATH,
                              LABEL_PATH,
                              tokenizer='bert-base-uncased',
                              train_file='m_aspect_train.csv',
                              val_file='m_aspect_test.csv',
                              label_file='labels.csv',
                              text_col=TEXT_COL,
                              label_col=LABEL_LIST,
                              batch_size_per_gpu=BATCH_SIZE,
                              max_seq_length=MAX_SEQ_LEN,
                              multi_gpu=MULTI_GPU,
                              multi_label=True,
                              model_type='bert',
                              no_cache=True)

    # display(databunch.get_dl_from_texts)

    device_cuda = torch.device("cuda")
    metrics = [{'name': 'accuracy', 'function': accuracy_multilabel}]

    learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path='bert-base-uncased',
        metrics=metrics,
        device=device_cuda,
        logger=None,
        output_dir=OUTPUT_PATH,
        finetuned_wgts_path=None,
        warmup_steps=500,
        multi_gpu=MULTI_GPU,
        is_fp16=False,
        multi_label=True,
        logging_steps=50)

    global_step, loss = learner.fit(
        epochs=EPOCHS,
        lr=LR,
        validate=EVAL,  # Evaluate the model after each epoch
        schedule_type="warmup_cosine",
        optimizer_type="lamb")

    print("global_Step:", global_step, "loss:", loss)

    if args.save_model:
        learner.save_model()
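
The snippet defines main() but never invokes it; a standard entry-point guard completes it (the command line in the comment is illustrative):

if __name__ == "__main__":
    # e.g. python train.py --data_dir ./data --label_dir ./data \
    #                      --output_dir ./output --text_col text
    main()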
Example #10
                          batch_size_per_gpu=64,
                          max_seq_length=50,
                          multi_gpu=False,
                          multi_label=False,
                          model_type='roberta')

logger = logging.getLogger()
device_cuda = torch.device("cuda")
metrics = [{'name': 'accuracy', 'function': accuracy}]

learner = BertLearner.from_pretrained_model(databunch,
                                            pretrained_path='roberta-base',
                                            metrics=metrics,
                                            device=device_cuda,
                                            logger=logger,
                                            output_dir=OUTPUT_DIR,
                                            finetuned_wgts_path=None,
                                            warmup_steps=500,
                                            multi_gpu=False,
                                            is_fp16=False,
                                            multi_label=False,
                                            logging_steps=4000)

#learner.lr_find(start_lr=1e-5,optimizer_type='lamb')

learner.fit(
    epochs=3,
    lr=6e-5,
    validate=True,  # Evaluate the model after each epoch
    schedule_type="warmup_cosine",
    optimizer_type="lamb")
                          max_seq_length=15,
                          multi_gpu=False,
                          multi_label=False,
                          model_type='bert')

OUTPUT_DIR = '../../bert_model/'
logger = logging.getLogger()
device_cuda = torch.device("cuda")
metrics = [{'name': 'accuracy', 'function': accuracy}]

learner = BertLearner.from_pretrained_model(databunch,
                                            pretrained_path=PRETRAINED_PATH,
                                            metrics=metrics,
                                            device=device_cuda,
                                            logger=logger,
                                            output_dir=OUTPUT_DIR,
                                            finetuned_wgts_path=None,
                                            warmup_steps=10000,
                                            multi_gpu=False,
                                            is_fp16=True,
                                            multi_label=False,
                                            logging_steps=0)

for i in range(3):
    try:
        learner.fit(
            epochs=1,
            lr=3e-4,
            validate=True,  # Evaluate the model after each epoch
            schedule_type="warmup_cosine",
            optimizer_type="lamb")
Example #12
import torch
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy
import logging

logger = logging.getLogger()
device_cuda = torch.device("cuda")
metrics = [{'name': 'accuracy', 'function': accuracy}]
OUTPUT_DIR = '/hdd/user4/xlnet_classfication3/output'
learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='xlnet-base-cased',
    metrics=metrics,
    device=device_cuda,
    logger=logger,
    output_dir=OUTPUT_DIR,
    finetuned_wgts_path=None,
    # finetuned_wgts_path = '/hdd/user4/xlnet_classification3/output/model_out/pytorch_model.bin',
    warmup_steps=500,
    multi_gpu=False,
    is_fp16=False,
    multi_label=False,
    logging_steps=250)

learner.fit(
    epochs=10000,
    lr=6e-4,  #default = 6e-5
    validate=True,  # Evaluate the model after each epoch
    schedule_type="warmup_cosine",
    optimizer_type="lamb")

# learner.save_model()
Example #13
    multi_gpu=multi_gpu,
    multi_label=False,
    model_type='bert')

metrics = []
metrics.append({'name': 'accuracy', 'function': accuracy})
metrics.append({'name': 'roc_auc', 'function': roc_auc_2})
metrics.append({'name': 'fbeta', 'function': fbeta_2})

learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='bert-base-uncased',
    metrics=metrics,
    device=device,
    logger=logger,
    output_dir=OUTPUT_PATH,
    finetuned_wgts_path=None,
    warmup_steps=500,
    multi_gpu=multi_gpu,
    is_fp16=False,
    multi_label=False,
    logging_steps=-1)

learner.fit(
    epochs=4,
    lr=6e-5,
    validate=True,  # Evaluate the model after each epoch
    schedule_type="warmup_cosine",
    optimizer_type="lamb")

learner.save_model()
Example #14
metrics.append({'name': 'roc_auc', 'function': roc_auc})
# metrics.append({'name': 'roc_auc_save_to_plot', 'function': roc_auc_save_to_plot})
metrics.append({'name': 'fbeta', 'function': fbeta})
metrics.append({'name': 'accuracy', 'function': accuracy})
metrics.append({
    'name': 'accuracy_multilabel',
    'function': accuracy_multilabel
})

learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='/scratch/da2734/twitter/mturk_mar6/output_100/model_out/',
    metrics=metrics,
    device=device,
    logger=logger,
    output_dir=args.output_dir,
    finetuned_wgts_path=FINETUNED_PATH,
    warmup_steps=args.warmup_steps,
    multi_gpu=args.multi_gpu,
    is_fp16=args.fp16,
    multi_label=True,
    logging_steps=0)

print('time taken to load all this stuff:', str(time.time() - start_time),
      'seconds')

import time
import pyarrow.parquet as pq
from glob import glob
    label_col="label",
    batch_size_per_gpu=32,
    max_seq_length=128,
    multi_gpu=False,
    multi_label=False,
    model_type="bert",
)

learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path=pretrained_path,
    metrics=metrics,
    device=device_cuda,
    logger=logger,
    output_dir=out_path,
    finetuned_wgts_path=None,
    warmup_steps=200,
    multi_gpu=False,
    is_fp16=True,
    fp16_opt_level="O2",
    multi_label=False,
    logging_steps=100,
)

learner.fit(
    epochs=epochs,
    lr=lr,
    validate=True,
    schedule_type="warmup_cosine",
    optimizer_type="lamb",
)
Example #16
def train_bert(experiment_parameters, args):

    # logging
    run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')
    logfile = str(experiment_parameters.LOG_PATH/'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        handlers=[
            logging.FileHandler(logfile),
            logging.StreamHandler(sys.stdout)
        ])

    logger = logging.getLogger()

    # cuda
    device = torch.device('cuda')
    if torch.cuda.device_count() > 1:
        args.multi_gpu = True
    else:
        args.multi_gpu = False

    print()
    print('BERT training file:', args['data_dir'], 'train.csv')

    # create a fast-bert-specific data format
    torch.manual_seed(args.seed)
    databunch = BertDataBunch(args['data_dir'],
                              experiment_parameters.LABEL_PATH,
                              experiment_parameters.tokenizer,
                              train_file='train.csv',
                              val_file=None,  # 'test.csv'
                              test_data='test.csv',
                              text_col="comment_text",
                              label_col=experiment_parameters.LABEL_COLS,
                              batch_size_per_gpu=args['train_batch_size'],
                              max_seq_length=args['max_seq_length'],
                              multi_gpu=args.multi_gpu,
                              multi_label=True,
                              model_type=args.model_type,
                              clear_cache=False)

    metrics = []
    metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
    metrics.append({'name': 'roc_auc', 'function': roc_auc})
    metrics.append({'name': 'fbeta', 'function': fbeta})

    # create learner object
    learner = BertLearner.from_pretrained_model(databunch,
                                                args.model_name,
                                                metrics=metrics,
                                                device=device,
                                                logger=logger,
                                                output_dir=args.output_dir,
                                                finetuned_wgts_path=experiment_parameters.FINETUNED_PATH,
                                                warmup_steps=args.warmup_steps,
                                                multi_gpu=args.multi_gpu,
                                                is_fp16=args.fp16,
                                                multi_label=True,
                                                logging_steps=0)

    # train
    torch.manual_seed(args.seed)
    learner.fit(args.num_train_epochs, args.learning_rate, validate=False)

    # save
    learner.save_model()

    # free memory and exit
    del learner
    return
Example #17
                          multi_gpu=True,
                          multi_label=True,
                          backend='nccl',
                          model_type='bert')

metrics = []
metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'fbeta', 'function': fbeta})
device_cuda = torch.device("cuda")
logger = logging.getLogger()

learner = BertLearner.from_pretrained_model(
    dataBunch=databunch,
    pretrained_path='./chinese_roberta_wwm_large_ext_pytorch',
    metrics=metrics,
    device=device_cuda,
    logger=logger,
    output_dir='./Data/loan/data/model/keda',
    multi_label=True)

learner.fit(
    epochs=6,
    lr=3e-5,
    validate=False,  # Evaluate the model after each epoch
    schedule_type="warmup_cosine",
    optimizer_type="lamb")

learner.save_model()

text_list = list(pd.read_csv('./Data/loan/new_data/test.csv')['text'].values)
output = learner.predict_batch(text_list)
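
predict_batch returns, for each input text, a list of (label, probability) pairs; a short sketch of post-processing the multi-label output (the 0.5 cutoff is an arbitrary choice, not from the original):

# Keep the labels whose probability clears a threshold.
for text, preds in zip(text_list, output):
    kept = [label for label, prob in preds if prob > 0.5]
    print(text[:50], kept)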
Example #18
def create_model(column, epoch):
    if not os.path.exists(
            '/scratch/da2734/twitter/jobs/running_on_200Msamples/logs/log_binary_pos_neg_{}/'
            .format(column)):
        os.makedirs(
            '/scratch/da2734/twitter/jobs/running_on_200Msamples/logs/log_binary_pos_neg_{}/'
            .format(column))

    LOG_PATH = Path(
        '/scratch/da2734/twitter/jobs/running_on_200Msamples/logs/log_binary_pos_neg_{}/'
        .format(column))
    print('LOG_PATH', LOG_PATH)
    DATA_PATH = Path(
        '/scratch/da2734/twitter/data/may20_9Klabels/data_binary_pos_neg_balanced/'
    )
    LABEL_PATH = Path(
        '/scratch/da2734/twitter/data/may20_9Klabels/data_binary_pos_neg_balanced/'
    )
    OUTPUT_PATH = Path(
        '/scratch/da2734/twitter/jobs/training_binary/models_may20_9Klabels/output_{}'
        .format(column))
    FINETUNED_PATH = None

    args = Box({
        "run_text": "100Msamples",
        "train_size": -1,
        "val_size": -1,
        "log_path": LOG_PATH,
        "full_data_dir": DATA_PATH,
        "data_dir": DATA_PATH,
        "task_name": "labor_market_classification",
        "no_cuda": False,
        #     "bert_model": BERT_PRETRAINED_PATH,
        "output_dir": OUTPUT_PATH,
        "max_seq_length": 512,
        "do_train": True,
        "do_eval": True,
        "do_lower_case": True,
        "train_batch_size": 8,
        "eval_batch_size": 16,
        "learning_rate": 5e-5,
        "num_train_epochs": 100,
        "warmup_proportion": 0.0,
        "no_cuda": False,
        "local_rank": -1,
        "seed": 42,
        "gradient_accumulation_steps": 1,
        "optimize_on_cpu": False,
        "fp16": False,
        "fp16_opt_level": "O1",
        "weight_decay": 0.0,
        "adam_epsilon": 1e-8,
        "max_grad_norm": 1.0,
        "max_steps": -1,
        "warmup_steps": 500,
        "logging_steps": 50,
        "eval_all_checkpoints": True,
        "overwrite_output_dir": True,
        "overwrite_cache": True,
        "seed": 42,
        "loss_scale": 128,
        "task_name": 'intent',
        "model_name": 'bert-base-uncased',
        "model_type": 'bert'
    })

    import logging

    run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')
    logfile = str(LOG_PATH /
                  'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        handlers=[
            logging.FileHandler(logfile),
            logging.StreamHandler(sys.stdout)
        ])

    logger = logging.getLogger()

    logger.info(args)

    device = torch.device('cuda')
    if torch.cuda.device_count() > 1:
        args.multi_gpu = True
    else:
        args.multi_gpu = False

    label_cols = ['class']

    databunch = BertDataBunch(
        args['data_dir'],
        LABEL_PATH,
        args.model_name,
        train_file='train_{}.csv'.format(column),
        val_file='val_{}.csv'.format(column),
        label_file='label_{}.csv'.format(column),
        # test_data='test.csv',
        text_col="text",  # name of the column in the train file that contains the tweet text
        label_col=label_cols,
        batch_size_per_gpu=args['train_batch_size'],
        max_seq_length=args['max_seq_length'],
        multi_gpu=args.multi_gpu,
        multi_label=False,
        model_type=args.model_type)

    num_labels = len(databunch.labels)
    print('num_labels', num_labels)

    print('time taken to load all this stuff:', str(time.time() - start_time),
          'seconds')

    # metrics defined: https://github.com/kaushaltrivedi/fast-bert/blob/d89e2aa01d948d6d3cdea7ad106bf5792fea7dfa/fast_bert/metrics.py
    metrics = []
    # metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
    # metrics.append({'name': 'roc_auc', 'function': roc_auc})
    # metrics.append({'name': 'fbeta', 'function': fbeta})
    metrics.append({'name': 'accuracy', 'function': accuracy})
    metrics.append({
        'name': 'roc_auc_save_to_plot_binary',
        'function': roc_auc_save_to_plot_binary
    })
    # metrics.append({'name': 'accuracy_multilabel', 'function': accuracy_multilabel})

    learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path=
        '/scratch/da2734/twitter/jobs/training_binary/models_may20_9Klabels/output_{}/model_out_{}/'
        .format(column, epoch),
        metrics=metrics,
        device=device,
        logger=logger,
        output_dir=args.output_dir,
        finetuned_wgts_path=FINETUNED_PATH,
        warmup_steps=args.warmup_steps,
        multi_gpu=args.multi_gpu,
        is_fp16=args.fp16,
        multi_label=False,
        logging_steps=0)

    return learner
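
A hypothetical call, loading the checkpoint for one label column and epoch and running inference (the column name and sample text are placeholders, and the module-level start_time the function references is assumed to exist):

learner = create_model('class', 5)
predictions = learner.predict_batch(['sample tweet text'])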
Example #19
        "is_relevant",
    ],
    batch_size_per_gpu=1,
    max_seq_length=2,
    multi_gpu=False,
    multi_label=True,
    model_type="bert",
)

learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path="bert-base-uncased",
    metrics=metrics,
    device=device_cuda,
    logger=logger,
    output_dir=OUTPUT_DIR,
    finetuned_wgts_path=None,
    multi_gpu=False,
    is_fp16=False,
    multi_label=True,
    logging_steps=50,
)

learner.fit(
    epochs=6,
    lr=6e-5,
    # validate=True,  # Evaluate the model after each epoch
    # schedule_type="warmup_cosine",
    optimizer_type="adamw",  # fast-bert expects the lowercase name
)
learner.save_model()
                          batch_size_per_gpu=args['train_batch_size'],
                          max_seq_length=args['max_seq_length'],
                          multi_gpu=args.multi_gpu,
                          multi_label=True,
                          model_type=args.model_type)

print(databunch.train_dl.dataset[0][3])
num_labels = len(databunch.labels)
print(num_labels)

metrics = []
metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'fbeta', 'function': fbeta})

learner = BertLearner.from_pretrained_model(databunch,
                                            args.model_name,
                                            metrics=metrics,
                                            device=device,
                                            logger=logger,
                                            output_dir=args.output_dir,
                                            finetuned_wgts_path=FINETUNED_PATH,
                                            warmup_steps=args.warmup_steps,
                                            multi_gpu=args.multi_gpu,
                                            is_fp16=args.fp16,
                                            multi_label=True,
                                            logging_steps=0)

learner.validate()
learner.save_model()
def train_fast_bert():

    MAX_LEN = 512  # previous model was 300

    text_col = 'script'
    label_col = [
        'Action', 'Adventure', 'Comedy', 'Crime', 'Drama', 'Fantasy', 'Horror',
        'Mystery', 'Romance', 'Sci-Fi', 'Thriller'
    ]
    DATA_PATH = Path('./data/')
    LABEL_PATH = DATA_PATH

    train_file = 'fast_train_' + str(MAX_LEN) + '.csv'
    val_file = 'fast_val_' + str(MAX_LEN) + '.csv'

    goodtogo = check_fastBert_data(MAX_LEN)
    if not goodtogo:
        raise SystemExit("fast_bert data files are missing")

    MODEL_NAME = 'bert-base-uncased'

    databunch = BertDataBunch(DATA_PATH,
                              LABEL_PATH,
                              tokenizer=MODEL_NAME,
                              train_file=train_file,
                              val_file=val_file,
                              label_file='fast_labels.csv',
                              text_col=text_col,
                              label_col=label_col,
                              batch_size_per_gpu=16,
                              max_seq_length=MAX_LEN,
                              multi_gpu=False,
                              multi_label=True,
                              model_type='bert')

    # **NOTE** remember to change `usePretrained` to True if we already have a fine-tuned model

    def my_accuracy_thresh(
        y_pred: Tensor,
        y_true: Tensor,
        thresh: float = 0.7,
        sigmoid: bool = False,
    ):
        "Compute accuracy when `y_pred` and `y_true` are the same size."
        if sigmoid:
            y_pred = y_pred.sigmoid()
        return ((y_pred > thresh) == y_true.bool()).float().mean().item()

    logging.basicConfig(level=logging.NOTSET)
    logger = logging.getLogger()
    device_cuda = torch.device("cuda")
    metrics = [{'name': 'accuracy_thresh', 'function': my_accuracy_thresh}]

    OUTPUTDIR = Path('./models/')

    MODEL_PATH = OUTPUTDIR / 'model_out_bert_cased'

    usePretrained = False
    if usePretrained:
        pretrained_path = MODEL_PATH
    else:
        pretrained_path = 'bert-base-uncased'

    # Setting up apex properly on Colab required downgrading the Torch version (check the first block of the notebook for details)
    learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path=pretrained_path,  # resolves to MODEL_PATH when usePretrained is True
        metrics=metrics,
        device=device_cuda,
        logger=logger,
        output_dir=OUTPUTDIR,
        finetuned_wgts_path=None,
        warmup_steps=500,
        multi_gpu=False,
        is_fp16=False,  # need apex setup properly for this (note above)
        multi_label=True,
        logging_steps=50)

    learner.fit(
        epochs=5,
        lr=6e-4,
        validate=True,  # Evaluate the model after each epoch
        schedule_type="warmup_cosine",
        optimizer_type="lamb")