def train(path_to_directory, model):
    DATA_PATH = BASE / path_to_directory
    OUTPUT_DIR = DATA_PATH / 'output' / model
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    if model == "biobert":
        tokenizer = BertTokenizer.from_pretrained(BIOBERT_PATH, do_lower_case=True)
        pretrained_path = BIOBERT_PATH
    elif model == "bert":
        tokenizer = "bert-base-uncased"
        pretrained_path = "bert-base-uncased"
    else:
        print("Model parameter must be either 'bert' or 'biobert'")
        return

    databunch = BertDataBunch(DATA_PATH, LABEL_PATH,
                              tokenizer=tokenizer,
                              train_file='train.csv',
                              val_file='val.csv',
                              text_col='text',
                              label_file='labels.csv',
                              label_col=labels,
                              batch_size_per_gpu=10,
                              max_seq_length=512,
                              multi_gpu=multi_gpu,
                              multi_label=True,
                              model_type='bert',
                              clear_cache=True)

    learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path=pretrained_path,
        metrics=metrics,
        device=device_cuda,
        logger=logger,
        output_dir=OUTPUT_DIR,
        finetuned_wgts_path=None,
        warmup_steps=500,
        multi_gpu=multi_gpu,
        is_fp16=True,
        multi_label=True,
        logging_steps=20)

    # Train longer on the 'original' and 'synthetic' splits
    if path_to_directory.split('/', 1)[1] in ['original', 'synthetic']:
        epochs = 20
    else:
        epochs = 10

    learner.fit(epochs=epochs,
                lr=6e-5,
                validate=True,  # Evaluate the model after each epoch
                schedule_type="warmup_cosine")
    learner.save_model()
    return
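# After learner.save_model() above, fast-bert writes the fine-tuned weights to
# output_dir/'model_out'. A minimal inference sketch using fast-bert's
# BertClassificationPredictor; the paths below follow the training function
# above and are assumptions, not part of the original snippet.
from fast_bert.prediction import BertClassificationPredictor

MODEL_PATH = OUTPUT_DIR / 'model_out'
predictor = BertClassificationPredictor(
    model_path=str(MODEL_PATH),
    label_path=str(LABEL_PATH),  # directory that contains labels.csv
    multi_label=True,
    model_type='bert')

# Returns a list of (label, probability) pairs for each input text
predictions = predictor.predict_batch(["example text to classify"])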
def __init__(self):
    databunch = BertDataBunch('train', 'train',
                              tokenizer='distilbert-base-uncased',
                              train_file='train.csv',
                              val_file='val.csv',
                              label_file='labels.csv',
                              text_col='text',
                              label_col='label',
                              batch_size_per_gpu=8,
                              max_seq_length=512,
                              multi_gpu=False,
                              multi_label=False,
                              model_type='distilbert')
    device_cuda = torch.device("cuda")
    metrics = [{'name': 'accuracy', 'function': accuracy}]
    logger = logging.getLogger()
    self.learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path='distilbert-base-uncased',
        metrics=metrics,
        device=device_cuda,
        output_dir='models',
        warmup_steps=100,
        logger=logger,
        multi_gpu=False,
        is_fp16=False,  # install apex to use fp16 training
        multi_label=False,
        logging_steps=0)
async def setup_learner():
    await download_file(pretrained_link, path / modelname)
    # await download_file(vocablink, path / vocab)
    # await download_file(sptokenlink, path / sptoken)
    # await download_file(tokenlink, path / token)
    # await download_file(configlink, path / config)
    # await download_file(l2link, path / l2)
    try:
        data_bunch = BertDataBunch(path, path,
                                   tokenizer=path,
                                   train_file=None,
                                   val_file=None,
                                   label_file='l2.csv',
                                   batch_size_per_gpu=120,
                                   max_seq_length=40,
                                   multi_gpu=False,
                                   multi_label=False,
                                   model_type='bert')
        learner = BertLearner.from_pretrained_model(data_bunch,
                                                    pretrained_path=path,
                                                    metrics=[],
                                                    device='cpu',
                                                    logger=None,
                                                    output_dir=None,
                                                    is_fp16=False)
        return learner
    except RuntimeError as e:
        if len(e.args) > 0 and 'CPU-only machine' in e.args[0]:
            print(e)
            message = (
                "\n\nThis model was trained with an old version of fastai "
                "and will not work in a CPU environment.\n\n"
                "Please update the fastai library in your training environment "
                "and export your model again.\n\n"
                "See instructions for 'Returning to work' at https://course.fast.ai.")
            raise RuntimeError(message)
        else:
            raise
def train(args):
    if args.is_onepanel:
        args.out_dir = os.path.join("/onepanel/output/", args.out_dir)
    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)

    logger = logging.getLogger()
    labels = ["anger", "anticipation", "disgust", "fear", "joy", "love",
              "optimism", "pessimism", "sadness", "surprise", "trust", "neutral"]
    databunch = BertDataBunch(".", ".",
                              tokenizer=args.pretrained_model,
                              train_file='nlp_train.csv',
                              label_file='labels.csv',
                              val_file="nlp_valid.csv",
                              text_col='text',
                              label_col=labels,
                              batch_size_per_gpu=args.batch_size,
                              max_seq_length=512,
                              multi_gpu=False,
                              multi_label=True,
                              model_type='bert')
    device_cuda = torch.device("cuda")
    metrics = [{'name': 'accuracy', 'function': accuracy}]
    learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path=args.pretrained_model,
        metrics=metrics,
        device=device_cuda,
        logger=logger,
        output_dir=args.out_dir,
        finetuned_wgts_path=None,
        warmup_steps=200,
        multi_gpu=False,
        is_fp16=False,
        multi_label=True,
        logging_steps=10)
    learner.fit(epochs=args.epochs,
                lr=2e-3,
                # validate=True,
                schedule_type="warmup_cosine_hard_restarts",
                optimizer_type="lamb")
    learner.save_model()
input_ids = torch.tensor(
    tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]

# Set up and train the BERT model
device_cuda = torch.device('cpu')  # despite the name, this snippet runs on CPU
logger.debug('Create Data Bunch')
databunch = BertDataBunch(DATA_PATH, LABEL_PATH,
                          tokenizer='bert-base-uncased',
                          train_file='train.csv',
                          val_file='val.csv',
                          label_file='labels.csv',
                          text_col='text',
                          label_col='label',
                          max_seq_length=512,
                          batch_size_per_gpu=128,
                          multi_gpu=False,
                          multi_label=False,
                          model_type='bert')
logger.debug('Opening Learner')
learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='bert-base-uncased',
    metrics=metrics,
    device=device_cuda,
    logger=logger,
    output_dir=OUTPUT_DIR,
    finetuned_wgts_path=None,
if torch.cuda.device_count() > 1:
    args.multi_gpu = True
else:
    args.multi_gpu = False

# Select the label columns to use
label_cols = (sentence_labels if train_for == 'sentence' else fine_grained_labels)

# Build the databunch
databunch = BertDataBunch(args['data_dir'], LABEL_PATH,
                          tokenizer,
                          train_file='train.csv',
                          val_file='val.csv',
                          text_col='text',
                          label_col=label_cols,
                          batch_size_per_gpu=args['train_batch_size'],
                          max_seq_length=args['max_seq_length'],
                          multi_gpu=args.multi_gpu,
                          multi_label=True,
                          model_type=args.model_type)

# Sanity-check the databunch
print(''.center(31, '*'))
print(databunch.train_dl.dataset[0][3])
print(len(databunch.labels))

# Configure distributed training
# torch.distributed.init_process_group(
#     backend="nccl",
#     init_method="tcp://localhost:23459",
def F1_micro(y_pred: Tensor, y_true: Tensor):
    return F1_macro(y_pred.cpu(), y_true.cpu(), average='micro')

logging.basicConfig(level=logging.NOTSET)
logger = logging.getLogger()

databunch = BertDataBunch('./', './',
                          tokenizer='bert-base-german-dbmdz-cased',
                          train_file='train.csv',
                          val_file='dev.csv',
                          label_file='labels.csv',
                          text_col='text',
                          label_col='label',
                          batch_size_per_gpu=64,
                          max_seq_length=128,
                          multi_gpu=False,
                          multi_label=False,
                          model_type='bert')

device_cuda = torch.device('cuda')
metrics = []
metrics.append({'name': 'accuracy', 'function': accuracy})
metrics.append({'name': 'F1_macro', 'function': F1_macro})
metrics.append({'name': 'F1_micro', 'function': F1_micro})

learner = BertLearner.from_pretrained_model(
logger = logging.getLogger()
device_cuda = torch.device("cuda")

# from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

LABEL_PATH = DATA_PATH = r'./data'
label_cols = ["anger", "fear", "joy", "sadness", "surprise"]
databunch = BertDataBunch(
    DATA_PATH, LABEL_PATH,
    tokenizer='bert-base-uncased',
    train_file='train.csv',
    val_file='valid.csv',  # val.csv
    label_file='labels.csv',
    text_col='content',
    label_col=label_cols,
    batch_size_per_gpu=2,
    max_seq_length=512,
    multi_gpu=True,
    multi_label=True,
    model_type='bert')

from fast_bert.metrics import accuracy_multilabel
from fast_bert.learner_cls import BertLearner

metrics = [{'name': 'accuracy', 'function': accuracy_multilabel}]
learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='bert-base-uncased',
    metrics=metrics,
    device=device_cuda,
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="Path specifying the location of the dataset")
    parser.add_argument("--label_dir", default=None, type=str, required=True,
                        help="Path specifying the location of the labels.csv file")
    parser.add_argument("--output_dir", default=None, required=True, type=str,
                        help="Path specifying the location to save the results")
    parser.add_argument("--text_col", default=None, required=True, type=str,
                        help="The column name of the text")
    parser.add_argument("--batch_size", default=16, required=False, type=int,
                        help="Batch size per GPU")
    parser.add_argument("--max_seq_len", default=320, required=False, type=int,
                        help="Maximum length of the token sequence to input to BERT")
    # Note: argparse's type=bool treats any non-empty string (even "False") as True
    parser.add_argument("--multi_gpu", default=False, required=False, type=bool,
                        help="Whether to use multi-GPU training")
    parser.add_argument("--epochs", default=6, type=int, required=False,
                        help="Number of epochs to train")
    parser.add_argument("--lr", default=6e-5, type=float, required=False,
                        help="Initial learning rate for training")
    parser.add_argument("--save_model", required=False, default=None,
                        help="Whether to save the model or not")
    parser.add_argument("--eval", required=False, type=bool, default=True,
                        help="Whether to run evaluation after each epoch")
    args = parser.parse_args()

    DATA_PATH = args.data_dir
    LABEL_PATH = args.label_dir
    OUTPUT_PATH = args.output_dir
    EPOCHS = args.epochs
    LR = args.lr
    EVAL = args.eval
    TEXT_COL = args.text_col
    BATCH_SIZE = args.batch_size
    MAX_SEQ_LEN = args.max_seq_len
    MULTI_GPU = args.multi_gpu

    labels = pd.read_csv(os.path.join(DATA_PATH, 'labels.csv'), header=None).values
    LABEL_LIST = [val[0] for val in labels]

    databunch = BertDataBunch(DATA_PATH, LABEL_PATH,
                              tokenizer='bert-base-uncased',
                              train_file='m_aspect_train.csv',
                              val_file='m_aspect_test.csv',
                              label_file='labels.csv',
                              text_col=TEXT_COL,
                              label_col=LABEL_LIST,
                              batch_size_per_gpu=BATCH_SIZE,
                              max_seq_length=MAX_SEQ_LEN,
                              multi_gpu=MULTI_GPU,
                              multi_label=True,
                              model_type='bert',
                              no_cache=True)
    # display(databunch.get_dl_from_texts)

    device_cuda = torch.device("cuda")
    metrics = [{'name': 'accuracy', 'function': accuracy_multilabel}]
    learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path='bert-base-uncased',
        metrics=metrics,
        device=device_cuda,
        logger=None,
        output_dir=OUTPUT_PATH,
        finetuned_wgts_path=None,
        warmup_steps=500,
        multi_gpu=MULTI_GPU,
        is_fp16=False,
        multi_label=True,
        logging_steps=50)

    global_step, loss = learner.fit(
        epochs=EPOCHS,
        lr=LR,
        validate=EVAL,  # Evaluate the model after each epoch
        schedule_type="warmup_cosine",
        optimizer_type="lamb")
    print("global_step:", global_step, "loss:", loss)

    if args.save_model:
        learner.save_model()
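# A hypothetical invocation of the script above (the script name is an
# assumption; the flags match the argparse definitions):
#
#   python train_multilabel.py --data_dir ./data --label_dir ./data \
#       --output_dir ./output --text_col text --batch_size 16 \
#       --max_seq_len 320 --epochs 6 --lr 6e-5 --save_model true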
def create_model(column, epoch):
    if not os.path.exists(
            '/scratch/da2734/twitter/jobs/running_on_200Msamples/logs/log_binary_pos_neg_{}/'
            .format(column)):
        os.makedirs(
            '/scratch/da2734/twitter/jobs/running_on_200Msamples/logs/log_binary_pos_neg_{}/'
            .format(column))

    LOG_PATH = Path(
        '/scratch/da2734/twitter/jobs/running_on_200Msamples/logs/log_binary_pos_neg_{}/'
        .format(column))
    print('LOG_PATH', LOG_PATH)

    DATA_PATH = Path(
        '/scratch/da2734/twitter/data/may20_9Klabels/data_binary_pos_neg_balanced/')
    LABEL_PATH = Path(
        '/scratch/da2734/twitter/data/may20_9Klabels/data_binary_pos_neg_balanced/')
    OUTPUT_PATH = Path(
        '/scratch/da2734/twitter/jobs/training_binary/models_may20_9Klabels/output_{}'
        .format(column))
    FINETUNED_PATH = None

    args = Box({
        "run_text": "100Msamples",
        "train_size": -1,
        "val_size": -1,
        "log_path": LOG_PATH,
        "full_data_dir": DATA_PATH,
        "data_dir": DATA_PATH,
        "task_name": 'intent',
        "no_cuda": False,
        # "bert_model": BERT_PRETRAINED_PATH,
        "output_dir": OUTPUT_PATH,
        "max_seq_length": 512,
        "do_train": True,
        "do_eval": True,
        "do_lower_case": True,
        "train_batch_size": 8,
        "eval_batch_size": 16,
        "learning_rate": 5e-5,
        "num_train_epochs": 100,
        "warmup_proportion": 0.0,
        "local_rank": -1,
        "seed": 42,
        "gradient_accumulation_steps": 1,
        "optimize_on_cpu": False,
        "fp16": False,
        "fp16_opt_level": "O1",
        "weight_decay": 0.0,
        "adam_epsilon": 1e-8,
        "max_grad_norm": 1.0,
        "max_steps": -1,
        "warmup_steps": 500,
        "logging_steps": 50,
        "eval_all_checkpoints": True,
        "overwrite_output_dir": True,
        "overwrite_cache": True,
        "loss_scale": 128,
        "model_name": 'bert-base-uncased',
        "model_type": 'bert'
    })

    import logging
    logfile = str(LOG_PATH / 'log-{}-{}.txt'.format(run_start_time, args["run_text"]))
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        handlers=[
            logging.FileHandler(logfile),
            logging.StreamHandler(sys.stdout)
        ])
    logger = logging.getLogger()
    logger.info(args)

    device = torch.device('cuda')
    if torch.cuda.device_count() > 1:
        args.multi_gpu = True
    else:
        args.multi_gpu = False

    label_cols = ['class']

    databunch = BertDataBunch(
        args['data_dir'], LABEL_PATH,
        args.model_name,
        train_file='train_{}.csv'.format(column),
        val_file='val_{}.csv'.format(column),
        label_file='label_{}.csv'.format(column),
        # test_data='test.csv',
        text_col="text",  # name of the column in the train file that contains the tweet text
        label_col=label_cols,
        batch_size_per_gpu=args['train_batch_size'],
        max_seq_length=args['max_seq_length'],
        multi_gpu=args.multi_gpu,
        multi_label=False,
        model_type=args.model_type)

    num_labels = len(databunch.labels)
    print('num_labels', num_labels)
    print('time taken to load all this stuff:', str(time.time() - start_time), 'seconds')

    # metrics defined: https://github.com/kaushaltrivedi/fast-bert/blob/d89e2aa01d948d6d3cdea7ad106bf5792fea7dfa/fast_bert/metrics.py
    metrics = []
    # metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
    # metrics.append({'name': 'roc_auc', 'function': roc_auc})
    # metrics.append({'name': 'fbeta', 'function': fbeta})
    metrics.append({'name': 'accuracy', 'function': accuracy})
    metrics.append({
        'name': 'roc_auc_save_to_plot_binary',
        'function': roc_auc_save_to_plot_binary
    })
    # metrics.append({'name': 'accuracy_multilabel', 'function': accuracy_multilabel})

    learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path=
        '/scratch/da2734/twitter/jobs/training_binary/models_may20_9Klabels/output_{}/model_out_{}/'
        .format(column, epoch),
        metrics=metrics,
        device=device,
        logger=logger,
        output_dir=args.output_dir,
        finetuned_wgts_path=FINETUNED_PATH,
        warmup_steps=args.warmup_steps,
        multi_gpu=args.multi_gpu,
        is_fp16=args.fp16,
        multi_label=False,
        logging_steps=0)
    return learner
from fast_bert.data_cls import (BertDataBunch, InputExample, InputFeatures,
                                MultiLabelTextProcessor, convert_examples_to_features)
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc

label_cols = ['LN' + str(i) for i in range(1, 22)]

tokenizer = BertTokenizer(
    vocab_file='./chinese_roberta_wwm_large_ext_pytorch/vocab.txt')

databunch = BertDataBunch(data_dir='./Data/loan/new_data',
                          label_dir='./Data/loan/new_data',
                          tokenizer=tokenizer,
                          train_file='train.csv',
                          val_file='',
                          label_file='labels.csv',
                          text_col='text',
                          label_col=label_cols,
                          batch_size_per_gpu=4,
                          max_seq_length=512,
                          multi_gpu=True,
                          multi_label=True,
                          backend='nccl',
                          model_type='bert')

metrics = []
metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'fbeta', 'function': fbeta})

device_cuda = torch.device("cuda")
logger = logging.getLogger()

learner = BertLearner.from_pretrained_model(
run_start_time = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
logfile = str("log/log-{}-{}.txt".format(run_start_time, "bert_base_uncased"))

logger = logging.getLogger()
device_cuda = torch.device("cuda")
metrics = [{"name": "Acc", "function": accuracy}]

databunch = BertDataBunch(
    data_path,
    label_path,
    tokenizer=tokenizer,
    train_file="train_combined.csv",
    val_file=val_file,
    label_file="labels.csv",
    text_col="text",
    label_col="label",
    batch_size_per_gpu=32,
    max_seq_length=128,
    multi_gpu=False,
    multi_label=False,
    model_type="bert",
)

learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path=pretrained_path,
    metrics=metrics,
    device=device_cuda,
    logger=logger,
    output_dir=out_path,
import logging

import torch
from transformers import BertTokenizer  # added: needed for the tokenizer below

from fast_bert.data_cls import BertDataBunch
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy

PRETRAINED_PATH = '../../bert_model/model_out/'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_PATH, do_lower_case=False)

databunch = BertDataBunch('../../../dados/', '../../../dados/',
                          tokenizer,
                          train_file='bert_train_pt.csv',
                          val_file='bert_val_pt.csv',
                          label_file='labels.csv',
                          text_col='text',
                          label_col='label',
                          batch_size_per_gpu=768,
                          max_seq_length=15,
                          multi_gpu=False,
                          multi_label=False,
                          model_type='bert')

OUTPUT_DIR = '../../bert_model/'
logger = logging.getLogger()
device_cuda = torch.device("cuda")
metrics = [{'name': 'accuracy', 'function': accuracy}]

learner = BertLearner.from_pretrained_model(databunch,
                                            pretrained_path=PRETRAINED_PATH,
                                            metrics=metrics,
from fast_bert.data_cls import BertDataBunch

DATA_PATH = "data"
LABEL_PATH = "label"

databunch = BertDataBunch(DATA_PATH, LABEL_PATH,
                          tokenizer='bert-base-uncased',
                          train_file='train_sample.csv',
                          val_file='val_sample.csv',
                          label_file='labels.csv',
                          text_col='text',
                          label_col=[
                              'location',
                              'chat',
                              'time',
                              'personal',
                              'camera',
                              'app',
                          ],
                          batch_size_per_gpu=1,
                          max_seq_length=512,
                          multi_gpu=False,
                          multi_label=True,
                          model_type='bert')

import torch
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy
import logging

logger = logging.getLogger()
logfile = str(LOG_PATH /
              'log-{}-{}.txt'.format(run_start_time, 'doc_rerank_bioasq7'))
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[logging.FileHandler(logfile),
              logging.StreamHandler(sys.stdout)])
logger = logging.getLogger()

databunch = BertDataBunch(
    DATA_PATH, LABEL_PATH,
    tokenizer='bert-base-uncased',
    train_file='/home/dpappas/fast_bert_models/doc_rerank/train.csv',
    val_file='/home/dpappas/fast_bert_models/doc_rerank/val.csv',
    label_file='/home/dpappas/fast_bert_models/doc_rerank/labels.csv',
    text_col='text',
    label_col='label',
    batch_size_per_gpu=6,
    max_seq_length=512,
    multi_gpu=multi_gpu,
    multi_label=False,
    model_type='bert')

metrics = []
metrics.append({'name': 'accuracy', 'function': accuracy})
metrics.append({'name': 'roc_auc', 'function': roc_auc_2})
metrics.append({'name': 'fbeta', 'function': fbeta_2})

learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='bert-base-uncased',
def train_bert(experiment_parameters, args):
    # logging
    run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')
    logfile = str(experiment_parameters.LOG_PATH /
                  'log-{}-{}.txt'.format(run_start_time, args["run_text"]))
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        handlers=[
            logging.FileHandler(logfile),
            logging.StreamHandler(sys.stdout)
        ])
    logger = logging.getLogger()

    # cuda
    device = torch.device('cuda')
    if torch.cuda.device_count() > 1:
        args.multi_gpu = True
    else:
        args.multi_gpu = False

    print()
    print('BERT training file: ', args['data_dir'], 'train.csv')

    # create a fast-bert-specific data format
    torch.manual_seed(args.seed)
    databunch = BertDataBunch(args['data_dir'],
                              experiment_parameters.LABEL_PATH,
                              experiment_parameters.tokenizer,
                              train_file='train.csv',
                              val_file=None,  # 'test.csv'
                              test_data='test.csv',
                              text_col="comment_text",
                              label_col=experiment_parameters.LABEL_COLS,
                              batch_size_per_gpu=args['train_batch_size'],
                              max_seq_length=args['max_seq_length'],
                              multi_gpu=args.multi_gpu,
                              multi_label=True,
                              model_type=args.model_type,
                              clear_cache=False)

    metrics = []
    metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
    metrics.append({'name': 'roc_auc', 'function': roc_auc})
    metrics.append({'name': 'fbeta', 'function': fbeta})

    # create learner object
    learner = BertLearner.from_pretrained_model(
        databunch,
        args.model_name,
        metrics=metrics,
        device=device,
        logger=logger,
        output_dir=args.output_dir,
        finetuned_wgts_path=experiment_parameters.FINETUNED_PATH,
        warmup_steps=args.warmup_steps,
        multi_gpu=args.multi_gpu,
        is_fp16=args.fp16,
        multi_label=True,
        logging_steps=0)

    # train
    torch.manual_seed(args.seed)
    learner.fit(args.num_train_epochs, args.learning_rate, validate=False)

    # save
    learner.save_model()

    # free memory and exit
    del learner
    return
import sys
import logging

logging.basicConfig(level=logging.DEBUG)

DATA_PATH = sys.argv[1]
LABEL_PATH = sys.argv[2]
OUTPUT_DIR = sys.argv[3]

databunch = BertDataBunch(DATA_PATH, LABEL_PATH,
                          tokenizer='roberta-base',
                          train_file='train.csv',
                          val_file='valid.csv',
                          label_file='labels.csv',
                          text_col='text',
                          label_col='label',
                          batch_size_per_gpu=64,
                          max_seq_length=50,
                          multi_gpu=False,
                          multi_label=False,
                          model_type='roberta')

logger = logging.getLogger()
device_cuda = torch.device("cuda")
metrics = [{'name': 'accuracy', 'function': accuracy}]

learner = BertLearner.from_pretrained_model(databunch,
                                            pretrained_path='roberta-base',
                                            metrics=metrics,
                                            device=device_cuda,
logger = logging.getLogger()
device_cuda = torch.device("cuda")
metrics = [{"name": "accuracy", "function": accuracy}]

databunch = BertDataBunch(
    DATA_PATH,
    LABEL_PATH,
    tokenizer="bert-base-uncased",
    train_file="train.csv",
    val_file="val.csv",
    label_file="labels.csv",
    text_col="text",
    label_col=[
        "domestic",
        "county",
        "city",
        "regional",
        "state",
        "national",
        "international",
        "is_relevant",
    ],
    batch_size_per_gpu=1,
    max_seq_length=2,
    multi_gpu=False,
    multi_label=True,
    model_type="bert",
)

learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path="bert-base-uncased",
args.multi_gpu = False

label_cols = [
    "job_loss", "is_unemployed", "job_search", "is_hired", "job_offer"
]
# label_cols = ['pos', 'neg']
# label_cols = ['pos']

databunch = BertDataBunch(
    args['data_dir'], LABEL_PATH,
    args.model_name,
    train_file='train.csv',
    val_file='val.csv',
    label_file='labels.csv',
    # test_data='test.csv',
    text_col="text",  # name of the column in the train file that contains the tweet text
    label_col=label_cols,
    batch_size_per_gpu=args['train_batch_size'],
    max_seq_length=args['max_seq_length'],
    multi_gpu=args.multi_gpu,
    multi_label=True,
    model_type=args.model_type)

num_labels = len(databunch.labels)
print('num_labels', num_labels)
print('time taken to load all this stuff:', str(time.time() - start_time), 'seconds')

# metrics defined: https://github.com/kaushaltrivedi/fast-bert/blob/d89e2aa01d948d6d3cdea7ad106bf5792fea7dfa/fast_bert/metrics.py
######################

DATA_PATH = Path('./sample_data/multi_label_toxic_comments/data/')    # path for data files (train and val)
LABEL_PATH = Path('./sample_data/multi_label_toxic_comments/label/')  # path for labels file
MODEL_PATH = Path('../models/')  # path for model artifacts to be stored
LOG_PATH = Path('../logs/')      # path for log files to be stored

databunch = BertDataBunch(DATA_PATH, LABEL_PATH,
                          tokenizer='bert-base-uncased',
                          train_file='train.csv',
                          val_file='val.csv',
                          label_file='labels.csv',
                          text_col='text',
                          label_col=['toxic', 'severe_toxic', 'obscene',
                                     'threat', 'insult', 'identity_hate'],
                          batch_size_per_gpu=8,
                          max_seq_length=512,
                          multi_gpu=True,
                          multi_label=True,
                          model_type='bert')

######################
###                ###
###    LEARNER     ###
###                ###
######################
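# The learner block announced by the banner above is not part of the snippet.
# A plausible continuation, following the same fast-bert pattern used in the
# other snippets of this collection (every value below is an assumption):
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy_thresh
import logging
import torch

logger = logging.getLogger()
device_cuda = torch.device("cuda")
metrics = [{'name': 'accuracy_thresh', 'function': accuracy_thresh}]

learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='bert-base-uncased',
    metrics=metrics,
    device=device_cuda,
    logger=logger,
    output_dir=MODEL_PATH,
    warmup_steps=500,
    multi_gpu=True,
    is_fp16=False,
    multi_label=True,
    logging_steps=50)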
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[logging.FileHandler(logfile),
              logging.StreamHandler(sys.stdout)])
logger = logging.getLogger()
logger.info(args)

databunch = BertDataBunch(args['data_dir'], LABEL_PATH,
                          args.model_name,
                          train_file='train.csv',
                          val_file='val.csv',
                          test_data='test.csv',
                          text_col="comment_text",
                          label_col=label_cols,
                          batch_size_per_gpu=args['train_batch_size'],
                          max_seq_length=args['max_seq_length'],
                          multi_gpu=args.multi_gpu,
                          multi_label=True,
                          model_type=args.model_type)

print(databunch.train_dl.dataset[0][3])
num_labels = len(databunch.labels)
print(num_labels)

metrics = []
metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'fbeta', 'function': fbeta})
def train_fast_bert():
    MAX_LEN = 512  # previous model was 300
    text_col = 'script'
    label_col = [
        'Action', 'Adventure', 'Comedy', 'Crime', 'Drama', 'Fantasy',
        'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller'
    ]
    DATA_PATH = Path('./data/')
    LABEL_PATH = DATA_PATH
    train_file = 'fast_train_' + str(MAX_LEN) + '.csv'
    val_file = 'fast_val_' + str(MAX_LEN) + '.csv'

    goodtogo = check_fastBert_data(MAX_LEN)
    if not goodtogo:
        raise RuntimeError('fast-bert data files are missing')

    MODEL_NAME = 'bert-base-uncased'
    databunch = BertDataBunch(DATA_PATH, LABEL_PATH,
                              tokenizer=MODEL_NAME,
                              train_file=train_file,
                              val_file=val_file,
                              label_file='fast_labels.csv',
                              text_col=text_col,
                              label_col=label_col,
                              batch_size_per_gpu=16,
                              max_seq_length=MAX_LEN,
                              multi_gpu=False,
                              multi_label=True,
                              model_type='bert')

    # **NOTE** remember to change `usePretrained` to True if we already have a fine-tuned model
    def my_accuracy_thresh(
        y_pred: Tensor,
        y_true: Tensor,
        thresh: float = 0.7,
        sigmoid: bool = False,
    ):
        "Compute accuracy when `y_pred` and `y_true` are the same size."
        if sigmoid:
            y_pred = y_pred.sigmoid()
        return ((y_pred > thresh) == y_true.bool()).float().mean().item()

    logging.basicConfig(level=logging.NOTSET)
    logger = logging.getLogger()
    device_cuda = torch.device("cuda")
    metrics = [{'name': 'accuracy_thresh', 'function': my_accuracy_thresh}]

    OUTPUTDIR = Path('./models/')
    MODEL_PATH = OUTPUTDIR / 'model_out_bert_cased'
    usePretrained = False
    if usePretrained:
        pretrained_path = MODEL_PATH  # to resume from a saved model
    else:
        pretrained_path = 'bert-base-uncased'

    # Setting up apex properly on Colab required downgrading the Torch version
    # (check the first block of the notebook for details)
    learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path=pretrained_path,  # was mistakenly `usePretrained`, a bool
        metrics=metrics,
        device=device_cuda,
        logger=logger,
        output_dir=OUTPUTDIR,
        finetuned_wgts_path=None,
        warmup_steps=500,
        multi_gpu=False,
        is_fp16=False,  # needs apex set up properly for this (note above)
        multi_label=True,
        logging_steps=50)

    learner.fit(
        epochs=5,
        lr=6e-4,
        validate=True,  # Evaluate the model after each epoch
        schedule_type="warmup_cosine",
        optimizer_type="lamb")
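# The function above trains but never persists the result. If the fine-tuned
# weights are needed afterwards, a closing call consistent with the other
# snippets in this collection would be:
#
#     learner.save_model()  # fast-bert writes the model to output_dir/model_out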