def train(path_to_directory, model):

    DATA_PATH = BASE / path_to_directory
    OUTPUT_DIR = DATA_PATH / 'output' / model
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    if model == "biobert":
        tokenizer = BertTokenizer.from_pretrained(BIOBERT_PATH,
                                                  do_lower_case=True)
        pretrained_path = BIOBERT_PATH
    elif model == "bert":
        tokenizer = "bert-base-uncased"
        pretrained_path = "bert-base-uncased"
    else:
        print("Model parameter must be either 'bert' or 'biobert'")
        return

    databunch = BertDataBunch(DATA_PATH,
                              LABEL_PATH,
                              tokenizer=tokenizer,
                              train_file='train.csv',
                              val_file='val.csv',
                              text_col='text',
                              label_file='labels.csv',
                              label_col=labels,
                              batch_size_per_gpu=10,
                              max_seq_length=512,
                              multi_gpu=multi_gpu,
                              multi_label=True,
                              model_type='bert',
                              clear_cache=True)

    learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path=pretrained_path,
        metrics=metrics,
        device=device_cuda,
        logger=logger,
        output_dir=OUTPUT_DIR,
        finetuned_wgts_path=None,
        warmup_steps=500,
        multi_gpu=multi_gpu,
        is_fp16=True,
        multi_label=True,
        logging_steps=20)

    if path_to_directory.split('/', 1)[1] in ['original', 'synthetic']:
        epochs = 20
    else:
        epochs = 10

    learner.fit(
        epochs=epochs,
        lr=6e-5,
        validate=True,  # Evaluate the model after each epoch
        schedule_type="warmup_cosine")

    learner.save_model()

    return
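
A hedged usage sketch for this function: BASE, BIOBERT_PATH, LABEL_PATH, labels, metrics, device_cuda, logger, and multi_gpu are module-level globals the function relies on, and the directory name below is illustrative.

train('experiments/original', 'biobert')  # last path segment 'original' selects 20 epochs per the rule above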
Example #2
    def __init__(self):
        databunch = BertDataBunch('train',
                                  'train',
                                  tokenizer='distilbert-base-uncased',
                                  train_file='train.csv',
                                  val_file='val.csv',
                                  label_file='labels.csv',
                                  text_col='text',
                                  label_col='label',
                                  batch_size_per_gpu=8,
                                  max_seq_length=512,
                                  multi_gpu=False,
                                  multi_label=False,
                                  model_type='distilbert')

        device_cuda = torch.device("cuda")
        metrics = [{'name': 'accuracy', 'function': accuracy}]
        logger = logging.getLogger()

        self.learner = BertLearner.from_pretrained_model(databunch,
                                                         pretrained_path='distilbert-base-uncased',
                                                         metrics=metrics,
                                                         device=device_cuda,
                                                         output_dir='models',
                                                         warmup_steps=100,
                                                         logger=logger,
                                                         multi_gpu=False,
                                                         is_fp16=False,  # install apex to use fp16 training
                                                         multi_label=False,
                                                         logging_steps=0)
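
A hedged usage sketch for this wrapper: the class name Classifier is hypothetical (the original snippet shows only the __init__ body), and predict_batch is assumed here as fast-bert's batch-prediction helper on the learner.

clf = Classifier()
print(clf.learner.predict_batch(["This movie was great"]))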
Example #3
async def setup_learner():
    await download_file(pretrained_link, path / modelname)
    #await download_file(vocablink, path / vocab),
    #await download_file(sptokenlink, path / sptoken),
    #await download_file(tokenlink, path / token),
    #await download_file(configlink, path / config),
    #await download_file(l2link, path / l2)
    try:
        data_bunch = BertDataBunch(path,
                                   path,
                                   tokenizer=path,
                                   train_file=None,
                                   val_file=None,
                                   label_file='l2.csv',
                                   batch_size_per_gpu=120,
                                   max_seq_length=40,
                                   multi_gpu=False,
                                   multi_label=False,
                                   model_type='bert')

        learner = BertLearner.from_pretrained_model(data_bunch,
                                                    pretrained_path=path,
                                                    metrics=[],
                                                    device='cpu',
                                                    logger=None,
                                                    output_dir=None,
                                                    is_fp16=False)
        return learner
    except RuntimeError as e:
        if len(e.args) > 0 and 'CPU-only machine' in e.args[0]:
            print(e)
            message = "\n\nThis model was trained with an old version of fastai and will not work in a CPU environment.\n\nPlease update the fastai library in your training environment and export your model again.\n\nSee instructions for 'Returning to work' at https://course.fast.ai."
            raise RuntimeError(message)
        else:
            raise
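
A sketch of how this coroutine is typically driven at startup; the asyncio wiring below is an assumption, not part of the original snippet (download_file, pretrained_link, path, and modelname come from the surrounding deployment template).

import asyncio

learner = asyncio.run(setup_learner())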
def train(args):
	if args.is_onepanel:
		args.out_dir = os.path.join("/onepanel/output/",args.out_dir)
	if not os.path.exists(args.out_dir):
		os.mkdir(args.out_dir)

	logger = logging.getLogger()
	labels = ["anger", "anticipation","disgust","fear","joy","love","optimism","pessimism","sadness","surprise","trust","neutral"]
	databunch = BertDataBunch(".", ".",
								tokenizer=args.pretrained_model,
								train_file='nlp_train.csv',
								label_file='labels.csv',
								val_file="nlp_valid.csv",
								text_col='text',
								label_col=labels,
								batch_size_per_gpu=args.batch_size,
								max_seq_length=512,
								multi_gpu=False,
								multi_label=True,
								model_type='bert')

	device_cuda = torch.device("cuda")
	metrics = [{'name': 'accuracy', 'function': accuracy}]

	learner = BertLearner.from_pretrained_model(
							databunch,
							pretrained_path=args.pretrained_model,
							metrics=metrics,
							device=device_cuda,
							logger=logger,
							output_dir=args.out_dir,
							finetuned_wgts_path=None,
							warmup_steps=200,
							multi_gpu=False,
							is_fp16=False,
							multi_label=True,
							logging_steps=10)

	learner.fit(epochs=args.epochs,
				lr=2e-3,
				schedule_type="warmup_cosine_hard_restarts",
				optimizer_type="lamb")
				# validate=True)
	learner.save_model()
    tokenizer.encode("Hello, my dog is cute",
                     add_special_tokens=True)).unsqueeze(0)  # Batch size 1
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]

# Setup and train the BERT model
device_cuda = torch.device('cpu')
logger.debug('Create Data Bunch')
databunch = BertDataBunch(DATA_PATH,
                          LABEL_PATH,
                          tokenizer='bert-base-uncased',
                          train_file='train.csv',
                          val_file='val.csv',
                          label_file='labels.csv',
                          text_col='text',
                          label_col='label',
                          max_seq_length=512,
                          batch_size_per_gpu=128,
                          multi_gpu=False,
                          multi_label=False,
                          model_type='bert')
logger.debug('Opening Learner')
learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='bert-base-uncased',
    metrics=metrics,
    device=device_cuda,
    logger=logger,
    output_dir=OUTPUT_DIR,
    finetuned_wgts_path=None)
if torch.cuda.device_count() > 1:
    args.multi_gpu = True
else:
    args.multi_gpu = False

# select the label columns to use
label_cols = (sentence_labels
              if train_for == 'sentence' else fine_grained_labels)

# build the databunch
databunch = BertDataBunch(args['data_dir'],
                          LABEL_PATH,
                          tokenizer,
                          train_file='train.csv',
                          val_file='val.csv',
                          text_col='text',
                          label_col=label_cols,
                          batch_size_per_gpu=args['train_batch_size'],
                          max_seq_length=args['max_seq_length'],
                          multi_gpu=args.multi_gpu,
                          multi_label=True,
                          model_type=args.model_type)

# sanity-check the databunch
print(''.center(31, '*'))
print(databunch.train_dl.dataset[0][3])
print(len(databunch.labels))

# configure distributed training
# torch.distributed.init_process_group(
#     backend="nccl",
#     init_method="tcp://localhost:23459",
Example #7

def F1_micro(y_pred: Tensor, y_true: Tensor):
    return F1_macro(y_pred.cpu(), y_true.cpu(), average='micro')
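
# Note: F1_macro (defined elsewhere in the original module) is assumed to wrap
# sklearn's f1_score and forward the `average` keyword, which is why F1_micro
# can delegate to it with average='micro'.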


logging.basicConfig(level=logging.NOTSET)
logger = logging.getLogger()

databunch = BertDataBunch('./',
                          './',
                          tokenizer='bert-base-german-dbmdz-cased',
                          train_file='train.csv',
                          val_file='dev.csv',
                          label_file='labels.csv',
                          text_col='text',
                          label_col='label',
                          batch_size_per_gpu=64,
                          max_seq_length=128,
                          multi_gpu=False,
                          multi_label=False,
                          model_type='bert')

device_cuda = torch.device('cuda')

metrics = []
metrics.append({'name': 'accuracy', 'function': accuracy})
metrics.append({'name': 'F1_macro', 'function': F1_macro})
metrics.append({'name': 'F1_micro', 'function': F1_micro})

learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='bert-base-german-dbmdz-cased',  # assumed to match the tokenizer above
    metrics=metrics,
    device=device_cuda,
    logger=logger,
    output_dir='output')  # 'output' is an illustrative completion of the truncated call
Example #8
logger = logging.getLogger()
device_cuda = torch.device("cuda")
# from transformers import BertTokenizer

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

LABEL_PATH = DATA_PATH = r'./data'

label_cols = ["anger", "fear", "joy", "sadness", "surprise"]
databunch = BertDataBunch(
    DATA_PATH,
    LABEL_PATH,
    tokenizer='bert-base-uncased',
    train_file='train.csv',
    val_file='valid.csv',  # val.csv
    label_file='labels.csv',
    text_col='content',
    label_col=label_cols,
    batch_size_per_gpu=2,
    max_seq_length=512,
    multi_gpu=True,
    multi_label=True,
    model_type='bert')

from fast_bert.metrics import accuracy_multilabel
from fast_bert.learner_cls import BertLearner
metrics = [{'name': 'accuracy', 'function': accuracy_multilabel}]
learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='bert-base-uncased',
    metrics=metrics,
    device=device_cuda,
    logger=logger,
    output_dir='output',  # 'output' is an illustrative completion of the truncated call
    multi_gpu=True,
    multi_label=True)
Example #9
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="Path specifying the location of the dataset")

    parser.add_argument(
        "--label_dir",
        default=None,
        type=str,
        required=True,
        help="Path specifying the location of the labels.csv file")

    parser.add_argument(
        "--output_dir",
        default=None,
        required=True,
        type=str,
        help="Path specifying the location to save the results")

    parser.add_argument("--text_col",
                        default=None,
                        required=True,
                        type=str,
                        help="The column name of the text")

    parser.add_argument("--batch_size",
                        default=16,
                        required=False,
                        type=int,
                        help="Batch size per GPU")

    parser.add_argument(
        "--max_seq_len",
        default=320,
        required=False,
        type=int,
        help="Maximum length of the token sequence to input to BERT")

    parser.add_argument("--multi_gpu",
                        default=False,
                        required=False,
                        type=bool,
                        help="Whether to use multi-gpu for training")

    parser.add_argument("--epochs",
                        default=6,
                        type=int,
                        required=False,
                        help="Number of epochs to train")

    parser.add_argument("--lr",
                        default=6e-5,
                        type=float,
                        required=False,
                        help="Initial learning rate for training")

    parser.add_argument("--save_model",
                        required=False,
                        default=None,
                        help="Whether to save the model or not")

    parser.add_argument("--eval",
                        required=False,
                        type=bool,
                        default=True,
                        help="Whether to run evaluation after each epoch")

    args = parser.parse_args()

    DATA_PATH = args.data_dir

    LABEL_PATH = args.label_dir

    OUTPUT_PATH = args.output_dir

    EPOCHS = args.epochs

    LR = args.lr

    EVAL = args.eval

    TEXT_COL = args.text_col

    BATCH_SIZE = args.batch_size

    MAX_SEQ_LEN = args.max_seq_len

    MULTI_GPU = args.multi_gpu

    labels = pd.read_csv(os.path.join(DATA_PATH, 'labels.csv'),
                         header=None).values

    LABEL_LIST = [val[0] for val in labels]

    databunch = BertDataBunch(DATA_PATH,
                              LABEL_PATH,
                              tokenizer='bert-base-uncased',
                              train_file='m_aspect_train.csv',
                              val_file='m_aspect_test.csv',
                              label_file='labels.csv',
                              text_col=TEXT_COL,
                              label_col=LABEL_LIST,
                              batch_size_per_gpu=BATCH_SIZE,
                              max_seq_length=MAX_SEQ_LEN,
                              multi_gpu=MULTI_GPU,
                              multi_label=True,
                              model_type='bert',
                              no_cache=True)

    # display(databunch.get_dl_from_texts)

    device_cuda = torch.device("cuda")
    metrics = [{'name': 'accuracy', 'function': accuracy_multilabel}]

    learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path='bert-base-uncased',
        metrics=metrics,
        device=device_cuda,
        logger=None,
        output_dir=OUTPUT_PATH,
        finetuned_wgts_path=None,
        warmup_steps=500,
        multi_gpu=MULTI_GPU,
        is_fp16=False,
        multi_label=True,
        logging_steps=50)

    global_step, loss = learner.fit(
        epochs=EPOCHS,
        lr=LR,
        validate=EVAL,  # Evaluate the model after each epoch
        schedule_type="warmup_cosine",
        optimizer_type="lamb")

    print("global_Step:", global_step, "loss:", loss)

    if args.save_model:
        learner.save_model()
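
Assuming the script above is saved as train_aspects.py (a hypothetical name), a typical invocation would be:

python train_aspects.py --data_dir ./data --label_dir ./data --output_dir ./output --text_col text --save_model yes

Note that argparse's type=bool treats any non-empty string as True, so --eval False would still enable evaluation; omit the flag to keep the default.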
Example #10
def create_model(column, epoch):
    if not os.path.exists(
            '/scratch/da2734/twitter/jobs/running_on_200Msamples/logs/log_binary_pos_neg_{}/'
            .format(column)):
        os.makedirs(
            '/scratch/da2734/twitter/jobs/running_on_200Msamples/logs/log_binary_pos_neg_{}/'
            .format(column))

    LOG_PATH = Path(
        '/scratch/da2734/twitter/jobs/running_on_200Msamples/logs/log_binary_pos_neg_{}/'
        .format(column))
    print('LOG_PATH', LOG_PATH)
    DATA_PATH = Path(
        '/scratch/da2734/twitter/data/may20_9Klabels/data_binary_pos_neg_balanced/'
    )
    LABEL_PATH = Path(
        '/scratch/da2734/twitter/data/may20_9Klabels/data_binary_pos_neg_balanced/'
    )
    OUTPUT_PATH = Path(
        '/scratch/da2734/twitter/jobs/training_binary/models_may20_9Klabels/output_{}'
        .format(column))
    FINETUNED_PATH = None

    args = Box({
        "run_text": "100Msamples",
        "train_size": -1,
        "val_size": -1,
        "log_path": LOG_PATH,
        "full_data_dir": DATA_PATH,
        "data_dir": DATA_PATH,
        "task_name": "labor_market_classification",
        "no_cuda": False,
        #     "bert_model": BERT_PRETRAINED_PATH,
        "output_dir": OUTPUT_PATH,
        "max_seq_length": 512,
        "do_train": True,
        "do_eval": True,
        "do_lower_case": True,
        "train_batch_size": 8,
        "eval_batch_size": 16,
        "learning_rate": 5e-5,
        "num_train_epochs": 100,
        "warmup_proportion": 0.0,
        "no_cuda": False,
        "local_rank": -1,
        "seed": 42,
        "gradient_accumulation_steps": 1,
        "optimize_on_cpu": False,
        "fp16": False,
        "fp16_opt_level": "O1",
        "weight_decay": 0.0,
        "adam_epsilon": 1e-8,
        "max_grad_norm": 1.0,
        "max_steps": -1,
        "warmup_steps": 500,
        "logging_steps": 50,
        "eval_all_checkpoints": True,
        "overwrite_output_dir": True,
        "overwrite_cache": True,
        "seed": 42,
        "loss_scale": 128,
        "task_name": 'intent',
        "model_name": 'bert-base-uncased',
        "model_type": 'bert'
    })

    import logging

    logfile = str(LOG_PATH /
                  'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        handlers=[
            logging.FileHandler(logfile),
            logging.StreamHandler(sys.stdout)
        ])

    logger = logging.getLogger()

    logger.info(args)

    device = torch.device('cuda')
    if torch.cuda.device_count() > 1:
        args.multi_gpu = True
    else:
        args.multi_gpu = False

    label_cols = ['class']

    databunch = BertDataBunch(
        args['data_dir'],
        LABEL_PATH,
        args.model_name,
        train_file='train_{}.csv'.format(column),
        val_file='val_{}.csv'.format(column),
        label_file='label_{}.csv'.format(column),
        # test_data='test.csv',
        text_col="text",  # the column in the train file that contains the tweet text
        label_col=label_cols,
        batch_size_per_gpu=args['train_batch_size'],
        max_seq_length=args['max_seq_length'],
        multi_gpu=args.multi_gpu,
        multi_label=False,
        model_type=args.model_type)

    num_labels = len(databunch.labels)
    print('num_labels', num_labels)

    print('time taken to load all this stuff:', str(time.time() - start_time),
          'seconds')

    # metrics defined: https://github.com/kaushaltrivedi/fast-bert/blob/d89e2aa01d948d6d3cdea7ad106bf5792fea7dfa/fast_bert/metrics.py
    metrics = []
    # metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
    # metrics.append({'name': 'roc_auc', 'function': roc_auc})
    # metrics.append({'name': 'fbeta', 'function': fbeta})
    metrics.append({'name': 'accuracy', 'function': accuracy})
    metrics.append({
        'name': 'roc_auc_save_to_plot_binary',
        'function': roc_auc_save_to_plot_binary
    })
    # metrics.append({'name': 'accuracy_multilabel', 'function': accuracy_multilabel})

    learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path=
        '/scratch/da2734/twitter/jobs/training_binary/models_may20_9Klabels/output_{}/model_out_{}/'
        .format(column, epoch),
        metrics=metrics,
        device=device,
        logger=logger,
        output_dir=args.output_dir,
        finetuned_wgts_path=FINETUNED_PATH,
        warmup_steps=args.warmup_steps,
        multi_gpu=args.multi_gpu,
        is_fp16=args.fp16,
        multi_label=False,
        logging_steps=0)

    return learner
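
A hedged usage sketch; the column name and checkpoint epoch below are illustrative, and predict_batch is assumed as fast-bert's batch-prediction helper.

learner = create_model('pos_neg', 0)
print(learner.predict_batch(['I lost my job today']))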
Example #11
from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc

label_cols = ['LN' + str(i) for i in range(1, 22)]

tokenizer = BertTokenizer(
    vocab_file='./chinese_roberta_wwm_large_ext_pytorch/vocab.txt')

databunch = BertDataBunch(data_dir='./Data/loan/new_data',
                          label_dir='./Data/loan/new_data',
                          tokenizer=tokenizer,
                          train_file='train.csv',
                          val_file='',
                          label_file='labels.csv',
                          text_col='text',
                          label_col=label_cols,
                          batch_size_per_gpu=4,
                          max_seq_length=512,
                          multi_gpu=True,
                          multi_label=True,
                          backend='nccl',
                          model_type='bert')

metrics = []
metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'fbeta', 'function': fbeta})
device_cuda = torch.device("cuda")
logger = logging.getLogger()

learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='./chinese_roberta_wwm_large_ext_pytorch',  # assumed to match the vocab file above
    metrics=metrics,
    device=device_cuda,
    logger=logger,
    output_dir='output',  # 'output' is an illustrative completion of the truncated call
    multi_gpu=True,
    multi_label=True)
run_start_time = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
logfile = str("log/log-{}-{}.txt".format(run_start_time, "bert_base_uncased"))

logger = logging.getLogger()

device_cuda = torch.device("cuda")
metrics = [{"name": "Acc", "function": accuracy}]

databunch = BertDataBunch(
    data_path,
    label_path,
    tokenizer=tokenizer,
    train_file="train_combined.csv",
    val_file=val_file,
    label_file="labels.csv",
    text_col="text",
    label_col="label",
    batch_size_per_gpu=32,
    max_seq_length=128,
    multi_gpu=False,
    multi_label=False,
    model_type="bert",
)

learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path=pretrained_path,
    metrics=metrics,
    device=device_cuda,
    logger=logger,
    output_dir=out_path)
from fast_bert.data_cls import BertDataBunch
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy
import logging

PRETRAINED_PATH = '../../bert_model/model_out/'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_PATH, do_lower_case=False)

databunch = BertDataBunch('../../../dados/',
                          '../../../dados/',
                          tokenizer,
                          train_file='bert_train_pt.csv',
                          val_file='bert_val_pt.csv',
                          label_file='labels.csv',
                          text_col='text',
                          label_col='label',
                          batch_size_per_gpu=768,
                          max_seq_length=15,
                          multi_gpu=False,
                          multi_label=False,
                          model_type='bert')

OUTPUT_DIR = '../../bert_model/'
logger = logging.getLogger()
device_cuda = torch.device("cuda")
metrics = [{'name': 'accuracy', 'function': accuracy}]

learner = BertLearner.from_pretrained_model(databunch,
                                            pretrained_path=PRETRAINED_PATH,
                                            metrics=metrics,
                                            device=device_cuda,
                                            logger=logger,
                                            output_dir=OUTPUT_DIR)  # completed with the values defined above
from fast_bert.data_cls import BertDataBunch

DATA_PATH = "data"
LABEL_PATH = "label"

databunch = BertDataBunch(DATA_PATH, LABEL_PATH,
                          tokenizer='bert-base-uncased',
                          train_file='train_sample.csv',
                          val_file='val_sample.csv',
                          label_file='labels.csv',
                          text_col='text',
                          label_col=['location',
                                     'chat',
                                     'time',
                                     'personal',
                                     'camera',
                                     'app'],
                          batch_size_per_gpu=1,
                          max_seq_length=512,
                          multi_gpu=False,
                          multi_label=True,
                          model_type='bert')


import torch
from fast_bert.learner_cls import BertLearner
from fast_bert.metrics import accuracy
import logging

logger = logging.getLogger()
Example #15
logfile = str(LOG_PATH /
              'log-{}-{}.txt'.format(run_start_time, 'doc_rerank_bioasq7'))
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[logging.FileHandler(logfile),
              logging.StreamHandler(sys.stdout)])
logger = logging.getLogger()

databunch = BertDataBunch(
    DATA_PATH,
    LABEL_PATH,
    tokenizer='bert-base-uncased',
    train_file='/home/dpappas/fast_bert_models/doc_rerank/train.csv',
    val_file='/home/dpappas/fast_bert_models/doc_rerank/val.csv',
    label_file='/home/dpappas/fast_bert_models/doc_rerank/labels.csv',
    text_col='text',
    label_col='label',
    batch_size_per_gpu=6,
    max_seq_length=512,
    multi_gpu=multi_gpu,
    multi_label=False,
    model_type='bert')

metrics = []
metrics.append({'name': 'accuracy', 'function': accuracy})
metrics.append({'name': 'roc_auc', 'function': roc_auc_2})
metrics.append({'name': 'fbeta', 'function': fbeta_2})

learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path='bert-base-uncased',
    metrics=metrics,
    device=torch.device('cuda'),
    logger=logger,
    output_dir='output')  # device and output_dir are illustrative completions of the truncated call
Example #16
def train_bert(experiment_parameters, args):

    # logging
    run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')
    logfile = str(experiment_parameters.LOG_PATH/'log-{}-{}.txt'.format(run_start_time, args["run_text"]))

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        handlers=[
            logging.FileHandler(logfile),
            logging.StreamHandler(sys.stdout)
        ])

    logger = logging.getLogger()

    # cuda
    device = torch.device('cuda')
    if torch.cuda.device_count() > 1:
        args.multi_gpu = True
    else:
        args.multi_gpu = False

    print()
    print('BERT training file: ',args['data_dir'],'train.csv')

    # create a fast-bert-specific data format
    torch.manual_seed(args.seed)
    databunch = BertDataBunch(args['data_dir'], experiment_parameters.LABEL_PATH,
                              experiment_parameters.tokenizer,
                              train_file='train.csv',
                              val_file=None,#'test.csv',
                              test_data='test.csv',
                              text_col="comment_text", label_col=experiment_parameters.LABEL_COLS,
                              batch_size_per_gpu=args['train_batch_size'], max_seq_length=args['max_seq_length'],
                              multi_gpu=args.multi_gpu, multi_label=True, model_type=args.model_type, clear_cache=False)

    metrics = []
    metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
    metrics.append({'name': 'roc_auc', 'function': roc_auc})
    metrics.append({'name': 'fbeta', 'function': fbeta})

    # create learner object
    learner = BertLearner.from_pretrained_model(databunch, args.model_name, metrics=metrics,
                                            device=device, logger=logger, output_dir=args.output_dir,
                                            finetuned_wgts_path=experiment_parameters.FINETUNED_PATH,
                                            warmup_steps=args.warmup_steps,
                                            multi_gpu=args.multi_gpu, is_fp16=args.fp16,
                                            multi_label=True, logging_steps=0)

    # train
    torch.manual_seed(args.seed)
    learner.fit(args.num_train_epochs, args.learning_rate, validate=False)

    # save
    learner.save_model()

    # free memory and exit
    del learner
    return
Example #17
import sys

import logging
logging.basicConfig(level=logging.DEBUG)

DATA_PATH = sys.argv[1]
LABEL_PATH = sys.argv[2]
OUTPUT_DIR = sys.argv[3]

databunch = BertDataBunch(DATA_PATH,
                          LABEL_PATH,
                          tokenizer='roberta-base',
                          train_file='train.csv',
                          val_file='valid.csv',
                          label_file='labels.csv',
                          text_col='text',
                          label_col='label',
                          batch_size_per_gpu=64,
                          max_seq_length=50,
                          multi_gpu=False,
                          multi_label=False,
                          model_type='roberta')

logger = logging.getLogger()
device_cuda = torch.device("cuda")
metrics = [{'name': 'accuracy', 'function': accuracy}]

learner = BertLearner.from_pretrained_model(databunch,
                                            pretrained_path='roberta-base',
                                            metrics=metrics,
                                            device=device_cuda,
                                            logger=logger,
                                            output_dir=OUTPUT_DIR)  # completed with the values defined above
Example #18
logger = logging.getLogger()
device_cuda = torch.device("cuda")
metrics = [{"name": "accuracy", "function": accuracy}]

databunch = BertDataBunch(
    DATA_PATH,
    LABEL_PATH,
    tokenizer="bert-base-uncased",
    train_file="train.csv",
    val_file="val.csv",
    label_file="labels.csv",
    text_col="text",
    label_col=[
        "domestic",
        "county",
        "city",
        "regional",
        "state",
        "national",
        "international",
        "is_relevant",
    ],
    batch_size_per_gpu=1,
    max_seq_length=2,
    multi_gpu=False,
    multi_label=True,
    model_type="bert",
)

learner = BertLearner.from_pretrained_model(
    databunch,
    pretrained_path="bert-base-uncased",
Example #19
if torch.cuda.device_count() > 1:
    args.multi_gpu = True
else:
    args.multi_gpu = False

label_cols = [
    "job_loss", "is_unemployed", "job_search", "is_hired", "job_offer"
]
# label_cols = ['pos', 'neg']
# label_cols = ['pos']

databunch = BertDataBunch(
    args['data_dir'],
    LABEL_PATH,
    args.model_name,
    train_file='train.csv',
    val_file='val.csv',
    label_file='labels.csv',
    # test_data='test.csv',
    text_col="text",  # the column in the train file that contains the tweet text
    label_col=label_cols,
    batch_size_per_gpu=args['train_batch_size'],
    max_seq_length=args['max_seq_length'],
    multi_gpu=args.multi_gpu,
    multi_label=True,
    model_type=args.model_type)

num_labels = len(databunch.labels)
print('num_labels', num_labels)

print('time taken to load all this stuff:', str(time.time() - start_time),
      'seconds')

# metrics defined: https://github.com/kaushaltrivedi/fast-bert/blob/d89e2aa01d948d6d3cdea7ad106bf5792fea7dfa/fast_bert/metrics.py
Example #20
######################
###                ###
###   DATABUNCH    ###
###                ###
######################


DATA_PATH = Path('./sample_data/multi_label_toxic_comments/data/')     # path for data files (train and val)
LABEL_PATH = Path('./sample_data/multi_label_toxic_comments/label/')  # path for labels file
MODEL_PATH=Path('../models/')    # path for model artifacts to be stored
LOG_PATH=Path('../logs/')       # path for log files to be stored



databunch = BertDataBunch(DATA_PATH, LABEL_PATH,
                          tokenizer='bert-base-uncased',
                          train_file='train.csv',
                          val_file='val.csv',
                          label_file='labels.csv',
                          text_col='text',
                          label_col=['toxic','severe_toxic','obscene','threat','insult','identity_hate'],
                          batch_size_per_gpu=8,
                          max_seq_length=512,
                          multi_gpu=True,
                          multi_label=True,
                          model_type='bert')




######################
###                ###
###    LEARNER     ###
###                ###
######################
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    handlers=[logging.FileHandler(logfile),
              logging.StreamHandler(sys.stdout)])

logger = logging.getLogger()
logger.info(args)

databunch = BertDataBunch(args['data_dir'],
                          LABEL_PATH,
                          args.model_name,
                          train_file='train.csv',
                          val_file='val.csv',
                          test_data='test.csv',
                          text_col="comment_text",
                          label_col=label_cols,
                          batch_size_per_gpu=args['train_batch_size'],
                          max_seq_length=args['max_seq_length'],
                          multi_gpu=args.multi_gpu,
                          multi_label=True,
                          model_type=args.model_type)

print(databunch.train_dl.dataset[0][3])
num_labels = len(databunch.labels)
print(num_labels)

metrics = []
metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'fbeta', 'function': fbeta})
def train_fast_bert():

    MAX_LEN = 512  # previous model was 300

    text_col = 'script'
    label_col = [
        'Action', 'Adventure', 'Comedy', 'Crime', 'Drama', 'Fantasy', 'Horror',
        'Mystery', 'Romance', 'Sci-Fi', 'Thriller'
    ]
    DATA_PATH = Path('./data/')
    LABEL_PATH = DATA_PATH

    train_file = 'fast_train_' + str(MAX_LEN) + '.csv'
    val_file = 'fast_val_' + str(MAX_LEN) + '.csv'

    goodtogo = check_fastBert_data(MAX_LEN)
    if not goodtogo: die()

    MODEL_NAME = 'bert-base-uncased'

    databunch = BertDataBunch(DATA_PATH,
                              LABEL_PATH,
                              tokenizer=MODEL_NAME,
                              train_file=train_file,
                              val_file=val_file,
                              label_file='fast_labels.csv',
                              text_col=text_col,
                              label_col=label_col,
                              batch_size_per_gpu=16,
                              max_seq_length=MAX_LEN,
                              multi_gpu=False,
                              multi_label=True,
                              model_type='bert')

    # **NOTE** remember to change `usePretrained` to True if we already have a fine-tuned model

    def my_accuracy_thresh(
        y_pred: Tensor,
        y_true: Tensor,
        thresh: float = 0.7,
        sigmoid: bool = False,
    ):
        "Compute accuracy when `y_pred` and `y_true` are the same size."
        if sigmoid:
            y_pred = y_pred.sigmoid()
        return ((y_pred > thresh) == y_true.bool()).float().mean().item()

    logging.basicConfig(level=logging.NOTSET)
    logger = logging.getLogger()
    device_cuda = torch.device("cuda")
    metrics = [{'name': 'accuracy_thresh', 'function': my_accuracy_thresh}]

    OUTPUTDIR = Path('./models/')

    MODEL_PATH = OUTPUTDIR / 'model_out_bert_cased'

    usePretrained = False
    if usePretrained:
        pretrained_path = MODEL_PATH
    else:
        pretrained_path = 'bert-base-uncased'

    # Setting up apex properly on Colab required downgrading the Torch version (check the first block of the notebook for details)
    learner = BertLearner.from_pretrained_model(
        databunch,
        pretrained_path=pretrained_path,  # resolves to MODEL_PATH when usePretrained is True
        metrics=metrics,
        device=device_cuda,
        logger=logger,
        output_dir=OUTPUTDIR,
        finetuned_wgts_path=None,
        warmup_steps=500,
        multi_gpu=False,
        is_fp16=False,  # need apex setup properly for this (note above)
        multi_label=True,
        logging_steps=50)

    learner.fit(
        epochs=5,
        lr=6e-4,
        validate=True,  # Evaluate the model after each epoch
        schedule_type="warmup_cosine",
        optimizer_type="lamb")