def get_data():
  """Load official THYME data"""

  xml_regex = cfg.get('data', 'xml_regex')

  train_xml_dir = os.path.join(base, cfg.get('data', 'train_xml'))
  train_text_dir = os.path.join(base, cfg.get('data', 'train_text'))
  dev_xml_dir = os.path.join(base, cfg.get('data', 'dev_xml'))
  dev_text_dir = os.path.join(base, cfg.get('data', 'dev_text'))

  train_data = dtrdata.DTRData(
    train_xml_dir,
    train_text_dir,
    xml_regex,
    cfg.getint('args', 'context_chars'))
  dev_data = dtrdata.DTRData(
    dev_xml_dir,
    dev_text_dir,
    xml_regex,
    cfg.getint('args', 'context_chars'))

  x_train, y_train = train_data()
  x_dev, y_dev = dev_data()

  vectorizer = TfidfVectorizer(
    tokenizer=tokenizer,
    token_pattern=None,
    ngram_range=(1, 3))
  x_train = vectorizer.fit_transform(x_train)
  x_dev = vectorizer.transform(x_dev)

  return x_train, y_train, x_dev, y_dev
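# A minimal sketch (not part of this repo) of how the tf-idf features returned
# by get_data() could feed a linear baseline. LogisticRegression and the macro
# f1 report are assumptions, not the project's actual downstream classifier.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

def run_tfidf_baseline():
  """Train and score a hypothetical linear baseline on the vectorized data"""

  x_train, y_train, x_dev, y_dev = get_data()

  # 'balanced' mirrors the n_samples / (n_classes * count) weighting
  # used by the neural scripts below
  classifier = LogisticRegression(class_weight='balanced', max_iter=1000)
  classifier.fit(x_train, y_train)

  predictions = classifier.predict(x_dev)
  print('macro f1: %.4f' % f1_score(y_dev, predictions, average='macro'))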
def main():
  """Train an LSTM classifier"""

  train_data = dtrdata.DTRData(
    os.path.join(base, cfg.get('data', 'xmi_dir')),
    partition='train',
    n_files=cfg.get('data', 'n_files'))
  tr_texts, tr_labels = train_data.read()
  train_loader = utils.make_data_loader(
    tr_texts,
    tr_labels,
    cfg.getint('model', 'batch_size'),
    None,
    'train',
    utils.to_lstm_inputs)

  val_data = dtrdata.DTRData(
    os.path.join(base, cfg.get('data', 'xmi_dir')),
    partition='dev',
    n_files=cfg.get('data', 'n_files'))
  val_texts, val_labels = val_data.read()
  val_loader = utils.make_data_loader(
    val_texts,
    val_labels,
    cfg.getint('model', 'batch_size'),
    None,
    'dev',
    utils.to_lstm_inputs)

  model = LstmClassifier()

  # balanced class weights: n_samples / (n_classes * class_count)
  label_counts = torch.bincount(torch.IntTensor(tr_labels))
  weights = len(tr_labels) / (2.0 * label_counts)

  train(model, train_loader, val_loader, weights)
  evaluate(model, val_loader, weights)
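# The train() helper is defined elsewhere in the repo; the sketch below is one
# plausible shape for it, assuming each batch from the loader is an
# (inputs, labels) pair and that the class weights computed above go into a
# weighted cross-entropy loss. The optimizer choice and learning rate are
# assumptions.
import torch

def train_sketch(model, train_loader, weights, num_epochs=10, lr=1e-3):
  """Hypothetical weighted-loss training loop"""

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)

  criterion = torch.nn.CrossEntropyLoss(weight=weights.to(device))
  optimizer = torch.optim.Adam(model.parameters(), lr=lr)

  for epoch in range(num_epochs):
    model.train()
    epoch_loss, num_steps = 0.0, 0

    for inputs, labels in train_loader:
      inputs, labels = inputs.to(device), labels.to(device)

      optimizer.zero_grad()
      logits = model(inputs)
      loss = criterion(logits, labels)
      loss.backward()
      optimizer.step()

      epoch_loss += loss.item()
      num_steps += 1

    print('epoch: %d, loss: %.4f' % (epoch, epoch_loss / num_steps))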
def main():
  """Fine-tune bert"""

  train_data = dtrdata.DTRData(
    os.path.join(base, cfg.get('data', 'xmi_dir')),
    partition='train',
    n_files=cfg.get('data', 'n_files'))
  tr_texts, tr_labels = train_data.read()
  train_loader = make_data_loader(tr_texts, tr_labels, RandomSampler)

  val_data = dtrdata.DTRData(
    os.path.join(base, cfg.get('data', 'xmi_dir')),
    partition='dev',
    n_files=cfg.get('data', 'n_files'))
  val_texts, val_labels = val_data.read()
  val_loader = make_data_loader(val_texts, val_labels, SequentialSampler)

  model = BertClassifier.from_pretrained(
    'bert-base-uncased',
    num_labels=2)

  label_counts = torch.bincount(torch.IntTensor(tr_labels))
  weights = len(tr_labels) / (2.0 * label_counts)
  print('class weights:', weights)

  train(model, train_loader, val_loader, weights)
  evaluate(model, val_loader, weights)
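# make_data_loader() is defined elsewhere; this is a guess at its shape given
# how it is called above (texts, labels, sampler class). The tokenizer, max_len,
# and batch_size defaults are assumptions, and the tokenizer __call__ API
# requires a reasonably recent transformers version.
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer

def make_data_loader_sketch(texts, labels, sampler_class, max_len=128, batch_size=32):
  """Tokenize texts with the BERT tokenizer and wrap them in a DataLoader"""

  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  encoded = tokenizer(
    list(texts),
    padding='max_length',
    truncation=True,
    max_length=max_len,
    return_tensors='pt')

  dataset = TensorDataset(
    encoded['input_ids'],
    encoded['attention_mask'],
    torch.LongTensor(labels))

  # RandomSampler for training, SequentialSampler for evaluation
  return DataLoader(dataset, sampler=sampler_class(dataset), batch_size=batch_size)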
def main():
  """Train a bag-of-embeddings classifier"""

  train_data = dtrdata.DTRData(
    os.path.join(base, cfg.get('data', 'xmi_dir')),
    partition='train',
    n_files=cfg.get('data', 'n_files'))
  tr_texts, tr_labels = train_data.read()
  train_loader = utils.make_data_loader(
    tr_texts,
    tr_labels,
    cfg.getint('model', 'batch_size'),
    cfg.getint('data', 'max_len'),
    'train',
    utils.to_token_id_sequences)

  val_data = dtrdata.DTRData(
    os.path.join(base, cfg.get('data', 'xmi_dir')),
    partition='dev',
    n_files=cfg.get('data', 'n_files'))
  val_texts, val_labels = val_data.read()
  val_loader = utils.make_data_loader(
    val_texts,
    val_labels,
    cfg.getint('model', 'batch_size'),
    cfg.getint('data', 'max_len'),
    'dev',
    utils.to_token_id_sequences)

  model = BagOfEmbeddings()

  label_counts = torch.bincount(torch.IntTensor(tr_labels))
  weights = len(tr_labels) / (2.0 * label_counts)

  train(model, train_loader, val_loader, weights)
  evaluate(model, val_loader, weights)
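# BagOfEmbeddings is defined elsewhere in the repo; the sketch below shows one
# common way to build such a model (embed token ids, mean-pool, classify).
# Vocabulary size, embedding dimension, and number of classes are assumptions.
import torch

class BagOfEmbeddingsSketch(torch.nn.Module):
  """Average word embeddings followed by a linear classifier"""

  def __init__(self, vocab_size=30522, embed_dim=300, num_classes=2, pad_id=0):
    super().__init__()
    self.embed = torch.nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
    self.classifier = torch.nn.Linear(embed_dim, num_classes)

  def forward(self, token_ids):
    # token_ids: (batch, seq_len) -> embeddings: (batch, seq_len, embed_dim)
    embeddings = self.embed(token_ids)

    # mean-pool over non-padding positions only
    mask = (token_ids != self.embed.padding_idx).unsqueeze(-1).float()
    pooled = (embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

    return self.classifier(pooled)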
def main():
  """Fine-tune bert"""

  model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=4)

  if torch.cuda.is_available():
    device = torch.device('cuda')
    model.cuda()
  else:
    device = torch.device('cpu')
    model.cpu()

  optimizer, scheduler = make_optimizer_and_scheduler(model)

  train_data = dtrdata.DTRData(os.path.join(base, cfg.get('data', 'xmi_dir')))
  train_loader = make_data_loader(train_data, RandomSampler)

  for epoch in trange(cfg.getint('bert', 'num_epochs'), desc='epoch'):
    model.train()
    train_loss, num_train_examples, num_train_steps = 0, 0, 0

    for step, batch in enumerate(train_loader):
      batch = tuple(t.to(device) for t in batch)
      batch_inputs, batch_masks, batch_labels = batch
      optimizer.zero_grad()

      loss, logits = model(batch_inputs,
                           attention_mask=batch_masks,
                           labels=batch_labels)
      loss.backward()

      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      optimizer.step()
      scheduler.step()

      train_loss += loss.item()
      num_train_examples += batch_inputs.size(0)
      num_train_steps += 1

    print('epoch: %d, loss: %.4f' % (epoch, train_loss / num_train_steps))

  dev_data = dtrdata.DTRData(
    os.path.join(base, cfg.get('data', 'xmi_dir')),
    partition='dev',
    xml_ref_dir=os.path.join(base, cfg.get('data', 'ref_xml_dir')),
    xml_out_dir=cfg.get('data', 'out_xml_dir'))
  dev_loader = make_data_loader(dev_data, sampler=SequentialSampler)

  predictions = evaluate(model, dev_loader, device)
  dev_data.write(predictions)
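# evaluate() is not shown in this file; the sketch below matches how it is
# called in both BERT scripts (model, data loader, device) and simply collects
# argmax predictions over the dev set. The batch layout is assumed to mirror
# the training loop, and the (loss, logits) tuple return assumes an older
# transformers version, as in the loops above.
import torch

def evaluate_sketch(model, data_loader, device):
  """Run the model over a loader and return predicted label ids"""

  model.eval()
  predictions = []

  with torch.no_grad():
    for batch in data_loader:
      batch = tuple(t.to(device) for t in batch)
      batch_inputs, batch_masks, batch_labels = batch

      loss, logits = model(batch_inputs,
                           attention_mask=batch_masks,
                           labels=batch_labels)
      predictions.extend(logits.argmax(dim=1).tolist())

  return predictions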
def main():
  """Train a transformer classifier"""

  train_data = dtrdata.DTRData(
    os.path.join(base, cfg.get('data', 'xmi_dir')),
    partition='train',
    n_files=cfg.get('data', 'n_files'))
  tr_texts, tr_labels = train_data.read()
  train_loader = utils.make_data_loader(
    tr_texts,
    tr_labels,
    cfg.getint('model', 'batch_size'),
    cfg.getint('data', 'max_len'),
    'train',
    utils.to_transformer_inputs)

  val_data = dtrdata.DTRData(
    os.path.join(base, cfg.get('data', 'xmi_dir')),
    partition='dev',
    n_files=cfg.get('data', 'n_files'))
  val_texts, val_labels = val_data.read()
  val_loader = utils.make_data_loader(
    val_texts,
    val_labels,
    cfg.getint('model', 'batch_size'),
    cfg.getint('data', 'max_len'),
    'dev',
    utils.to_transformer_inputs)

  print('loaded %d training and %d validation samples' %
        (len(tr_texts), len(val_texts)))

  model = TransformerClassifier()

  label_counts = torch.bincount(torch.IntTensor(tr_labels))
  weights = len(tr_labels) / (2.0 * label_counts)

  train(model, train_loader, val_loader, weights)
  evaluate(model, val_loader, weights, suppress_output=False)
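# The class weights used throughout these scripts follow the standard
# "balanced" heuristic, n_samples / (n_classes * class_count). A small worked
# example with made-up counts:
import torch

labels = torch.IntTensor([0, 0, 0, 0, 0, 0, 1, 1])  # 6 negatives, 2 positives
counts = torch.bincount(labels)                      # tensor([6, 2])
weights = len(labels) / (2.0 * counts)               # tensor([0.6667, 2.0000])

# the minority class gets the larger weight, so a weighted cross-entropy loss
# penalizes mistakes on rare labels more heavily
print(weights)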
def main():
  """Fine-tune bert"""

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=4)
  if torch.cuda.is_available():
    model.cuda()
  else:
    model.cpu()

  # exclude bias and layer-norm parameters from weight decay,
  # as is standard practice when fine-tuning BERT
  no_decay = ['bias', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}]

  optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=cfg.getfloat('bert', 'lr'),
    eps=1e-8)
  scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=1000)

  train_data = dtrdata.DTRData(
    os.path.join(base, cfg.get('data', 'train_xml')),
    os.path.join(base, cfg.get('data', 'train_text')),
    cfg.get('data', 'xml_regex'),
    cfg.get('data', 'out_dir'),
    cfg.getint('args', 'context_chars'),
    cfg.getint('bert', 'max_len'))
  train_loader = make_data_loader(train_data, sampler=RandomSampler)

  for epoch in trange(cfg.getint('bert', 'num_epochs'), desc='epoch'):
    model.train()
    train_loss, num_train_examples, num_train_steps = 0, 0, 0

    for step, batch in enumerate(train_loader):
      batch = tuple(t.to(device) for t in batch)
      batch_inputs, batch_masks, batch_labels = batch
      optimizer.zero_grad()

      loss, logits = model(batch_inputs,
                           attention_mask=batch_masks,
                           labels=batch_labels)
      loss.backward()

      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      optimizer.step()
      scheduler.step()

      train_loss += loss.item()
      num_train_examples += batch_inputs.size(0)
      num_train_steps += 1

    print('epoch: %d, loss: %.4f' % (epoch, train_loss / num_train_steps))

  dev_data = dtrdata.DTRData(
    os.path.join(base, cfg.get('data', 'dev_xml')),
    os.path.join(base, cfg.get('data', 'dev_text')),
    cfg.get('data', 'xml_regex'),
    cfg.get('data', 'out_dir'),
    cfg.getint('args', 'context_chars'),
    cfg.getint('bert', 'max_len'))
  dev_loader = make_data_loader(dev_data, sampler=SequentialSampler)

  predictions = evaluate(model, dev_loader, device)
  dev_data.write(predictions)
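# The warmup/total step counts above are hard-coded (100 / 1000). A common
# alternative is to derive them from the data: total steps = batches per epoch
# times epochs, with warmup as a fraction of that. A minimal sketch of that
# design choice, using the same transformers helper as above; the warmup
# fraction is an assumption.
from transformers import get_linear_schedule_with_warmup

def make_scheduler_sketch(optimizer, train_loader, num_epochs, warmup_fraction=0.1):
  """Derive scheduler step counts from the loader instead of hard-coding them"""

  num_training_steps = len(train_loader) * num_epochs
  num_warmup_steps = int(warmup_fraction * num_training_steps)

  return get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps)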