def main():
    # Sweep the cross product of data and model configs; a fixed
    # data/model name disables the corresponding sweep dimension.
    data_cfgs = config.data.cfgs if config.data.name is None else [None]
    model_cfgs = config.model.cfgs if config.model.name is None else [None]
    for data_cfg in data_cfgs:
        for model_cfg in model_cfgs:
            print('\nExporting predictions for {}'.format(model_cfg))
            reset_config()
            config.add('util/export')
            if model_cfg is not None:
                config.add(model_cfg)
            if data_cfg is not None:
                config.add(data_cfg)
            out_path = config.get_path('output')
            if os.path.exists(out_path):
                # A 'model' symlink marks an already finalized run.
                final_path = os.path.join(out_path, 'model')
                if (not config.force and os.path.exists(final_path)
                        and os.path.islink(final_path)):
                    print(f'\n{final_path} already exists. skipping')
                    continue
                summarize(out_path)
            else:
                print(out_path, 'does not exist')
            print('')
def main():
    # Sweep the cross product of data and model configs; a fixed
    # data/model name disables the corresponding sweep dimension.
    data_cfgs = config.data.cfgs if config.data.name is None else [None]
    model_cfgs = config.model.cfgs if config.model.name is None else [None]
    for data_cfg in data_cfgs:
        print('')
        for model_cfg in model_cfgs:
            reset_config()
            config.add('util/export', silent=True)
            if model_cfg is not None:
                config.add(model_cfg, silent=True)
            if data_cfg is not None:
                config.add(data_cfg, silent=True)
            out_path = config.get_path('output')
            stats_path = os.path.join(out_path, 'model', 'stats.json')
            if not os.path.exists(stats_path):
                print(f'{stats_path} does not exist. skipping')
                continue
            with open(stats_path) as f:
                stats = json.load(f)
            dev = round(stats["dev_score"], 3)
            test = round(stats["test_score"], 3)
            # str() guards against data_cfg being None, which would
            # break the '<20' format spec.
            print(f'{str(data_cfg):<20}\t{model_cfg}\t'
                  f'{stats["best_checkpoint"]}\t{dev}\t{test}')
def summarize(output_path):
    suffix = '-{}.tsv'.format(config.summary.type)
    dev_outputs = sorted(
        f for f in os.listdir(output_path) if f.endswith(suffix))
    if config.summary.method == "count":
        last = dev_outputs[-1] if len(dev_outputs) > 0 else 'does not exist'
        print(last)
        return
    best_filename = None
    scores = []
    max_score = 0
    for filename in dev_outputs:
        filepath = os.path.join(output_path, filename)
        score = score_file(filepath)
        # 'marker' avoids shadowing the filename suffix above.
        marker = 'BEST' if score >= max_score else ''
        print('{}: {:2.4f} {}'.format(filename, score, marker))
        if score >= max_score:
            max_score = score
            best_filename = filename
        scores.append((filename, score))
    if best_filename is not None:
        best_filepath = os.path.join(output_path, best_filename)
        print('\nBEST: {} [{}]\n'.format(best_filepath, max_score))
        if config.summary.method != "groupacc":
            reportcat(best_filepath)
        labels = sorted(label_set(best_filepath))
        # Dev and test exports share the 'NNN-' checkpoint prefix, so the
        # test score comes from the matching '-test' file.
        test_score = score_file(
            os.path.join(output_path, best_filename.replace('-dev', '-test')))
        stats = {
            'best_checkpoint': int(best_filename[:3]),
            'dev_score': max_score,
            'test_score': test_score,
            'dev_scores': scores
        }
        finalize_checkpoint(output_path, best_filename[:3], stats, labels)
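# NOTE: finalize_checkpoint() is called above but not defined in this file.
# A minimal sketch of a compatible implementation, assuming it pins the
# winning checkpoint behind the '<output>/model' symlink and writes the
# stats.json that the summary drivers above read; the labels.txt file name
# is hypothetical.
import json
import os


def finalize_checkpoint(output_path, ckpt_id, stats, labels):
    model_dir = os.path.join(output_path, 'model')
    if os.path.islink(model_dir):
        os.remove(model_dir)
    # Relative symlink keeps the output directory relocatable.
    os.symlink('checkpoint-{}'.format(ckpt_id), model_dir)
    with open(os.path.join(model_dir, 'stats.json'), 'w') as f:
        json.dump(stats, f, indent=2)
    with open(os.path.join(model_dir, 'labels.txt'), 'w') as f:
        f.write('\n'.join(labels) + '\n')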
def export():
    out_path = config.get_path('output')
    if not os.path.exists(out_path):
        print('output dir does not exist. skipping')
        return
    checkpoints = sorted(
        int(d[-3:]) for d in os.listdir(out_path)
        if d.startswith('checkpoint-'))
    for ckpt in checkpoints:
        test_path = os.path.join(out_path, str(ckpt).zfill(3) + '-test.tsv')
        if os.path.exists(test_path):
            print('skipping, export already exists')
            continue
        config.add('model.checkpoint', ckpt)
        run()
def load_model():
    model_path = config.model.name
    state = None
    if config.model.checkpoint is not False:
        checkpoint_path = os.path.join(config.get_path('output'),
                                       'checkpoint-*')
        checkpoints = sorted(glob(checkpoint_path))
        if len(checkpoints) > 0:
            # A negative checkpoint index means "latest"; otherwise load
            # the requested checkpoint number.
            n = config.model.checkpoint
            model_path = (checkpoints[-1] if n < 0 else
                          checkpoint_path.replace('*', str(n).zfill(3)))
            print('Loading checkpoint from "{}"'.format(model_path))
            state_path = os.path.join(model_path, "state.pt")
            if os.path.exists(state_path):
                try:
                    state = torch.load(state_path,
                                       map_location=config.model.device)
                except Exception:
                    print('WARNING: could not load state dict')
            elif config.model.do_train:
                raise Exception(
                    'attempting to resume training from {}, '
                    'but state.pt is missing'.format(model_path))
    if config.model.type == 'roberta':
        clf = (RobertaForTokenClassification if config.data.token_level
               else RobertaForSequenceClassification)
    else:
        clf = (BertForTokenClassification if config.data.token_level
               else BertForSequenceClassification)
    model = clf.from_pretrained(
        model_path,
        num_labels=config.data.num_labels,
        attention_probs_dropout_prob=config.train.attention_dropout,
        hidden_dropout_prob=config.train.hidden_dropout)
    model.to(config.model.device)
    return model, state
def save_checkpoint(model, optimizer, scheduler, epoch, global_step):
    out_path = config.get_path('output')
    checkpoint_dir = os.path.join(out_path,
                                  'checkpoint-{}'.format(str(epoch).zfill(3)))
    os.makedirs(checkpoint_dir, exist_ok=True)
    model.save_pretrained(checkpoint_dir)
    torch.save(
        {
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
            'epoch': epoch,
            'global_step': global_step,
            'random_state': torch.random.get_rng_state()
        }, os.path.join(checkpoint_dir, "state.pt"))
    # Delete the previous epoch's training state; only the latest one is
    # needed for resuming. Guard against it already being gone (e.g. on a
    # resumed run).
    if epoch > 0:
        prev_state_path = os.path.join(
            out_path, 'checkpoint-{}'.format(str(epoch - 1).zfill(3)),
            "state.pt")
        if os.path.exists(prev_state_path):
            os.remove(prev_state_path)
def main():
    config.show()
    if config.model.name is None:
        print('provide config model name')
        exit(1)
    if config.data.input is None:
        print('provide task data')
        exit(1)
    print('Loading tokenizer "{}"'.format(config.model.name))
    Tokenizer = (RobertaTokenizer if config.model.type == 'roberta'
                 else BertTokenizer)
    tokenizer = Tokenizer.from_pretrained(config.model.name,
                                          do_lower_case=False)
    cache_dir = config.get_path('cache')
    train_dataset, label_map = load_data(config.data.input,
                                         'train.tsv',
                                         tokenizer,
                                         cfg=config.data,
                                         cache_dir=cache_dir)
    print('Train data: {} examples, {} labels: {}'.format(
        len(train_dataset), len(label_map), list(label_map.keys())))
    dev_dataset = None
    if config.data.dev:
        dev_dataset, _ = load_data(config.data.input,
                                   'dev.tsv',
                                   tokenizer,
                                   label_map,
                                   cfg=config.data,
                                   cache_dir=cache_dir)
        print('Dev data: {} examples'.format(len(dev_dataset)))
    print('Loading model "{}"'.format(config.model.name))
    model, state = load_model()
    if config.model.do_train:
        print('Start training')
        train(model, train_dataset, dev_dataset, state)
    if config.model.do_export:
        test_dataset, _ = load_data(config.data.input,
                                    'test.tsv',
                                    tokenizer,
                                    label_map,
                                    cfg=config.data,
                                    cache_dir=cache_dir)
        # print('\nExporting train:')
        # export(model, train_dataset, label_map, 'train.tsv')
        if dev_dataset is not None:
            print('Exporting dev')
            export(model, dev_dataset, label_map, 'dev.tsv')
        print('Exporting test')
        export(model, test_dataset, label_map, 'test.tsv')
    print('\nDone!')
def export(model, dataset, label_map, filename):
    dataloader = DataLoader(dataset,
                            sampler=SequentialSampler(dataset),
                            batch_size=config.eval.batch_size)
    result = evaluate(model,
                      dataloader,
                      return_acc=False,
                      return_labels=True,
                      return_probs=config.summary.probs)
    groups = None
    if config.summary.groups:
        # add_labels=2 yields (token, group) pairs per example.
        grouped_sents, _ = read_examples(os.path.join(
            config.data.input, filename), add_labels=2)
        sents = [[s[0] for s in ex] for ex in grouped_sents]
        groups = [[s[1] for s in ex] for ex in grouped_sents]
    else:
        sents = read_examples(os.path.join(config.data.input, filename),
                              add_labels=False)
    label_names = sorted(label_map, key=label_map.get)
    labels_true = result['labels_true']
    labels_pred = result['labels_pred']
    sent_ids = result['sent_ids']
    pred_probs = (result['pred_probs'] if config.summary.probs
                  else [None] * len(sent_ids))
    # The 'is not False' check guards against False >= 0 evaluating True.
    if config.model.checkpoint is not False and config.model.checkpoint >= 0:
        filename = str(config.model.checkpoint).zfill(3) + '-' + filename
    out_path = os.path.join(config.get_path('output'), filename)
    with open(out_path, 'w') as f:
        prev_sent_id = 0
        token_id = 0
        for label_true, label_pred, sent_id, pred_prob in zip(
                labels_true, labels_pred, sent_ids, pred_probs):
            if sent_id != prev_sent_id:
                # A blank line separates sentences in token-level exports.
                if config.data.token_level:
                    f.write('\n')
                prev_sent_id = sent_id
                token_id = 0
            true, pred = label_names[label_true], label_names[label_pred]
            if token_id >= len(sents[sent_id]):
                # Predictions past the end of the source sentence (e.g.
                # from truncation during encoding) are skipped.
                print('skipping sent={} token={} true={} pred={}'.format(
                    sent_id, token_id, true, pred))
                continue
            token = (sents[sent_id][token_id] if groups is None
                     else groups[sent_id][token_id])
            out = [token, true, pred]
            if config.summary.probs:
                out.append(str(pred_prob.item()))
            f.write('\t'.join(out) + '\n')
            token_id += 1
    print('Predictions are exported to {}'.format(out_path))
def train(model, train_dataset, dev_dataset=None, state=None):
    writer = SummaryWriter(config.get_path('logs'))
    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=config.train.batch_size)
    if dev_dataset is not None:
        dev_dataloader = DataLoader(dev_dataset,
                                    sampler=SequentialSampler(dev_dataset),
                                    batch_size=config.eval.batch_size)
    optimizer, scheduler = prepare_optimizer(model, train_dataloader)
    torch.random.manual_seed(config.train.seed)
    model.zero_grad()

    # Step intervals below 1 are interpreted as a fraction of an epoch
    # (in optimizer steps, i.e. after gradient accumulation).
    gradient_steps = config.train.gradient_accumulation_steps
    logging_steps = config.train.logging_steps
    if logging_steps < 1:
        logging_steps = int(
            len(train_dataloader) // gradient_steps * logging_steps)
        if logging_steps == 0:
            logging_steps = 1
    eval_steps = config.train.eval_steps
    if eval_steps < 1:
        eval_steps = int(len(train_dataloader) // gradient_steps * eval_steps)
    print('Global step intervals: Logging={} Eval={}'.format(
        logging_steps, eval_steps))

    current_epoch = 0
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0

    # Restore previous checkpoint
    if state is not None:
        torch.random.set_rng_state(state['random_state'].cpu())
        optimizer.load_state_dict(state['optimizer'])
        scheduler.load_state_dict(state['scheduler'])
        current_epoch = state['epoch'] + 1
        global_step = state['global_step']
        print('Starting at epoch {}'.format(current_epoch))

    for epoch in range(current_epoch, config.train.max_epochs):
        print(' > Start epoch {}/{}'.format(epoch, config.train.max_epochs))
        n_correct, n_total = 0, 0
        for step, batch in enumerate(
                tqdm(train_dataloader,
                     desc="Batch",
                     disable=not config.verbose)):
            model.train()
            inputs, true_labels, label_mask = prepare_batch(batch)
            outputs = model(**inputs)
            loss, out = outputs[:2]
            loss.backward()
            tr_loss += loss.item()

            # Track running accuracy; the label mask excludes padding and
            # special tokens in token-level tasks.
            pred_labels = out.argmax(-1)
            pred_labels = pred_labels.reshape(*true_labels.shape)
            n_correct += (label_mask *
                          (pred_labels == true_labels)).sum().item()
            n_total += (label_mask.sum().item() if config.data.token_level
                        else true_labels.shape[0])
            assert n_correct <= n_total

            if (step + 1) % gradient_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               config.train.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if global_step % logging_steps == 0:
                    lr = scheduler.get_lr()[0]
                    loss = (tr_loss - logging_loss) / logging_steps
                    acc = n_correct / n_total
                    tqdm.write(
                        'Epoch={} Step={} lr={:.9f} loss={:.3f} acc={:.3f}'
                        .format(epoch, global_step, lr, loss, acc))
                    writer.add_scalar("Learning Rate", lr, global_step)
                    writer.add_scalar("Loss/Train", loss, global_step)
                    writer.add_scalar("Accuracy/Train", acc, global_step)
                    logging_loss = tr_loss

        # Save checkpoint
        save_checkpoint(model, optimizer, scheduler, epoch, global_step)

        # Evaluation
        if dev_dataset is not None:
            eval_result = evaluate(model, dev_dataloader)
            writer.add_scalar("Loss/Eval", eval_result['loss'], epoch)
            writer.add_scalar("Accuracy/Eval", eval_result['acc'], epoch)
            tqdm.write('Evaluation: Epoch={} loss={:.3f} acc={:.3f}'.format(
                epoch, eval_result['loss'], eval_result['acc']))
    writer.close()
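# NOTE: prepare_optimizer() is used by train() above but is not shown in
# this file. A minimal sketch of a compatible implementation, assuming
# AdamW with a linear warmup schedule from `transformers`; the config keys
# train.learning_rate and train.warmup_steps are hypothetical.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup


def prepare_optimizer(model, train_dataloader):
    # One optimizer step per accumulated batch, over all epochs.
    total_steps = (len(train_dataloader) //
                   config.train.gradient_accumulation_steps *
                   config.train.max_epochs)
    optimizer = AdamW(model.parameters(), lr=config.train.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.train.warmup_steps,
        num_training_steps=total_steps)
    return optimizer, scheduler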
import sys

import numpy as np
import pandas as pd

sys.path.append("./")
sys.path.append("../")
import utils.config as cfg

path_to_lookup = cfg.get_path("data/raw/IDs_mapping.csv")
path_to_raw = cfg.get_path_to_raw_data()
path_to_processed = cfg.get_path_to_processed_data()


def process_raw_data(rawdata_path, output_path):
    df = pd.read_csv(rawdata_path, sep=",")
    processed = process_dataframe(df)
    processed.to_csv(output_path, index=False, header=True)


def process_dataframe(df):
    # Only binarize the target if it is present
    if "readmitted" in df:
        df["readmitted"] = np.where(df["readmitted"] == "NO", 0, 1)
    # Get the encoding lookup data
    lookup = pd.read_csv(path_to_lookup, sep=",")
    d = lookup.set_index('admission_type_id')['description'].to_dict()
    # Before we apply the lookup we need the codes to be strings
    df['admission_type_id'] = df['admission_type_id'].astype(str)
    # Apply the lookup and return (reconstructed: the original function
    # ended without these steps; string keys are assumed to match the
    # column dtype after the cast above).
    df['admission_type_id'] = df['admission_type_id'].map(
        {str(k): v for k, v in d.items()})
    return df
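# Hypothetical entry point (not part of the original module): run the
# pipeline end to end using the paths resolved above via utils.config.
if __name__ == "__main__":
    process_raw_data(path_to_raw, path_to_processed)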