Example #1
0
def main():
    """Summarize exported predictions for every (data, model) config pair.

    When an explicit data/model name is already set, only that single
    (default) configuration is processed, signalled by a [None] entry.
    """
    data_configs = [None] if config.data.name is not None else config.data.cfgs
    model_configs = [None] if config.model.name is not None else config.model.cfgs

    for dc in data_configs:
        for mc in model_configs:
            print('\nExporting predictions for {}'.format(mc))
            # Start from a clean slate and layer the export defaults plus
            # whichever per-run configs are selected for this combination.
            reset_config()
            config.add('util/export')
            for extra in (mc, dc):
                if extra is not None:
                    config.add(extra)

            out_path = config.get_path('output')
            if not os.path.exists(out_path):
                print(out_path, 'does not exist')
                print('')
                continue

            # A finalized run leaves a 'model' symlink behind; unless
            # forced, do not re-summarize it.
            final_path = os.path.join(out_path, 'model')
            already_done = (not config.force and os.path.exists(final_path)
                            and os.path.islink(final_path))
            if already_done:
                print(f'\n{final_path} already exists. skipping')
                continue

            summarize(out_path)
            print('')
Example #2
0
def main():
    """Print one TSV summary row (data cfg, model cfg, best checkpoint,
    dev score, test score) per (data, model) config combination, reading
    each run's exported model/stats.json.
    """
    data_cfgs = config.data.cfgs if config.data.name is None else [None]
    model_cfgs = config.model.cfgs if config.model.name is None else [None]

    for data_cfg in data_cfgs:
        print('')
        for model_cfg in model_cfgs:
            reset_config()
            config.add('util/export', silent=True)
            if model_cfg is not None:
                config.add(model_cfg, silent=True)
            if data_cfg is not None:
                config.add(data_cfg, silent=True)

            out_path = config.get_path('output')
            stats_path = os.path.join(out_path, 'model', 'stats.json')
            if not os.path.exists(stats_path):
                print(f'{stats_path} does not exist. skipping')
                continue

            with open(stats_path) as f:
                stats = json.load(f)
            dev = round(stats["dev_score"], 3)
            test = round(stats["test_score"], 3)
            # BUG FIX: data_cfg is None whenever config.data.name is set,
            # and None does not support the '<20' format spec (TypeError).
            # Format via str() so the row prints instead of crashing.
            print(f'{str(data_cfg):<20}\t{model_cfg}\t{stats["best_checkpoint"]}\t{dev}\t{test}')
Example #3
0
def summarize(output_path):
    """Score every exported dev prediction file under *output_path*,
    report the best checkpoint and finalize it (stats + label set).

    With config.summary.method == "count", only the newest dev file name
    is printed (or 'does not exist' when there are none).
    """
    suffix = '-{}.tsv'.format(config.summary.type)
    # CONSISTENCY FIX: list files from the output_path argument instead of
    # re-deriving the directory from config — the two can diverge if a
    # caller passes a different path, and every other access in this
    # function already uses output_path.
    dev_outputs = sorted(
        f for f in os.listdir(output_path) if f.endswith(suffix))

    if config.summary.method == "count":
        last = dev_outputs[-1] if len(dev_outputs) > 0 else 'does not exist'
        print(last)
        return

    best_filename = None

    scores = []
    max_score = 0
    for filename in dev_outputs:
        filepath = os.path.join(output_path, filename)
        score = score_file(filepath)

        # '>=' so that on ties the later (higher-numbered) checkpoint wins.
        # (Renamed from 'suffix', which shadowed the file-suffix above.)
        marker = 'BEST' if score >= max_score else ''
        print('{}: {:2.4f} {}'.format(filename, score, marker))

        if score >= max_score:
            max_score = score
            best_filename = filename
        scores.append((filename, score))

    if best_filename is not None:
        best_filepath = os.path.join(output_path, best_filename)
        print('\nBEST: {} [{}]\n'.format(best_filepath, max_score))
        if config.summary.method != "groupacc":
            reportcat(best_filepath)

        labels = sorted(label_set(best_filepath))
        # The matching test export differs only in the '-dev'/'-test' part.
        test_score = score_file(
            os.path.join(output_path, best_filename.replace('-dev', '-test')))
        stats = {
            # File names start with the zero-padded 3-digit checkpoint id.
            'best_checkpoint': int(best_filename[:3]),
            'dev_score': max_score,
            'test_score': test_score,
            'dev_scores': scores
        }
        finalize_checkpoint(output_path, best_filename[:3], stats, labels)
Example #4
0
def export():
    """Run the export pipeline once per checkpoint found in the output
    directory, skipping checkpoints whose test predictions already exist.
    """
    out_path = config.get_path('output')

    if not os.path.exists(out_path):
        print('output dir does not exist. skipping')
        return

    # Collect numeric checkpoint ids from 'checkpoint-NNN' directories.
    checkpoint_ids = []
    for entry in os.listdir(out_path):
        if entry.startswith('checkpoint-'):
            checkpoint_ids.append(int(entry[-3:]))
    checkpoint_ids.sort()

    for ckpt in checkpoint_ids:
        test_path = os.path.join(out_path, str(ckpt).zfill(3) + '-test.tsv')
        if os.path.exists(test_path):
            print('skipping, export already exists')
            continue

        config.add('model.checkpoint', ckpt)
        run()
Example #5
0
def load_model():
    """Load the classifier, optionally from the newest/requested checkpoint.

    Returns a (model, state) tuple; state is the training-state dict
    restored from <checkpoint>/state.pt, or None when unavailable.
    """
    model_path = config.model.name
    state = None

    # checkpoint is False when only the base pretrained weights are wanted.
    if config.model.checkpoint is not False:
        checkpoint_path = os.path.join(config.get_path('output'),
                                       'checkpoint-*')
        checkpoints = sorted(glob(checkpoint_path))
        if len(checkpoints) > 0:
            n = config.model.checkpoint
            # n < 0 selects the newest checkpoint; lexicographic sort is
            # sufficient because epoch numbers are zero-padded to 3 digits.
            model_path = checkpoints[-1] if n < 0 else checkpoint_path.replace(
                '*',
                str(n).zfill(3))
            print('Loading checkpoint from "{}"'.format(model_path))

            state_path = os.path.join(model_path, "state.pt")
            if os.path.exists(state_path):
                try:
                    state = torch.load(state_path,
                                       map_location=config.model.device)
                except Exception:
                    # Best effort: a corrupt state file must not block
                    # loading the model weights themselves.
                    print('WARNING: could not load state dict')
            elif config.model.do_train:
                # Resuming training without optimizer/scheduler state would
                # silently restart the schedule — fail loudly instead.
                raise Exception(
                    'attempting to resume training from {}, but state.pt is missing'
                    .format(model_path))

    # Pick the architecture/head combination from config.
    if config.model.type == 'roberta':
        clf = RobertaForTokenClassification if config.data.token_level else RobertaForSequenceClassification
    else:
        clf = BertForTokenClassification if config.data.token_level else BertForSequenceClassification

    model = clf.from_pretrained(
        model_path,
        num_labels=config.data.num_labels,
        attention_probs_dropout_prob=config.train.attention_dropout,
        hidden_dropout_prob=config.train.hidden_dropout)
    model.to(config.model.device)
    return model, state
Example #6
0
def save_checkpoint(model, optimizer, scheduler, epoch, global_step):
    """Save model weights plus training state for *epoch*, then drop the
    previous epoch's (now redundant) state file.

    The state.pt dict (optimizer/scheduler/epoch/global_step/RNG state) is
    what load_model() restores to resume training deterministically.
    """
    out_path = config.get_path('output')
    checkpoint_dir = os.path.join(out_path,
                                  'checkpoint-{}'.format(str(epoch).zfill(3)))
    os.makedirs(checkpoint_dir, exist_ok=True)

    model.save_pretrained(checkpoint_dir)
    torch.save(
        {
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
            'epoch': epoch,
            'global_step': global_step,
            # RNG state is saved so resumed runs reproduce the data order.
            'random_state': torch.random.get_rng_state()
        }, os.path.join(checkpoint_dir, "state.pt"))

    # Delete the previous epoch's state — only the newest is needed to
    # resume. ROBUSTNESS FIX: the file may legitimately be missing (e.g.
    # training resumed past an epoch whose state was already cleaned up),
    # so guard instead of letting os.remove raise FileNotFoundError.
    if epoch > 0:
        prev_state_path = os.path.join(
            out_path, 'checkpoint-{}'.format(str(epoch - 1).zfill(3)),
            "state.pt")
        if os.path.exists(prev_state_path):
            os.remove(prev_state_path)
Example #7
0
def main():
    """End-to-end entry point: load tokenizer and data, optionally train
    the model, and optionally export dev/test predictions.
    """
    config.show()

    # Both a model name and an input path are required up front.
    if config.model.name is None:
        print('provide config model name')
        exit(0)
    if config.data.input is None:
        print('provide task data')
        exit(0)

    print('Loading tokenizer "{}"'.format(config.model.name))
    tokenizer_cls = RobertaTokenizer if config.model.type == 'roberta' else BertTokenizer
    tokenizer = tokenizer_cls.from_pretrained(config.model.name,
                                              do_lower_case=False)

    cache_dir = config.get_path('cache')

    # The training split defines the label map reused by dev/test.
    train_dataset, label_map = load_data(config.data.input,
                                         'train.tsv',
                                         tokenizer,
                                         cfg=config.data,
                                         cache_dir=cache_dir)
    print('Train data: {} examples, {} labels: {}'.format(
        len(train_dataset), len(label_map), list(label_map.keys())))

    dev_dataset = None
    if config.data.dev:
        dev_dataset, _ = load_data(config.data.input,
                                   'dev.tsv',
                                   tokenizer,
                                   label_map,
                                   cfg=config.data,
                                   cache_dir=cache_dir)
        print('Dev data: {} examples'.format(len(dev_dataset)))

    print('Loading model "{}"'.format(config.model.name))
    model, state = load_model()

    if config.model.do_train:
        print('Start training')
        train(model, train_dataset, dev_dataset, state)

    if config.model.do_export:
        test_dataset, _ = load_data(config.data.input,
                                    'test.tsv',
                                    tokenizer,
                                    label_map,
                                    cfg=config.data,
                                    cache_dir=cache_dir)

        if dev_dataset is not None:
            print('Exporting dev')
            export(model, dev_dataset, label_map, 'dev.tsv')

        print('Exporting test')
        export(model, test_dataset, label_map, 'test.tsv')

    print('\nDone!')
Example #8
0
def export(model, dataset, label_map, filename):
    """Run the model over *dataset* and write per-token predictions to a
    TSV file (token, true label, predicted label[, probability]) in the
    output directory, named after the checkpoint and input split.
    """
    dataloader = DataLoader(dataset,
                            sampler=SequentialSampler(dataset),
                            batch_size=config.eval.batch_size)
    result = evaluate(model,
                      dataloader,
                      return_acc=False,
                      return_labels=True,
                      return_probs=config.summary.probs)

    groups = None
    if config.summary.groups:
        # Grouped mode: each example item is a (token, group) pair; the
        # group value is written in place of the token below.
        grouped_sents, _ = read_examples(os.path.join(config.data.input,
                                                      filename),
                                         add_labels=2)
        sents = [[s[0] for s in ex] for ex in grouped_sents]
        groups = [[s[1] for s in ex] for ex in grouped_sents]
    else:
        sents = read_examples(os.path.join(config.data.input, filename),
                              add_labels=False)

    # Invert the label map: position i holds the name of label index i.
    label_names = sorted(label_map, key=label_map.get)

    labels_true = result['labels_true']
    labels_pred = result['labels_pred']
    sent_ids = result['sent_ids']
    # Pad with None so the zip below lines up when probs are disabled.
    pred_probs = result['pred_probs'] if config.summary.probs else [
        None
    ] * len(sent_ids)

    # Prefix with the zero-padded checkpoint id, e.g. '007-dev.tsv'.
    if config.model.checkpoint >= 0:
        filename = str(config.model.checkpoint).zfill(3) + '-' + filename

    out_path = os.path.join(config.get_path('output'), filename)
    with open(out_path, 'w') as f:
        prev_sent_id = 0
        token_id = 0
        for label_true, label_pred, sent_id, pred_prob in zip(
                labels_true, labels_pred, sent_ids, pred_probs):
            # Sentence boundary: blank separator line (token level only)
            # and reset of the running token index.
            if sent_id != prev_sent_id:
                if config.data.token_level:
                    f.write('\n')
                prev_sent_id = sent_id
                token_id = 0

            true, pred = label_names[label_true], label_names[label_pred]
            # More predictions than source tokens for this sentence
            # (presumably sub-word/truncation overflow — TODO confirm):
            # report and drop the extras.
            if token_id >= len(sents[sent_id]):
                print('skipping sent={} token={} true={} pred={}'.format(
                    sent_id, token_id, true, pred))
                continue

            token = sents[sent_id][token_id] if groups is None else groups[
                sent_id][token_id]
            out = [token, true, pred]
            if config.summary.probs:
                out.append(str(pred_prob.item()))
            f.write('\t'.join(out) + '\n')

            token_id += 1

    print('Predictions are exported to {}'.format(out_path))
Example #9
0
def train(model, train_dataset, dev_dataset=None, state=None):
    """Run the training loop: gradient accumulation, TensorBoard logging,
    per-epoch checkpointing and (optional) per-epoch dev evaluation.

    state: checkpoint dict produced by save_checkpoint(); when given,
    training resumes from the stored epoch/step with restored optimizer,
    scheduler and RNG state.
    """
    writer = SummaryWriter(config.get_path('logs'))

    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=config.train.batch_size)

    if dev_dataset is not None:
        dev_dataloader = DataLoader(dev_dataset,
                                    sampler=SequentialSampler(dev_dataset),
                                    batch_size=config.eval.batch_size)

    optimizer, scheduler = prepare_optimizer(model, train_dataloader)
    torch.random.manual_seed(config.train.seed)

    model.zero_grad()

    # Get step intervals; configured values < 1 are interpreted as a
    # fraction of the optimizer steps per epoch.
    gradient_steps = config.train.gradient_accumulation_steps
    logging_steps = config.train.logging_steps
    if logging_steps < 1:
        logging_steps = int(
            len(train_dataloader) // gradient_steps * logging_steps)
    if logging_steps == 0:
        logging_steps = 1
    eval_steps = config.train.eval_steps
    if eval_steps < 1:
        eval_steps = int(len(train_dataloader) // gradient_steps * eval_steps)
    # NOTE(review): eval_steps is computed and printed but never used below;
    # evaluation runs once per epoch. Confirm whether step-based evaluation
    # was intended.

    print('Global step intervals: Logging={} Eval={}'.format(
        logging_steps, eval_steps))

    current_epoch = 0
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0

    # Restore previous checkpoint so training continues where it left off.
    if state is not None:
        torch.random.set_rng_state(state['random_state'].cpu())
        optimizer.load_state_dict(state['optimizer'])
        scheduler.load_state_dict(state['scheduler'])
        current_epoch = state['epoch'] + 1
        global_step = state['global_step']

    print('Starting at epoch {}'.format(current_epoch))

    for epoch in range(current_epoch, config.train.max_epochs):
        print(' > Start epoch {}/{}'.format(epoch, config.train.max_epochs))

        # Running training-accuracy counters, reset each epoch.
        n_correct, n_total = 0, 0

        for step, batch in enumerate(
                tqdm(train_dataloader,
                     desc="Batch",
                     disable=not config.verbose)):
            model.train()

            inputs, true_labels, label_mask = prepare_batch(batch)

            outputs = model(**inputs)
            loss, out = outputs[:2]
            loss.backward()

            tr_loss += loss.item()

            pred_labels = out.argmax(-1)
            pred_labels = pred_labels.reshape(*true_labels.shape)

            # Token level: only unmasked (real) tokens count towards
            # accuracy; sequence level: every example counts.
            n_correct += (label_mask *
                          (pred_labels == true_labels)).sum().item()
            n_total += label_mask.sum().item(
            ) if config.data.token_level else true_labels.shape[0]

            assert n_correct <= n_total

            # Optimizer step only every gradient_steps batches
            # (gradient accumulation).
            if (step + 1) % gradient_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               config.train.max_grad_norm)

                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if global_step % logging_steps == 0:
                    lr = scheduler.get_lr()[0]
                    # Mean loss since the previous logging point.
                    loss = (tr_loss - logging_loss) / logging_steps
                    acc = n_correct / n_total

                    tqdm.write(
                        'Epoch={} Step={} lr={:.9f} loss={:.3f} acc={:.3f}'.
                        format(epoch, global_step, lr, loss, acc))

                    writer.add_scalar("Learning Rate", lr, global_step)
                    writer.add_scalar("Loss/Train", loss, global_step)
                    writer.add_scalar("Accuracy/Train", acc, global_step)
                    logging_loss = tr_loss

        # Save checkpoint
        save_checkpoint(model, optimizer, scheduler, epoch, global_step)

        # Evaluation
        if dev_dataset is not None:
            eval_result = evaluate(model, dev_dataloader)
            writer.add_scalar("Loss/Eval", eval_result['loss'], epoch)
            writer.add_scalar("Accuracy/Eval", eval_result['acc'], epoch)
            tqdm.write('Evaluation: Epoch={} loss={:.3f} acc={:.3f}'.format(
                epoch, eval_result['loss'], eval_result['acc']))

    writer.close()
Example #10
0
import numpy as np
import pandas as pd
import sys

sys.path.append("./")
sys.path.append("../")
import utils.config as cfg

# Lookup CSV mapping admission-type IDs to descriptions, used to build the
# encoding dictionary during dataframe processing.
path_to_lookup = cfg.get_path("data/raw/IDs_mapping.csv")

# Raw and processed data locations resolved via the project config helper.
path_to_raw = cfg.get_path_to_raw_data()
path_to_processed = cfg.get_path_to_processed_data()


def process_raw_data(rawdata_path, output_path):
    """Read the raw CSV, run it through process_dataframe, and write the
    processed result as CSV (header kept, no index column)."""
    raw_frame = pd.read_csv(rawdata_path, sep=",")
    cleaned = process_dataframe(raw_frame)
    cleaned.to_csv(output_path, index=False, header=True)


def process_dataframe(df):
    # Only shape the target if it is present
    if "readmitted" in df:
        df["readmitted"] = np.where(df["readmitted"] == "NO", 0, 1)

    # Get the encoding lookup data
    lookup = pd.read_csv(path_to_lookup, sep=",")
    d = lookup.set_index('admission_type_id')['description'].to_dict()

    # Before we apply it we need the codes to be strings
    df['admission_type_id'] = np.vectorize(str)(df['admission_type_id'])