Example #1
def load_model_components(model_name,
                          hierarchy,
                          run_type='training',
                          device='cpu',
                          model_file=None,
                          optimizer_file=None,
                          counts_file=None,
                          cluster=False,
                          sentences_per_checkpoint=p.sentences_per_checkpoint):
    counts = None if counts_file is None else read_pickle(counts_file)
    batcher = model_components[model_name]['batcher_class'](hierarchy, counts,
                                                            run_type)
    model = model_components[model_name]['model_class'](
        device, batcher, cluster, sentences_per_checkpoint)
    if model_file is not None:
        model.load_state_dict(torch.load(model_file, map_location='cpu'))
    model.correct_devices()
    postprocessor = model_components[model_name]['postprocessor'](batcher,
                                                                  run_type)
    if run_type == 'training':
        model.train()
        optimizer = model_components[model_name]['optimizer_class'](list(
            model.parameters()))
        if optimizer_file is not None:
            optimizer.load_state_dict(torch.load(optimizer_file))
        return batcher, model, postprocessor, optimizer
    else:
        model.eval()
        return batcher, model, postprocessor
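A minimal usage sketch (not from the source; the model name and paths are placeholders): for evaluation, the components could be loaded roughly like this, mirroring the call in Example #2.

# Hypothetical invocation of load_model_components for evaluation.
hierarchy = Hierarchy.from_dict(read_pickle('checkpoint/hierarchy.pkl'))
batcher, model, postprocessor = load_model_components(
    'clinical_model', hierarchy,
    run_type='testing',
    device='cuda:0',
    model_file='checkpoint/model_state.tpkl')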
Example #2
def main(model_type, val_file, checkpoint_folder, hierarchy, supervised=False,
         device='cuda:0', batch_size=p.batch_size, limit_rows_val=p.limit_rows_val,
         subbatches=p.subbatches, num_workers=p.num_workers, email_sender=None,
         results_folder=None, noload=False):
    if checkpoint_folder is None or noload:
        seed_state()
    else:
        set_random_state(read_pickle(os.path.join(checkpoint_folder, 'random_state.pkl')))
    logger.set_verbosity(2)
    val_dataset = init_dataset(val_file, limit_rows=limit_rows_val)
    val_indices_iterator = init_indices_iterator(len(val_dataset), batch_size)
    model_file = os.path.join(checkpoint_folder, 'model_state.tpkl') if not noload else None
    batcher, model, postprocessor = load_model_components(model_type, hierarchy, run_type='testing', device=device, model_file=model_file, cluster=supervised)
    val_iterator = batcher.batch_iterator(val_dataset, val_indices_iterator, subbatches=subbatches, num_workers=num_workers)
    if torch.distributed.is_initialized():
        model = LDDP(model, torch.distributed.get_world_size())
    tester = Tester(model, postprocessor, val_iterator)
#    tester = Tester(model, postprocessor, val_iterator, tensorboard_dir=os.path.join(load_checkpoint_folder, 'tensorboard/test'))
    if results_folder is None:
        results_folder = os.path.join(checkpoint_folder, 'results')
    os.mkdir(results_folder)
    postprocessor.add_output_dir(results_folder)
    total_output_batch = tester.test()
    with open(os.path.join(results_folder, 'scores.txt'), 'w') as f:
        f.write(str(total_output_batch))
    #total_output_batch.write_results()
    #write_pickle(postprocessor.summary_stats, os.path.join(checkpoint_folder, 'summary_stats.pkl'))
    if email_sender is not None:
        def onerror(e):
            if check_attachment_error(e):
                logger.log("Trying to send without attachment")
                email_sender.send_email(str(total_output_batch))
            else:
                default_onerror(e)
        attachments = postprocessor.get_summary_attachment_generator()
        email_sender.send_email("Testing is done!\n\n"+str(total_output_batch), attachments=attachments, onerror=onerror)
Example #3
def instances_to_data(instances_dir, output_data_dir, limit_to='annotations'):
    subprocess.run(["mkdir", output_data_dir])
    output_data_file = join(output_data_dir, 'annotations.data')
    output_hierarchy_file = join(output_data_dir, 'hierarchy.pkl')
    if limit_to == 'all':
        raise NotImplementedError
    elif limit_to == 'annotations':
        subdirectories = next(iter(walk(instances_dir)))[1]
        instances = {}
        global_info = None
        old_to_new = {}
        # add annotations to instances
        for dir in subdirectories:
            global_info_file = join(instances_dir, dir, 'global_info.pkl')
            if not exists(global_info_file): continue
            global_info, old_to_new = merge(global_info,
                                            read_pickle(global_info_file))
            if not dir.endswith('_annotations'): continue
            annotations = next(iter(walk(join(instances_dir, dir))))[2]
            annotations = set(annotations)
            for annotation_file in annotations:
                if not annotation_file.startswith('instance_'): continue
                annotation = read_pickle(
                    join(instances_dir, dir, annotation_file))
                idx = int(annotation_file[len('instance_'):-len('.pkl')])
                if idx not in instances.keys():
                    instances[idx] = read_pickle(
                        join(instances_dir, annotation_file))
                    instances[idx]['annotations'] = {}
                instances[idx]['annotations'][
                    dir] = add_parents_to_annotations(
                        convert_annotations(annotation, old_to_new),
                        global_info['hierarchy'])
        df = pd.DataFrame(instances).transpose()
        df.annotations = df.annotations.apply(lambda x: json.dumps(x))
        df.to_csv(output_data_file, compression='gzip')
        write_pickle(global_info["hierarchy"], output_hierarchy_file)
    else:
        # TODO: set file generator for the subdirectory's instances
        raise NotImplementedError
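A hedged usage sketch, assuming instances_dir holds the annotation subdirectories described above (each with a global_info.pkl and instance_*.pkl files) and that the output directory does not exist yet; both paths are placeholders.

# Hypothetical call; paths are placeholders.
instances_to_data('annotation_tool/instances', 'data/annotated_dataset',
                  limit_to='annotations')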
Example #4
def __init__(self, models_to_load=[], device='cpu'):
    super(FullModelInterface, self).__init__()
    self.models = models_to_load
    self.dps = {
        k: DefaultProcessor(
            model_info[k][0],
            Hierarchy.from_dict(read_pickle(os.path.join(model_info[k][1], 'hierarchy.pkl')))
                if model_info[k][1] is not None else self.hierarchy,
            model_file=os.path.join(model_info[k][1], 'model_state.tpkl')
                if model_info[k][1] is not None else None,
            device=device,
            cluster=True)
        for k in self.models}
    self.trained_queries = {
        k: get_queries(os.path.join(model_info[k][1], 'used_targets.txt'))
           if model_info[k][1] is not None else list(self.hierarchy.descriptions.keys())
        for k in self.models}
Example #5
def get_file():
    if startup["file"] is None:  # Not currently used
        f = request.files['reports']
        filename = 'uploads/' + secure_filename(f.filename)
        f.save(filename)
    elif isinstance(startup["file"], str):
        filename = startup['file']
    else:
        raise Exception
    instance = read_pickle(filename)
    #import pdb; pdb.set_trace()
    targets = eval(instance['targets'])
    labels = eval(instance['labels'])
    positive_targets = [
        target for i, target in enumerate(targets) if labels[i]
    ]
    print(positive_targets)
    reports = pd.DataFrame(eval(instance['reports']))
    reports['date'] = pd.to_datetime(reports['date'])
    results1 = startup['interface'].tokenize(reports)
    results1['original_reports'] = [
        (i, report.report_type, str(report.date), report.text)
        for i, report in results1['original_reports'].iterrows()
    ]
    future_reports = pd.DataFrame(eval(instance['future_reports']))
    future_reports['date'] = pd.to_datetime(future_reports['date'])
    results2 = startup['interface'].tokenize(future_reports,
                                             num_sentences=None)
    results2['original_reports'] = [
        (i, report.report_type, str(report.date), report.text)
        for i, report in results2['original_reports'].iterrows()
    ]
    startup['tab_reports'] = [reports, future_reports]
    startup['tab_results'] = [results1, results2]
    return {
        "tab_results": startup['tab_results'],
        "positive_targets": positive_targets
    }
Example #6
def __init__(self):
    self.hierarchy = Hierarchy.from_graph(read_pickle(codes_file))
    self.batcher = Batcher(self.hierarchy)
    self.linearizations = {n: self.hierarchy.linearize(n) for n in self.hierarchy.descriptions.keys()}
Example #7
def load(filename):
    """
    Loads an IndicesIterator from a file using pickle.
    """
    return read_pickle(filename)
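read_pickle itself is imported from pytt.utils (see Example #9). Assuming it is a thin wrapper around the standard pickle module, an equivalent helper would look roughly like this sketch.

import pickle

def read_pickle(filename):
    # Assumed behavior: open the file in binary mode and unpickle its contents.
    with open(filename, 'rb') as f:
        return pickle.load(f)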
Example #8
def index():
    if startup['file'] is None:
        return render_template('done.html')
    print(startup['file'])
    tabs = [
        ('future-reports', 'Future Reports',
         'annotate the reports from the 12 month window after the first mr',
         'annotate', 1, None, True, True),
    ]
    if startup['include_past_reports']:
        tabs.append((
            'past-reports', 'Past Reports',
            'annotate the last 1000 sentences before the first mr in the past reports',
            'annotate', 0, None, True, False), )
    models = startup['interface'].get_models()
    random.shuffle(models)
    print(models)
    startup['curr_models'] = {}
    for i, k in enumerate(models):
        trained_queries = startup['interface'].get_trained_queries(k)
        with_custom = startup['interface'].with_custom(k)
        tabs.append(
            ('model-%i-summaries' % (i + 1), 'Model %i Summaries' % (i + 1),
             'validate the model summaries of the past reports', 'validate', 0,
             trained_queries, with_custom, False))
        startup['curr_models']['model-%i-summaries' % (i + 1)] = k
    progress = startup['file_generator'].progress()
    num_instances = len(startup['file_generator'])
    file_from_server = "false" if startup["file"] is None else "true"
    if exists(join(startup['annotations_dir'], 'global_info.pkl')):
        global_info = read_pickle(
            join(startup["annotations_dir"], 'global_info.pkl'))
        hierarchy = global_info['hierarchy']
        custom_tags = global_info['custom_tags']
    else:
        hierarchy = startup['interface'].get_hierarchy()
        custom_tags = []
    file = basename(startup["file"])
    print(file)
    annotations = read_pickle(join(startup["annotations_dir"], file))\
                  if exists(join(startup["annotations_dir"], file)) else {}
    annotations_prime = {}
    for tab in tabs:
        key = startup['curr_models'][
            tab[0]] if tab[0] in startup['curr_models'].keys() else tab[0]
        if key in annotations.keys():
            annotations_prime[tab[0]] = annotations[key]
    if isinstance(startup["file"], str):
        instance = read_pickle(startup['file'])
        reports = pd.DataFrame(eval(instance['reports']))
        patient_mrn = str(reports["patient_id"].iloc[0])
    else:
        patient_mrn = ""
    return render_template(
        'index.html',
        progress=progress,
        num_instances=num_instances,
        file_from_server=file_from_server,
        hierarchy=hierarchy,
        custom_tags=custom_tags,
        file=file,
        tabs=tabs,
        annotations=annotations_prime,
        patient_mrn=patient_mrn,
    )
Example #9
import os
from pytt.utils import read_pickle
from utils import get_queries

dataset = '/home/jered/Documents/data/mimic-iii-clinical-database-1.4/preprocessed/reports_and_codes_expanded'
# need to add support for ancestors
# code_graph_file =
# ancestors = True
rebalanced = True
counts_file = os.path.join(dataset, 'counts.pkl')
used_targets_file = os.path.join(dataset, 'used_targets.txt')

used_targets = get_queries(used_targets_file)
counts = read_pickle(counts_file)

micro_counts = [[], [], []]
macro_scores = [[], [], []]

for k, v in counts.items():
    if k not in used_targets: continue
    total = v[0] + v[1]
    true_positives = v[1] / 2 if rebalanced else v[1] * v[1] / total
    micro_counts[0] += [true_positives]
    positives = total / 2 if rebalanced else v[1]
    micro_counts[1] += [positives]
    relevants = v[1]
    micro_counts[2] += [relevants]
    if positives != 0:
        p = true_positives / positives
        macro_scores[0] += [p]
    if relevants != 0:
        # Assumed completion (the original is cut off here), mirroring the
        # precision branch above: recall = expected true positives / relevants.
        r = true_positives / relevants
        macro_scores[1] += [r]
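The snippet above is cut off. As an assumed continuation, the per-code counts would then be aggregated into micro- and macro-averaged precision and recall, roughly as follows.

# Assumed aggregation step; not part of the original snippet.
micro_precision = sum(micro_counts[0]) / sum(micro_counts[1])
micro_recall = sum(micro_counts[0]) / sum(micro_counts[2])
macro_precision = sum(macro_scores[0]) / len(macro_scores[0])
macro_recall = sum(macro_scores[1]) / len(macro_scores[1])
print(micro_precision, micro_recall, macro_precision, macro_recall)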
Example #10
def main(model_type,
         train_file,
         hierarchy,
         counts_file,
         val_file=None,
         save_checkpoint_folder=None,
         load_checkpoint_folder=None,
         device='cuda:0',
         batch_size=p.batch_size,
         epochs=p.epochs,
         limit_rows_train=p.limit_rows_train,
         limit_rows_val=p.limit_rows_val,
         subbatches=p.subbatches,
         num_workers=p.num_workers,
         checkpoint_every=p.checkpoint_every,
         copy_checkpoint_every=p.copy_checkpoint_every,
         val_every=p.val_every,
         email_every=None,
         email_sender=None,
         expensive_val_every=None,
         supervised_val_file=None,
         supervised_val_hierarchy=None,
         results_folder=None):
    if load_checkpoint_folder is None:
        seed_state()
    else:
        set_random_state(
            read_pickle(
                os.path.join(load_checkpoint_folder, 'random_state.pkl')))
    logger.set_verbosity(2)
    train_dataset = init_dataset(train_file, limit_rows=limit_rows_train)
    if val_file is not None:
        val_dataset = init_dataset(val_file, limit_rows=limit_rows_val)
    if load_checkpoint_folder is None:
        indices_iterator = init_indices_iterator(len(train_dataset),
                                                 batch_size,
                                                 random=True,
                                                 epochs=epochs)
        if val_file is not None:
            val_indices_iterator = init_indices_iterator(
                len(val_dataset),
                batch_size,
                random=True,
                iterations=len(indices_iterator))
        model_file, optimizer_file = None, None
    else:
        indices_iterator = read_pickle(
            os.path.join(load_checkpoint_folder, 'train_indices_iterator.pkl'))
        indices_iterator.set_stop(epochs=epochs)
        if val_file is not None:
            val_indices_iterator = read_pickle(
                os.path.join(load_checkpoint_folder,
                             'val_indices_iterator.pkl'))
            val_indices_iterator.set_stop(iterations=len(indices_iterator))
        model_file, optimizer_file = os.path.join(
            load_checkpoint_folder,
            'model_state.tpkl'), os.path.join(load_checkpoint_folder,
                                              'optimizer_state.tpkl')
        if not os.path.exists(optimizer_file):
            optimizer_file = None
    batcher, model, postprocessor, optimizer = load_model_components(
        model_type,
        hierarchy,
        device=device,
        model_file=model_file,
        optimizer_file=optimizer_file,
        counts_file=counts_file)
    batch_iterator = batcher.batch_iterator(train_dataset,
                                            indices_iterator,
                                            subbatches=subbatches,
                                            num_workers=num_workers)
    if val_file is not None:
        val_iterator = batcher.batch_iterator(val_dataset,
                                              val_indices_iterator,
                                              subbatches=subbatches)
    else:
        val_iterator = None
    if torch.distributed.is_initialized():
        model = LDDP(model, torch.distributed.get_world_size())
    expensive_val_func = SupervisedTestingFunc(supervised_val_file, model, model_type, supervised_val_hierarchy, device, batch_size, subbatches, num_workers, results_folder, email_sender)\
                         if expensive_val_every is not None else None
    tracker = Tracker(checkpoint_folder=save_checkpoint_folder,
                      checkpoint_every=checkpoint_every,
                      copy_checkpoint_every=copy_checkpoint_every,
                      email_every=email_every,
                      email_sender=email_sender,
                      expensive_val_every=expensive_val_every,
                      expensive_val_func=expensive_val_func)
    #    if load_checkpoint_folder is not None:
    #        tracker.needs_graph = False
    tracker.needs_graph = False
    trainer = Trainer(model,
                      postprocessor,
                      optimizer,
                      batch_iterator,
                      val_iterator=val_iterator,
                      val_every=val_every,
                      tracker=tracker)
    with torch.autograd.set_detect_anomaly(False):
        trainer.train()
Example #11
                                   receiver_email=p.receiver_email,
                                   subject="%s: training %s model" %
                                   (socket.gethostname(), args.model_type))
        email_sender.send_email("Starting to train %s model." %
                                args.model_type)
        email_every = p.email_every
    else:
        email_sender = None
        email_every = None

    train_file = os.path.join(args.data_dir, 'train.data')
    val_file = os.path.join(args.data_dir, 'val.data')
    counts_file = os.path.join(args.data_dir, 'counts.pkl')
    used_targets_file = os.path.join(args.data_dir, 'used_targets.txt')

    hierarchy = Hierarchy.from_graph(read_pickle(args.code_graph_file))

    if args.save_checkpoint_folder is not None:
        write_pickle(
            hierarchy.to_dict(),
            os.path.join(args.save_checkpoint_folder, 'hierarchy.pkl'))
        if os.path.exists(counts_file):
            copyfile(counts_file,
                     os.path.join(args.save_checkpoint_folder, 'counts.pkl'))
        if os.path.exists(used_targets_file):
            copyfile(
                used_targets_file,
                os.path.join(args.save_checkpoint_folder, 'used_targets.txt'))

    if args.expensive_val_every is not None:
        supervised_val_file = os.path.join(args.supervised_data_dir,
Example #12
    parser.add_argument("--supervised_data_dir", default=None)
    parser.add_argument("--results_folder", default=None)
    args = parser.parse_args()

    val_file = os.path.join(p.data_dir, 'val.data') if not args.supervised else os.path.join(args.supervised_data_dir, 'supervised.data')

    if args.email:
        email_sender = EmailSender(smtp_server=p.smtp_server, port=p.port,
                                   sender_email=p.sender_email,
                                   sender_password=args.sender_password,
                                   receiver_email=p.receiver_email,
                                   subject="%s: testing %s model" %
                                   (socket.gethostname(), args.model_type))
        email_sender.send_email("Starting to test %s model." % args.model_type)
    else:
        email_sender = None

    if args.supervised:
        hierarchy_file = os.path.join(args.supervised_data_dir, 'hierarchy.pkl')
        if not os.path.exists(hierarchy_file):
            hierarchy_file = os.path.join(args.checkpoint_folder, 'hierarchy.pkl')
    else:
        hierarchy_file = os.path.join(args.checkpoint_folder, 'hierarchy.pkl')

    hierarchy = Hierarchy.from_dict(read_pickle(hierarchy_file))

    try:
        main(args.model_type, val_file, args.checkpoint_folder, hierarchy, supervised=args.supervised, device=args.device, email_sender=email_sender, results_folder=args.results_folder, noload=args.noload)
#        nprocs = 2
#        main_distributed = distributed_wrapper(main, nprocs)
#        main_distributed(args.model_type, val_file, args.checkpoint_folder, device=args.device)
    except Exception as e:
        if email_sender is not None:
            email_sender("Got an exception:\n%s" % e)
        raise e