def load_model_components(model_name, hierarchy, run_type='training', device='cpu', model_file=None, optimizer_file=None, counts_file=None, cluster=False, sentences_per_checkpoint=p.sentences_per_checkpoint):
    """Instantiate the batcher, model, and postprocessor registered for *model_name*.

    Looks the components up in the module-level ``model_components`` registry.
    When ``run_type == 'training'`` an optimizer is also built and the return
    value is ``(batcher, model, postprocessor, optimizer)``; for any other run
    type the model is put in eval mode and ``(batcher, model, postprocessor)``
    is returned.  State dicts are loaded from ``model_file`` /
    ``optimizer_file`` when those paths are given.
    """
    components = model_components[model_name]
    counts = read_pickle(counts_file) if counts_file is not None else None
    batcher = components['batcher_class'](hierarchy, counts, run_type)
    model = components['model_class'](device, batcher, cluster, sentences_per_checkpoint)
    if model_file is not None:
        # Weights are always materialized on the CPU first; correct_devices()
        # then moves parameters to where the model expects them.
        model.load_state_dict(torch.load(model_file, map_location='cpu'))
    model.correct_devices()
    postprocessor = components['postprocessor'](batcher, run_type)
    if run_type != 'training':
        model.eval()
        return batcher, model, postprocessor
    model.train()
    optimizer = components['optimizer_class'](list(model.parameters()))
    if optimizer_file is not None:
        optimizer.load_state_dict(torch.load(optimizer_file))
    return batcher, model, postprocessor, optimizer
def main(model_type, val_file, checkpoint_folder, hierarchy, supervised=False,
         device='cuda:0', batch_size=p.batch_size, limit_rows_val=p.limit_rows_val,
         subbatches=p.subbatches, num_workers=p.num_workers, email_sender=None,
         results_folder=None, noload=False):
    """Evaluate a model on *val_file* and write the scores to *results_folder*.

    Restores model weights and RNG state from *checkpoint_folder* unless
    ``noload`` is set, runs a full pass with ``Tester``, writes ``scores.txt``,
    and optionally emails the results through *email_sender*.
    """
    # Restore the RNG state saved alongside the checkpoint for reproducible
    # evaluation; fall back to a fresh seed when not loading a checkpoint.
    if checkpoint_folder is None or noload:
        seed_state()
    else:
        set_random_state(read_pickle(os.path.join(checkpoint_folder, 'random_state.pkl')))
    logger.set_verbosity(2)
    val_dataset = init_dataset(val_file, limit_rows=limit_rows_val)
    val_indices_iterator = init_indices_iterator(len(val_dataset), batch_size)
    # NOTE(review): if checkpoint_folder is None while noload is False this
    # os.path.join raises — callers appear to pass noload=True in that case; confirm.
    model_file = os.path.join(checkpoint_folder, 'model_state.tpkl') if not noload else None
    # run_type='testing' -> load_model_components returns only 3 values (no optimizer).
    batcher, model, postprocessor = load_model_components(
        model_type, hierarchy, run_type='testing', device=device,
        model_file=model_file, cluster=supervised)
    val_iterator = batcher.batch_iterator(val_dataset, val_indices_iterator,
                                          subbatches=subbatches, num_workers=num_workers)
    # Wrap in the local distributed data-parallel wrapper only when a
    # torch.distributed process group is already up.
    if torch.distributed.is_initialized():
        model = LDDP(model, torch.distributed.get_world_size())
    tester = Tester(model, postprocessor, val_iterator)
    # tester = Tester(model, postprocessor, val_iterator, tensorboard_dir=os.path.join(load_checkpoint_folder, 'tensorboard/test'))
    if results_folder is None:
        # Default to a 'results' subfolder of the checkpoint; created fresh here.
        results_folder = os.path.join(checkpoint_folder, 'results')
        os.mkdir(results_folder)
    postprocessor.add_output_dir(results_folder)
    total_output_batch = tester.test()
    with open(os.path.join(results_folder, 'scores.txt'), 'w') as f:
        f.write(str(total_output_batch))
    #total_output_batch.write_results()
    #write_pickle(postprocessor.summary_stats, os.path.join(checkpoint_folder, 'summary_stats.pkl'))
    if email_sender is not None:
        def onerror(e):
            # If the failure was caused by the attachment, retry the email
            # without it; otherwise defer to the default error handler.
            if check_attachment_error(e):
                logger.log("Trying to send without attachment")
                email_sender.send_email(str(total_output_batch))
            else:
                default_onerror(e)
        attachments = postprocessor.get_summary_attachment_generator()
        email_sender.send_email("Testing is done!\n\n"+str(total_output_batch),
                                attachments=attachments,
                                onerror=onerror)
def instances_to_data(instances_dir, output_data_dir, limit_to='annotations'):
    """Merge per-instance annotation pickles into one gzipped data file.

    Walks the ``*_annotations`` subdirectories of *instances_dir*, merges their
    ``global_info.pkl`` files, attaches the (re-mapped, parent-expanded)
    annotations to each instance, and writes ``annotations.data`` (gzipped CSV)
    plus the merged ``hierarchy.pkl`` into *output_data_dir*.

    Raises NotImplementedError for ``limit_to`` values other than 'annotations'.
    """
    import os
    # Previously shelled out to `mkdir` via subprocess with the error ignored;
    # makedirs also creates missing parents and tolerates an existing dir.
    os.makedirs(output_data_dir, exist_ok=True)
    output_data_file = join(output_data_dir, 'annotations.data')
    output_hierarchy_file = join(output_data_dir, 'hierarchy.pkl')
    if limit_to == 'all':
        raise NotImplementedError
    elif limit_to == 'annotations':
        subdirectories = next(iter(walk(instances_dir)))[1]
        instances = {}
        global_info = None
        old_to_new = {}
        # add annotations to instances
        for subdir in subdirectories:  # renamed from `dir` (shadowed builtin)
            global_info_file = join(instances_dir, subdir, 'global_info.pkl')
            if not exists(global_info_file):
                continue
            # Accumulate a combined global_info; old_to_new maps this
            # subdirectory's node ids into the merged hierarchy.
            global_info, old_to_new = merge(global_info, read_pickle(global_info_file))
            if not subdir.endswith('_annotations'):
                continue
            annotation_files = set(next(iter(walk(join(instances_dir, subdir))))[2])
            for annotation_file in annotation_files:
                if not annotation_file.startswith('instance_'):
                    continue
                annotation = read_pickle(join(instances_dir, subdir, annotation_file))
                idx = int(annotation_file[len('instance_'):-len('.pkl')])
                if idx not in instances:
                    # NOTE(review): the base instance is read from the top-level
                    # instances_dir (not the subdirectory) — confirm that is
                    # where instance_<idx>.pkl files live.
                    instances[idx] = read_pickle(join(instances_dir, annotation_file))
                    instances[idx]['annotations'] = {}
                instances[idx]['annotations'][subdir] = add_parents_to_annotations(
                    convert_annotations(annotation, old_to_new),
                    global_info['hierarchy'])
        df = pd.DataFrame(instances).transpose()
        df.annotations = df.annotations.apply(json.dumps)
        df.to_csv(output_data_file, compression='gzip')
        write_pickle(global_info["hierarchy"], output_hierarchy_file)
    else:
        # TODO: set file generator for the subdirectory's instances
        raise NotImplementedError
def __init__(self, models_to_load=None, device='cpu'):
    """Load the requested models into DefaultProcessor instances.

    models_to_load: names of models (keys into the module-level ``model_info``
        registry) to instantiate; ``None`` (the default) loads no models.
        Was previously a mutable default ``[]`` — replaced with a ``None``
        sentinel; behavior for all existing callers is unchanged.
    device: device string passed through to each DefaultProcessor.
    """
    super(FullModelInterface, self).__init__()
    self.models = models_to_load if models_to_load is not None else []
    # For each model: use its own saved hierarchy/weights when a model dir is
    # registered (model_info[k][1]); otherwise fall back to the interface's
    # shared hierarchy and an untrained model.
    self.dps = {
        k: DefaultProcessor(
            model_info[k][0],
            Hierarchy.from_dict(read_pickle(os.path.join(model_info[k][1], 'hierarchy.pkl')))
                if model_info[k][1] is not None else self.hierarchy,
            model_file=os.path.join(model_info[k][1], 'model_state.tpkl')
                if model_info[k][1] is not None else None,
            device=device,
            cluster=True)
        for k in self.models}
    # Queries the model was trained on; fall back to every described node.
    self.trained_queries = {
        k: get_queries(os.path.join(model_info[k][1], 'used_targets.txt'))
            if model_info[k][1] is not None
            else list(self.hierarchy.descriptions.keys())
        for k in self.models}
def get_file():
    """Load the current patient instance, tokenize its reports, and cache results.

    Reads the instance either from an uploaded file (when no server-side file is
    configured) or from the path in ``startup['file']``, tokenizes both the
    report window and the future-report window, stashes both in the ``startup``
    global, and returns the results plus the positively-labeled targets.
    """
    if startup["file"] is None:
        # Not currently used
        f = request.files['reports']
        filename = 'uploads/' + secure_filename(f.filename)
        f.save(filename)
    elif isinstance(startup["file"], str):
        filename = startup['file']
    else:
        raise Exception
    instance = read_pickle(filename)
    #import pdb; pdb.set_trace()
    # SECURITY NOTE(review): eval() is used to parse stringified fields of the
    # pickled instance — safe only if these files are fully trusted; consider
    # ast.literal_eval. Same applies to the eval calls below.
    targets = eval(instance['targets'])
    labels = eval(instance['labels'])
    # Targets whose label is truthy at the same index.
    positive_targets = [
        target for i, target in enumerate(targets) if labels[i]]
    print(positive_targets)
    reports = pd.DataFrame(eval(instance['reports']))
    reports['date'] = pd.to_datetime(reports['date'])
    results1 = startup['interface'].tokenize(reports)
    # Re-shape each original report row into a (index, type, date, text) tuple
    # for the front end.
    results1['original_reports'] = [
        (i, report.report_type, str(report.date), report.text)
        for i, report in results1['original_reports'].iterrows()]
    future_reports = pd.DataFrame(eval(instance['future_reports']))
    future_reports['date'] = pd.to_datetime(future_reports['date'])
    # num_sentences=None: tokenize the full future window, no truncation.
    results2 = startup['interface'].tokenize(future_reports, num_sentences=None)
    results2['original_reports'] = [
        (i, report.report_type, str(report.date), report.text)
        for i, report in results2['original_reports'].iterrows()]
    # Cache per-tab inputs/outputs in the process-wide startup dict.
    startup['tab_reports'] = [reports, future_reports]
    startup['tab_results'] = [results1, results2]
    return {
        "tab_results": startup['tab_results'],
        "positive_targets": positive_targets}
def __init__(self):
    """Load the code hierarchy from disk, build its batcher, and precompute a
    linearization for every node that has a description."""
    hierarchy = Hierarchy.from_graph(read_pickle(codes_file))
    self.hierarchy = hierarchy
    self.batcher = Batcher(hierarchy)
    self.linearizations = {
        node: hierarchy.linearize(node)
        for node in hierarchy.descriptions.keys()}
def load(filename):
    """Deserialize and return an IndicesIterator previously pickled to *filename*."""
    iterator = read_pickle(filename)
    return iterator
def index():
    """Render the main annotation page for the current file.

    Builds the tab list (future reports, optionally past reports, plus one
    validation tab per model in randomized order), loads any saved annotations
    for the current file, and renders ``index.html``. Returns the ``done.html``
    page when there are no files left.
    """
    if startup['file'] is None:
        return render_template('done.html')
    print(startup['file'])
    # Tab tuple layout appears to be:
    # (id, title, instructions, mode, report-set index, queries, with_custom, flag)
    # — TODO confirm against the template.
    tabs = [
        ('future-reports', 'Future Reports',
         'annotate the reports from the 12 month window after the first mr',
         'annotate', 1, None, True, True), ]
    if startup['include_past_reports']:
        tabs.append((
            'past-reports', 'Past Reports',
            'annotate the last 1000 sentences before the first mr in the past reports',
            'annotate', 0, None, True, False), )
    # Randomize model order so annotators can't identify models by position.
    models = startup['interface'].get_models()
    random.shuffle(models)
    print(models)
    startup['curr_models'] = {}
    for i, k in enumerate(models):
        trained_queries = startup['interface'].get_trained_queries(k)
        with_custom = startup['interface'].with_custom(k)
        tabs.append(
            ('model-%i-summaries' % (i + 1), 'Model %i Summaries' % (i + 1),
             'validate the model summaries of the past reports',
             'validate', 0, trained_queries, with_custom, False))
        # Remember which real model hides behind each anonymized tab id.
        startup['curr_models']['model-%i-summaries' % (i + 1)] = k
    progress = startup['file_generator'].progress()
    num_instances = len(startup['file_generator'])
    file_from_server = "false" if startup["file"] is None else "true"
    # Prefer the merged hierarchy/custom tags saved with previous annotations;
    # otherwise start from the interface's hierarchy with no custom tags.
    if exists(join(startup['annotations_dir'], 'global_info.pkl')):
        global_info = read_pickle(
            join(startup["annotations_dir"], 'global_info.pkl'))
        hierarchy = global_info['hierarchy']
        custom_tags = global_info['custom_tags']
    else:
        hierarchy = startup['interface'].get_hierarchy()
        custom_tags = []
    file = basename(startup["file"])  # NOTE: shadows the builtin `file` name (py2 relic)
    print(file)
    tabs = tabs  # no-op; left as-is
    annotations = read_pickle(join(startup["annotations_dir"], file))\
        if exists(join(startup["annotations_dir"], file)) else {}
    # Re-key saved annotations from real model names back to anonymized tab ids.
    annotations_prime = {}
    for tab in tabs:
        key = startup['curr_models'][
            tab[0]] if tab[0] in startup['curr_models'].keys() else tab[0]
        if key in annotations.keys():
            annotations_prime[tab[0]] = annotations[key]
    if isinstance(startup["file"], str):
        instance = read_pickle(startup['file'])
        reports = pd.DataFrame(eval(instance['reports']))
        patient_mrn = str(reports["patient_id"].iloc[0])
    else:
        patient_mrn = ""
    return render_template(
        'index.html', progress=progress, num_instances=num_instances,
        file_from_server=file_from_server, hierarchy=hierarchy,
        custom_tags=custom_tags, file=file, tabs=tabs,
        annotations=annotations_prime, patient_mrn=patient_mrn, )
import os from pytt.utils import read_pickle from utils import get_queries dataset = '/home/jered/Documents/data/mimic-iii-clinical-database-1.4/preprocessed/reports_and_codes_expanded' # need to add support for ancestors # code_graph_file = # ancestors = True rebalanced = True counts_file = os.path.join(dataset, 'counts.pkl') used_targets_file = os.path.join(dataset, 'used_targets.txt') used_targets = get_queries(used_targets_file) counts = read_pickle(counts_file) micro_counts = [[], [], []] macro_scores = [[], [], []] for k, v in counts.items(): if k not in used_targets: continue total = v[0] + v[1] true_positives = v[1] / 2 if rebalanced else v[1] * v[1] / total micro_counts[0] += [true_positives] positives = total / 2 if rebalanced else v[1] micro_counts[1] += [positives] relevants = v[1] micro_counts[2] += [relevants] if positives != 0: p = true_positives / positives macro_scores[0] += [p] if relevants != 0:
def main(model_type, train_file, hierarchy, counts_file, val_file=None,
         save_checkpoint_folder=None, load_checkpoint_folder=None, device='cuda:0',
         batch_size=p.batch_size, epochs=p.epochs, limit_rows_train=p.limit_rows_train,
         limit_rows_val=p.limit_rows_val, subbatches=p.subbatches,
         num_workers=p.num_workers, checkpoint_every=p.checkpoint_every,
         copy_checkpoint_every=p.copy_checkpoint_every, val_every=p.val_every,
         email_every=None, email_sender=None, expensive_val_every=None,
         supervised_val_file=None, supervised_val_hierarchy=None, results_folder=None):
    """Train a model, optionally resuming from a checkpoint folder.

    Restores RNG state, index iterators, model and optimizer state from
    *load_checkpoint_folder* when given (otherwise seeds fresh), builds the
    train/val batch iterators, and runs the Trainer under a Tracker that
    handles checkpointing, optional emails, and an optional expensive
    supervised validation function.
    """
    # Reproducibility: resume the RNG exactly where the checkpoint left off,
    # or seed a fresh run.
    if load_checkpoint_folder is None:
        seed_state()
    else:
        set_random_state(
            read_pickle(
                os.path.join(load_checkpoint_folder, 'random_state.pkl')))
    logger.set_verbosity(2)
    train_dataset = init_dataset(train_file, limit_rows=limit_rows_train)
    if val_file is not None:
        val_dataset = init_dataset(val_file, limit_rows=limit_rows_val)
    if load_checkpoint_folder is None:
        # Fresh run: new shuffled iterators; val iterator is sized to match
        # the number of training iterations.
        indices_iterator = init_indices_iterator(len(train_dataset), batch_size,
                                                 random=True, epochs=epochs)
        if val_file is not None:
            val_indices_iterator = init_indices_iterator(
                len(val_dataset), batch_size, random=True,
                iterations=len(indices_iterator))
        model_file, optimizer_file = None, None
    else:
        # Resume: reload the saved iterators and extend their stopping points.
        indices_iterator = read_pickle(
            os.path.join(load_checkpoint_folder, 'train_indices_iterator.pkl'))
        indices_iterator.set_stop(epochs=epochs)
        if val_file is not None:
            val_indices_iterator = read_pickle(
                os.path.join(load_checkpoint_folder, 'val_indices_iterator.pkl'))
            val_indices_iterator.set_stop(iterations=len(indices_iterator))
        model_file, optimizer_file = os.path.join(
            load_checkpoint_folder, 'model_state.tpkl'), os.path.join(load_checkpoint_folder,
            'optimizer_state.tpkl')
        # Older checkpoints may lack optimizer state; start the optimizer fresh.
        if not os.path.exists(optimizer_file):
            optimizer_file = None
    # Default run_type='training' -> also returns the optimizer.
    batcher, model, postprocessor, optimizer = load_model_components(
        model_type, hierarchy, device=device, model_file=model_file,
        optimizer_file=optimizer_file, counts_file=counts_file)
    batch_iterator = batcher.batch_iterator(train_dataset,
                                            indices_iterator, subbatches=subbatches,
                                            num_workers=num_workers)
    if val_file is not None:
        val_iterator = batcher.batch_iterator(val_dataset, val_indices_iterator,
                                              subbatches=subbatches)
    else:
        val_iterator = None
    # Wrap for distributed training only when a process group is initialized.
    if torch.distributed.is_initialized():
        model = LDDP(model, torch.distributed.get_world_size())
    # Optional periodic supervised evaluation (expensive), run every
    # expensive_val_every steps by the tracker.
    expensive_val_func = SupervisedTestingFunc(supervised_val_file, model, model_type,
        supervised_val_hierarchy, device, batch_size, subbatches, num_workers,
        results_folder, email_sender)\
        if expensive_val_every is not None else None
    tracker = Tracker(checkpoint_folder=save_checkpoint_folder,
                      checkpoint_every=checkpoint_every,
                      copy_checkpoint_every=copy_checkpoint_every,
                      email_every=email_every, email_sender=email_sender,
                      expensive_val_every=expensive_val_every,
                      expensive_val_func=expensive_val_func)
    # if load_checkpoint_folder is not None:
    #     tracker.needs_graph = False
    tracker.needs_graph = False
    trainer = Trainer(model, postprocessor, optimizer, batch_iterator,
                      val_iterator=val_iterator, val_every=val_every, tracker=tracker)
    # Anomaly detection disabled explicitly (it is expensive); flip to True
    # only when debugging NaN gradients.
    with torch.autograd.set_detect_anomaly(False):
        trainer.train()
receiver_email=p.receiver_email, subject="%s: training %s model" % (socket.gethostname(), args.model_type)) email_sender.send_email("Starting to train %s model." % args.model_type) email_every = p.email_every else: email_sender = None email_every = None train_file = os.path.join(args.data_dir, 'train.data') val_file = os.path.join(args.data_dir, 'val.data') counts_file = os.path.join(args.data_dir, 'counts.pkl') used_targets_file = os.path.join(args.data_dir, 'used_targets.txt') hierarchy = Hierarchy.from_graph(read_pickle(args.code_graph_file)) if args.save_checkpoint_folder is not None: write_pickle( hierarchy.to_dict(), os.path.join(args.save_checkpoint_folder, 'hierarchy.pkl')) if os.path.exists(counts_file): copyfile(counts_file, os.path.join(args.save_checkpoint_folder, 'counts.pkl')) if os.path.exists(used_targets_file): copyfile( used_targets_file, os.path.join(args.save_checkpoint_folder, 'used_targets.txt')) if args.expensive_val_every is not None: supervised_val_file = os.path.join(args.supervised_data_dir,
# Tail of the testing entry-point script: finish argument parsing, set up the
# optional email notifier, resolve the hierarchy file, and run main().
parser.add_argument("--supervised_data_dir", default=None)
parser.add_argument("--results_folder", default=None)
args = parser.parse_args()
# Supervised mode evaluates on the supervised dataset instead of the default val split.
val_file = os.path.join(p.data_dir, 'val.data') if not args.supervised else os.path.join(args.supervised_data_dir, 'supervised.data')
if args.email:
    email_sender = EmailSender(smtp_server=p.smtp_server, port=p.port,
                               sender_email=p.sender_email,
                               sender_password=args.sender_password,
                               receiver_email=p.receiver_email,
                               subject="%s: testing %s model" % (socket.gethostname(), args.model_type))
    email_sender.send_email("Starting to test %s model." % args.model_type)
else:
    email_sender = None
# Prefer the supervised dataset's hierarchy when present; otherwise fall back
# to the hierarchy saved with the checkpoint.
if args.supervised:
    hierarchy_file = os.path.join(args.supervised_data_dir, 'hierarchy.pkl')
    if not os.path.exists(hierarchy_file):
        hierarchy_file = os.path.join(args.checkpoint_folder, 'hierarchy.pkl')
else:
    hierarchy_file = os.path.join(args.checkpoint_folder, 'hierarchy.pkl')
hierarchy = Hierarchy.from_dict(read_pickle(hierarchy_file))
try:
    main(args.model_type, val_file, args.checkpoint_folder, hierarchy,
         supervised=args.supervised, device=args.device,
         email_sender=email_sender, results_folder=args.results_folder,
         noload=args.noload)
    # nprocs = 2
    # main_distributed = distributed_wrapper(main, nprocs)
    # main_distributed(args.model_type, val_file, args.checkpoint_folder, device=args.device)
except Exception as e:
    if email_sender is not None:
        # BUG FIX: the EmailSender instance was called directly
        # (email_sender(...)); every other send site uses .send_email().
        email_sender.send_email("Got an exception:\n%s" % e)
    # Bare raise preserves the original traceback.
    raise