def main():
    # read the output directory path from outdir.txt
    with open("outdir.txt", 'r') as f:
        outdir = f.read().rstrip('\n')

    # create an experiment folder tagged with date and time to save the model output
    experiment_folder = os.path.expanduser('~/data1/stratification_ILRM/experiments/') + disease_folder +\
                    '-'.join(map(str, list(datetime.now().timetuple()[:6])))
    os.makedirs(experiment_folder)

    # the path to the experiment folder is saved in a txt file
    with open("experiment_folder.txt", 'w') as f:
        f.write(experiment_folder)
    
    ## pass the size of the vocabulary to the model
    with open(os.path.join(outdir, mt_to_ix_file)) as f:
        rd = csv.reader(f)
        next(rd)
        vocab_size = 1
        for r in rd:
            vocab_size += 1

    #set random seed for reproducible experiments
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    ##Import data
    data = myData(outdir, ehr_file)
    data_generator = DataLoader(data, model_pars['batch_size'], shuffle=True, collate_fn=my_collate)
    #define model and optimizer
    print("cohort numerosity:{0} -- max_seq_length:{1}".format(len(data), L))
    model = net.ehrEncoding(vocab_size, L, model_pars['embedding_dim'], model_pars['kernel_size'])
    optimizer = torch.optim.Adam(model.parameters(), lr=model_pars['learning_rate'], weight_decay=1e-5)

    #start the unsupervised training and evaluation
    model.cuda()
    loss_fn = net.criterion
    print("Starting training for {} epochs...".format(model_pars['num_epochs']))
    mrn, encoded, metrics_avg = train_and_evaluate(model, data_generator, loss_fn, optimizer, metrics, experiment_folder)
    
    ## save encoded vectors, the medical record number list (to keep track of the order), and metrics (loss and accuracy)
    with open(experiment_folder + '/encoded_vect.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for e in encoded:
            wr.writerow(e)

    with open(experiment_folder + '/mrns.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for m in mrn:
            wr.writerow([m])

    with open(experiment_folder + '/metrics.txt', 'w') as f:
        wr = csv.writer(f, delimiter='\t')
        #for m, v in metrics_average.items():
        #    wr.writerow([m, v])
        wr.writerow(["Mean loss:", metrics_avg['loss']])
        wr.writerow(["Accuracy:", metrics_avg['accuracy']])
def main():

    ##pass the size of the vocabulary to the model
    with open(os.path.join(data_folder, mt_to_ix_file)) as f:
        rd = csv.reader(f)
        vocab_size = 0
        for r in rd:
            vocab_size += 1

    #set random seed for reproducible experiments
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    ##Import data
    data = myData(data_folder, ehr_file)
    data_generator = DataLoader(data,
                                model_pars['batch_size'],
                                shuffle=True,
                                collate_fn=my_collate)
    #define model and optimizer
    print("cohort numerosity:{0} -- max_seq_length:{1}".format(len(data), L))
    model = net.ehrEncoding(vocab_size, L, model_pars['embedding_dim'],
                            model_pars['kernel_size'])
    #model = nn.DataParallel(model, device_ids=[1,2,3])
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_pars['learning_rate'],
                                 weight_decay=1e-5)

    #start the unsupervised training and evaluation
    model.cuda()
    loss_fn = net.criterion
    print("Starting training for {} epochs...".format(
        model_pars['num_epochs']))
    mrn, encoded, metrics_avg = train_and_evaluate(model, data_generator,
                                                   loss_fn, optimizer,
                                                   experiment_folder, metrics)

    #with open(experiment_folder + '/TRencoded_vect.csv', 'w') as f:
    #    wr = csv.writer(f, delimiter=',')
    #    for e in encoded_tr:
    #        wr.writerow(e)

    #with open(experiment_folder + '/TRmrns.csv', 'w') as f:
    #    wr = csv.writer(f, delimiter=',')
    #    for m in mrn_tr:
    #        wr.writerow([m])

    #with open(experiment_folder + '/TRmetrics.txt', 'w') as f:
    #    wr = csv.writer(f, delimiter = '\t')
    #for m, v in metrics_average.items():
    #    wr.writerow([m, v])
    #    wr.writerow(["Mean loss:", loss_tr])

    ##load and evaluate best model
    #print("Evaluating best model...")
    #best_saved = torch.load(experiment_folder + '/best_model.pt')
    #model.load_state_dict(best_saved['state_dict'])
    #mrn, encoded, metrics_avg = evaluate(model, loss_fn, data_generator, metrics, best_eval=True)

    with open(experiment_folder + '/encoded_vect.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for e in encoded:
            wr.writerow(e)

    with open(experiment_folder + '/mrns.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        for m in mrn:
            wr.writerow([m])

    with open(experiment_folder + '/metrics.txt', 'w') as f:
        wr = csv.writer(f, delimiter='\t')
        #for m, v in metrics_average.items():
        #    wr.writerow([m, v])
        wr.writerow(["Mean loss:", metrics_avg['loss']])
        wr.writerow(["Accuracy:", metrics_avg['accuracy']])
def learn_patient_representations(indir,
                                  test_set=False,
                                  sampling=None,
                                  emb_filename=None):
    # experiment folder with date and time to save the representations
    exp_dir = os.path.join(indir, 'encodings')
    os.makedirs(exp_dir, exist_ok=True)

    # get the vocabulary size
    fvocab = os.path.join(indir, ut.dt_files['vocab'])
    with open(fvocab) as f:
        rd = csv.reader(f)
        next(rd)
        vocab = {}
        for r in rd:
            vocab[int(r[1])] = r[0]
        vocab_size = len(vocab) + 1
    print('Vocabulary size: {0}'.format(vocab_size))

    # load pre-computed embeddings
    if emb_filename is not None:
        model = Word2Vec.load(emb_filename)
        embs = model.wv
        del model
        print('Loaded pre-computed embeddings for {0} concepts'.format(
            len(embs.vocab)))
    else:
        embs = None

    # set random seed for experiment reproducibility
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # load data
    data_tr = EHRdata(os.path.join(indir), ut.dt_files['ehr-file'], sampling)
    data_generator_tr = DataLoader(data_tr,
                                   ut.model_param['batch_size'],
                                   shuffle=True,
                                   collate_fn=ehr_collate)
    if test_set:
        data_ts = EHRdata(os.path.join(indir), ut.dt_files['ehr-file-test'],
                          sampling)

        data_generator_ts = DataLoader(data_ts,
                                       ut.model_param['batch_size'],
                                       shuffle=True,
                                       collate_fn=ehr_collate)
        print("Test cohort size: {0}".format(len(data_ts)))
    else:
        data_generator_ts = data_generator_tr

    print('Training cohort size: {0}\n'.format(len(data_tr)))
    print('Max Sequence Length: {0}\n'.format(ut.len_padded))
    # define model and optimizer
    print('Learning rate: {0}'.format(ut.model_param['learning_rate']))
    print('Batch size: {0}'.format(ut.model_param['batch_size']))
    print('Kernel size: {0}\n'.format(ut.model_param['kernel_size']))

    model = net.ehrEncoding(vocab_size=vocab_size,
                            max_seq_len=ut.len_padded,
                            emb_size=ut.model_param['embedding_size'],
                            kernel_size=ut.model_param['kernel_size'],
                            pre_embs=embs,
                            vocab=vocab)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=ut.model_param['learning_rate'],
                                 weight_decay=ut.model_param['weight_decay'])

    # training and evaluation
    if torch.cuda.device_count() > 1:
        print('No. of GPUs: {0}\n'.format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    else:
        model.cuda()
        print('No. of GPUs: 1\n')

    # model.cuda()
    loss_fn = net.criterion
    print('Training for {} epochs\n'.format(ut.model_param['num_epochs']))

    mrn, encoded, encoded_avg, metrics_avg = train_and_evaluate(
        model, data_generator_tr, data_generator_ts, loss_fn, optimizer,
        net.metrics, exp_dir)

    # save results

    # encoded vectors (representations)
    outfile = os.path.join(exp_dir, 'convae-avg_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-AVG"])
        for m, e in zip(mrn, encoded_avg):
            wr.writerow([m] + list(e))

    outfile = os.path.join(exp_dir, 'convae_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-SUBSEQ"])
        for m, evs in zip(mrn, encoded):
            for e in evs:
                wr.writerow([m] + e)

    # metrics (loss and accuracy)
    outfile = os.path.join(exp_dir, 'metrics.txt')
    with open(outfile, 'w') as f:
        f.write('Mean loss: %.3f\n' % metrics_avg['loss'])
        f.write('Accuracy: %.3f\n' % metrics_avg['accuracy'])

    # ehr subseq with age in days
    outfile = os.path.join(exp_dir,
                           'cohort-ehr-subseq{0}.csv'.format(ut.len_padded))
    with open(os.path.join(indir, 'cohort-ehrseq.csv')) as f:
        rd = csv.reader(f)
        next(rd)
        ehr = {}
        for r in rd:
            ehr.setdefault(r[0], list()).extend(r[1:])
    ehr_subseq = {}
    for list_m, batch in data_generator_tr:
        for b, m in zip(batch, list_m):
            if len(b) == 1:
                ehr_subseq[m] = b.tolist()
            else:
                seq = []
                for vec in b.tolist():
                    seq.extend(vec)
                nseq, nleft = divmod(len(seq), ut.len_padded)
                if nleft > 0:
                    seq = seq + [0] * (ut.len_padded - nleft)
                for i in range(0, len(seq) - ut.len_padded + 1, ut.len_padded):
                    ehr_subseq.setdefault(m, list()).append(
                        seq[i:i + ut.len_padded])
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "EHRsubseq"])
        for m, subseq in ehr_subseq.items():
            for seq in subseq:
                wr.writerow([m] + list(filter(lambda x: x != 0, seq)))

    if test_set:
        outfile = os.path.join(
            exp_dir, 'cohort_test-ehr-subseq{0}.csv'.format(ut.len_padded))
        ehr_subseq = {}
        for list_m, batch in data_generator_ts:
            for b, m in zip(batch, list_m):
                if len(b) == 1:
                    ehr_subseq[m] = b.tolist()
                else:
                    seq = []
                    for vec in b.tolist():
                        seq.extend(vec)
                    nseq, nleft = divmod(len(seq), ut.len_padded)
                    if nleft > 0:
                        seq = seq + [0] * (ut.len_padded - nleft)
                    for i in range(0, len(seq) - ut.len_padded + 1,
                                   ut.len_padded):
                        ehr_subseq.setdefault(m, list()).append(
                            seq[i:i + ut.len_padded])
        with open(outfile, 'w') as f:
            wr = csv.writer(f)
            wr.writerow(["MRN", "EHRsubseq"])
            for m, subseq in ehr_subseq.items():
                for seq in subseq:
                    wr.writerow([m] + list(filter(lambda x: x != 0, seq)))

    return
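
# The subsequence-chopping loop above is written out twice, once for the
# training generator and once for the test generator. A possible refactoring
# into a single helper is sketched below; the name `chop_into_subseqs` is
# hypothetical, but the padding and splitting behaviour mirrors the inline
# code.
def chop_into_subseqs(data_generator, len_padded):
    ehr_subseq = {}
    for list_m, batch in data_generator:
        for b, m in zip(batch, list_m):
            if len(b) == 1:
                ehr_subseq[m] = b.tolist()
                continue
            # flatten the patient's subsequences into one long sequence
            seq = []
            for vec in b.tolist():
                seq.extend(vec)
            # pad to a multiple of len_padded, then split into fixed-length chunks
            _, nleft = divmod(len(seq), len_padded)
            if nleft > 0:
                seq = seq + [0] * (len_padded - nleft)
            for i in range(0, len(seq) - len_padded + 1, len_padded):
                ehr_subseq.setdefault(m, list()).append(seq[i:i + len_padded])
    return ehr_subseq
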
# NOTE: this snippet restores the best saved model for encoding the test
# cohort; the enclosing function name and signature are assumptions
# reconstructed from the free names the snippet uses, not from the original
# code.
def restore_best_model(indir, vocab_size, vocab, embs=None, sampling=None):
    # set random seed for experiment reproducibility
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # load data
    data_ts = EHRdata(os.path.join(indir), ut.dt_files['ehr-file-test'], sampling)
    data_generator_ts = DataLoader(
        data_ts,
        ut.model_param['batch_size'],  # may need to reduce this
        shuffle=False,
        collate_fn=ehr_collate)
    print("Test cohort size: {0}".format(len(data_ts)))

    # define model and optimizer
    model = net.ehrEncoding(
        vocab_size=vocab_size,
        max_seq_len=ut.len_padded,  # 32
        emb_size=ut.model_param['embedding_size'],  # 100
        kernel_size=ut.model_param['kernel_size'],  # 5
        pre_embs=embs,
        vocab=vocab)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=ut.model_param['learning_rate'],
                                 weight_decay=ut.model_param['weight_decay'])

    model.cuda()

    loss_fn = net.criterion

    # use model from checkpoint
    checkpoint = torch.load(os.path.join(indir, 'encodings', 'best_model.pt'))
    model.load_state_dict(checkpoint['model_state_dict'])
    return model, data_generator_ts, loss_fn, optimizer
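
# A sketch of how the restored model could be used to encode the test cohort.
# `encode_test_cohort` is a hypothetical helper, and the unpacking of the
# forward pass as (reconstruction, encoding) is an assumption about
# net.ehrEncoding, not something shown in these snippets.
def encode_test_cohort(model, data_generator_ts):
    import torch
    model.eval()
    mrn, encoded = [], []
    with torch.no_grad():
        for list_m, batch in data_generator_ts:
            for b, m in zip(batch, list_m):
                b = b.cuda()
                _, enc = model(b)  # assumed output: (reconstruction, encoding)
                # average the subsequence encodings into one patient vector
                encoded.append(enc.mean(dim=0).cpu().tolist())
                mrn.append(m)
    return mrn, encoded

# Example use (hypothetical):
#   model, gen_ts, _, _ = restore_best_model(indir, vocab_size, vocab, embs)
#   mrn, encoded = encode_test_cohort(model, gen_ts)
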
def learn_patient_representations(indir,
                                  outdir,
                                  disease_dt,
                                  eval_baseline=False,
                                  sampling=None,
                                  emb_filename=None):

    # experiment folder with date and time to save the representations
    exp_dir = os.path.join(
        outdir, '-'.join([
            disease_dt,
            datetime.now().strftime('%Y-%m-%d-%H-%M-%S'), 'w2v-nobn-softplus'
        ]))
    os.makedirs(exp_dir)

    # get the vocabulary size
    fvocab = os.path.join(indir, ut.dt_files['vocab'])
    with open(fvocab) as f:
        rd = csv.reader(f)
        next(rd)
        vocab = {}
        for r in rd:
            tkn = r[0].split('::')
            tkn[1] = tkn[1].capitalize()
            vocab[int(r[1])] = '::'.join(tkn)
        vocab_size = len(vocab) + 1
    print('Vocabulary size: {0}'.format(vocab_size))

    # load pre-computed embeddings
    if emb_filename is not None:
        model = Word2Vec.load(emb_filename)
        embs = model.wv
        del model
        print('Loaded pre-computed embeddings for {0} concepts'.format(
            len(embs.vocab)))
    else:
        embs = None

    # set random seed for experiment reproducibility
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # load data
    data = EHRdata(indir, ut.dt_files['ehr'], sampling)
    data_generator = DataLoader(data,
                                ut.model_param['batch_size'],
                                shuffle=True,
                                collate_fn=ehr_collate)

    print('Cohort Size: {0} -- Max Sequence Length: {1}\n'.format(
        len(data), ut.len_padded))

    # define model and optimizer
    print('Learning rate: {0}'.format(ut.model_param['learning_rate']))
    print('Batch size: {0}'.format(ut.model_param['batch_size']))
    print('Kernel size: {0}\n'.format(ut.model_param['kernel_size']))

    model = net.ehrEncoding(vocab_size=vocab_size,
                            max_seq_len=ut.len_padded,
                            emb_size=ut.model_param['embedding_size'],
                            kernel_size=ut.model_param['kernel_size'],
                            pre_embs=embs,
                            vocab=vocab)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=ut.model_param['learning_rate'],
                                 weight_decay=ut.model_param['weight_decay'])

    # training and evaluation
    if torch.cuda.device_count() > 1:
        print('No. of GPUs: {0}\n'.format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    else:
        model.cuda()
        print('No. of GPUs: 1\n')

    # model.cuda()
    loss_fn = net.criterion
    print('Training for {} epochs\n'.format(ut.model_param['num_epochs']))
    mrn, encoded, metrics_avg = train_and_evaluate(model, data_generator,
                                                   loss_fn, optimizer,
                                                   net.metrics, exp_dir)

    # save results

    # encoded vectors (representations)
    outfile = os.path.join(exp_dir, 'encoded_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerows(encoded)

    # MRNs to keep track of the order
    outfile = os.path.join(exp_dir, 'mrns.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        for m in mrn:
            wr.writerow([m])

    # metrics (loss and accuracy)
    outfile = os.path.join(exp_dir, 'metrics.txt')
    with open(outfile, 'w') as f:
        f.write('Mean loss: %.3f\n' % metrics_avg['loss'])
        f.write('Accuracy: %.3f\n' % metrics_avg['accuracy'])

    # evaluate clustering
    gt_file = os.path.join(indir, ut.dt_files['diseases'])
    gt_disease = clu.load_mrn_disease(gt_file)
    min_clu = 2
    max_clu = 10

    if eval_baseline:
        print('\nRunning clustering on the TF-IDF vectors')
        datafile = os.path.join(indir, ut.dt_files['ehr'])
        mrn_idx, svd_mtx = clu.svd_tfidf(datafile, vocab_size)
        gt_disease_raw = [gt_disease[m][0] for m in mrn_idx]
        clu.eval_hierarchical_clustering(svd_mtx, gt_disease_raw, min_clu,
                                         max_clu)

    print('\nRunning clustering on the encoded vectors')
    gt_disease_enc = [gt_disease[m][0] for m in mrn]
    clu.eval_hierarchical_clustering(encoded,
                                     gt_disease_enc,
                                     min_clu,
                                     max_clu,
                                     preproc=True)

    return
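
# `clu.eval_hierarchical_clustering` is not shown in these examples. The
# sketch below illustrates the kind of evaluation its call suggests: sweep the
# number of clusters, cut an agglomerative clustering of the encoded vectors,
# and score it against the ground-truth disease labels. The use of
# scikit-learn and of normalized mutual information as the score are
# assumptions, not the original implementation.
def eval_hierarchical_clustering_sketch(vectors, gt_labels, min_clu, max_clu):
    import numpy as np
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.metrics import normalized_mutual_info_score

    X = np.asarray(vectors, dtype=float)
    for n_clu in range(min_clu, max_clu + 1):
        pred = AgglomerativeClustering(n_clusters=n_clu,
                                       linkage='ward').fit_predict(X)
        score = normalized_mutual_info_score(gt_labels, pred)
        print('n_clusters={0} -- NMI={1:.3f}'.format(n_clu, score))
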
def learn_patient_representations(
        indir,
        test_set=False,
        sampling=None,
        emb_filename=None
):
    # encodings folder to save the representations
    exp_dir = os.path.join(indir, 'encodings')
    if test_set:
        exp_dir = os.path.join(indir, 'encodings', 'test')

    os.makedirs(exp_dir, exist_ok=True)

    # get the vocabulary size
    vocab_size, vocab = vocabulary.get_vocab(indir)

    # load pre-computed embeddings
    if emb_filename is not None:
        model = Word2Vec.load(emb_filename)
        embs = model.wv
        del model
        print('Loaded pre-computed embeddings for {0} concepts'.format(
            len(embs.vocab)))
    else:
        embs = None

    # set random seed for experiment reproducibility
    torch.manual_seed(123)
    torch.cuda.manual_seed(123)

    # load data
    data_tr = EHRdata(os.path.join(indir), ut.dt_files['ehr-file'], sampling)
    data_generator_tr = DataLoader(
        data_tr,
        ut.model_param['batch_size'],
        shuffle=True,
        collate_fn=ehr_collate
    )

    if test_set:
        data_ts = EHRdata(os.path.join(indir), ut.dt_files['ehr-file-test'], sampling)

        data_generator_ts = DataLoader(
            data_ts,
            ut.model_param['batch_size'],
            shuffle=True,
            collate_fn=ehr_collate
        )
        print("Test cohort size: {0}".format(len(data_ts)))
    else:
        data_generator_ts = data_generator_tr

    print('Training cohort size: {0}\n'.format(len(data_tr)))
    print('Max Sequence Length: {0}\n'.format(ut.len_padded))
    print('Learning rate: {0}'.format(ut.model_param['learning_rate']))
    print('Batch size: {0}'.format(ut.model_param['batch_size']))
    print('Kernel size: {0}\n'.format(ut.model_param['kernel_size']))

    # define model and optimizer
    model = net.ehrEncoding(
        vocab_size=vocab_size,
        max_seq_len=ut.len_padded,  # 32
        emb_size=ut.model_param['embedding_size'],  # 100
        kernel_size=ut.model_param['kernel_size'],  # 5
        pre_embs=embs,
        vocab=vocab
    )

    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=ut.model_param['learning_rate'],
        weight_decay=ut.model_param['weight_decay']
    )

    # model.cuda()
    if torch.cuda.device_count() > 1:
        print('No. of GPUs: {0}\n'.format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    else:
        model.cuda()
        print('No. of GPUs: 1\n')

    loss_fn = net.criterion
    print('Training for {} epochs\n'.format(ut.model_param['num_epochs']))

    #only train
    train_and_evaluate(
        model,
        data_generator_tr,
        data_generator_ts,
        loss_fn,
        optimizer,
        net.metrics,
        exp_dir
    )

    # uncomment the block below to train AND evaluate
    # will take a really, really long time
    # training and evaluation
    # results of best model are saved to outdir/best_model.pt in this function
    '''
    mrn, encoded, encoded_avg, metrics_avg = train_and_evaluate(
        model,
        data_generator_tr,
        data_generator_ts,
        loss_fn,
        optimizer,
        net.metrics,
        exp_dir
    )

    # save encodings
    # encoded vectors (representations)
    outfile = os.path.join(exp_dir, 'convae_avg_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-AVG"])
        for m, e in zip(mrn, encoded_avg):
            wr.writerow([m] + list(e))

    outfile = os.path.join(exp_dir, 'convae_vect.csv')
    with open(outfile, 'w') as f:
        wr = csv.writer(f)
        wr.writerow(["MRN", "ENCODED-SUBSEQ"])
        for m, evs in zip(mrn, encoded):
            for e in evs:
                wr.writerow([m] + e)

    # metrics (loss and accuracy)
    outfile = os.path.join(exp_dir, 'metrics.txt')
    with open(outfile, 'w') as f:
        f.write('Mean loss: %.3f\n' % metrics_avg['loss'])
        f.write('Accuracy: %.3f\n' % metrics_avg['accuracy'])

    # chop patient sequences into fixed subsequences of length L
    # L = ut.len_padded = 32
    # presumably here to write a human-readable version of how the patient records are split into subsequences
    outfile = os.path.join(exp_dir, 'cohort_ehr_subseq{0}.csv'.format(ut.len_padded))
    write_ehr_subseq(data_generator_tr, outfile)

    if test_set:
        outfile = os.path.join(exp_dir, 'test_cohort_ehr_subseq{0}.csv'.format(ut.len_padded))
        write_ehr_subseq(data_generator_ts, outfile)
    '''
    return
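
# A possible command-line entry point for the last variant above; the flag
# names and defaults are chosen for illustration and are not part of the
# original code.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Learn patient representations with the ConvAE model')
    parser.add_argument('indir',
                        help='folder with the EHR sequence and vocabulary files')
    parser.add_argument('--test-set', action='store_true',
                        help='also encode the held-out test cohort')
    parser.add_argument('--sampling', default=None,
                        help='optional sampling parameter passed to EHRdata')
    parser.add_argument('--emb-filename', default=None,
                        help='pre-trained word2vec embeddings file')
    args = parser.parse_args()

    learn_patient_representations(args.indir,
                                  test_set=args.test_set,
                                  sampling=args.sampling,
                                  emb_filename=args.emb_filename)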