Example #1
def _main():
    # read preprocessed data from disk
    with open('data_preprocessed.pickle', 'rb') as data_file:
        data_dict = pickle.load(data_file)

    # remove all but the most important features
    feature_indices = find_most_useful_feature_indices(data_dict['data'],
                                                       data_dict['labels'],
                                                       200)
    data = data_dict['data'][:, feature_indices]
    print(data.shape)

    clustering = SpectralClustering(n_clusters=NUMBER_OF_CLUSTERS,
                                    n_components=NUMBER_OF_CLUSTERS)
    # for instance i, cluster_indices[i] is the index of the cluster the instance is in
    cluster_indices = clustering.fit_predict(data)

    # restructure the cluster information so that clusters[i] contains the indices of all instances in cluster i
    clusters = []
    for i in range(NUMBER_OF_CLUSTERS):
        clusters.append(np.argwhere(cluster_indices == i).flatten())

    # replace indices with respective subreddit names
    subreddit_names = get_subreddit_names()
    clusters_with_names = [[subreddit_names[sub] for sub in cluster]
                           for cluster in clusters]

    print("Clusters:")
    for i, cluster in enumerate(clusters_with_names):
        print(f"{i}: {', '.join(cluster)}")

    # evaluate clustering
    print_metrics(cluster_indices.tolist(), data_dict['labels'])
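All of the examples on this page funnel into the same pair of calls: evaluation.all_metrics(...) followed by evaluation.print_metrics(...). The stand-ins below are a minimal, self-contained sketch of that calling convention on synthetic data; the real evaluation module (not shown on this page) computes a larger metric set, so treat the function bodies as illustrative only.

# Illustrative stand-ins only: they mimic the calling convention of the
# evaluation module used in these examples, not its full metric set.
import numpy as np

def all_metrics(yhat, y, k=None, yhat_raw=None):
    """yhat, y: (n, L) binary arrays; yhat_raw: (n, L) scores for ranking."""
    metrics = {}
    tp = (yhat * y).sum()
    metrics['prec_micro'] = tp / max(yhat.sum(), 1)
    metrics['rec_micro'] = tp / max(y.sum(), 1)
    p, r = metrics['prec_micro'], metrics['rec_micro']
    metrics['f1_micro'] = 2 * p * r / max(p + r, 1e-10)
    if yhat_raw is not None and k is not None:
        for kk in ([k] if np.isscalar(k) else k):
            # indices of the kk highest-scoring labels per instance
            topk = np.argsort(yhat_raw, axis=1)[:, -kk:]
            metrics['prec_at_%d' % kk] = np.take_along_axis(y, topk, axis=1).mean()
    return metrics

def print_metrics(metrics):
    for name in sorted(metrics):
        print('%s: %.4f' % (name, metrics[name]))

# usage with synthetic data
rng = np.random.default_rng(0)
y = (rng.random((8, 10)) < 0.3).astype(float)
yhat_raw = rng.random((8, 10))
print_metrics(all_metrics(np.round(yhat_raw), y, k=5, yhat_raw=yhat_raw))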
Example #2
def on_epoch_end(self, epoch, logs=None):
    yhat_raw = model.predict_generator(gen_slim_test, steps=test_samples)
    yhat = np.round(yhat_raw)
    # get metrics
    k = 5
    metrics = evaluation.all_metrics(yhat=yhat,
                                     y=test_y,
                                     k=k,
                                     yhat_raw=yhat_raw)
    evaluation.print_metrics(metrics)
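This fragment leans on names from its enclosing scope (model, gen_slim_test, test_samples, test_y, evaluation). A sketch of the same hook as a self-contained Keras callback; the generator, label matrix, and evaluation module remain assumptions, and the deprecated predict_generator is replaced by model.predict:

import numpy as np
from tensorflow import keras

class EvalMetricsCallback(keras.callbacks.Callback):
    """Assumed names: test_gen yields input batches, test_y is the (n, L)
    gold label matrix, evaluation is the same external module the examples
    on this page use."""

    def __init__(self, test_gen, test_steps, test_y):
        super().__init__()
        self.test_gen = test_gen
        self.test_steps = test_steps
        self.test_y = test_y

    def on_epoch_end(self, epoch, logs=None):
        # model.predict accepts generators; predict_generator is deprecated
        yhat_raw = self.model.predict(self.test_gen, steps=self.test_steps)
        yhat = np.round(yhat_raw)  # threshold sigmoid outputs at 0.5
        metrics = evaluation.all_metrics(yhat=yhat, y=self.test_y,
                                         k=5, yhat_raw=yhat_raw)
        evaluation.print_metrics(metrics)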
Example #3
def test(model, Y, epoch, data_path, fold, gpu, version, code_inds, dicts,
         samples, model_dir, testing):
    """
        Testing loop.
        Returns metrics
    """
    filename = data_path.replace('train', fold)
    print('file for evaluation: %s' % filename)
    num_labels = len(dicts['ind2c'])

    #initialize stuff for saving attention samples
    if samples:
        tp_file = open('%s/tp_%s_examples_%d.txt' % (model_dir, fold, epoch),
                       'w')
        fp_file = open('%s/fp_%s_examples_%d.txt' % (model_dir, fold, epoch),
                       'w')
        window_size = model.conv.weight.data.size()[2]

    y, yhat, yhat_raw, hids, losses = [], [], [], [], []
    ind2w, w2ind = dicts['ind2w'], dicts['w2ind']
    ind2c, c2ind = dicts['ind2c'], dicts['c2ind']

    desc_embed = model.lmbda > 0
    if desc_embed and len(code_inds) > 0:
        unseen_code_vecs(model, code_inds, dicts, gpu)

    model.eval()
    gen = datasets.data_generator(filename,
                                  dicts,
                                  1,
                                  num_labels,
                                  version=version,
                                  desc_embed=desc_embed)
    for batch_idx, tup in tqdm(enumerate(gen)):
        data, target, hadm_ids, _, descs = tup
        # Variable(..., volatile=True) is legacy PyTorch; plain tensors suffice
        data = torch.LongTensor(data)
        target = torch.FloatTensor(target)
        if gpu:
            data = data.cuda()
            target = target.cuda()
        model.zero_grad()

        if desc_embed:
            desc_data = descs
        else:
            desc_data = None

        #get an attention sample for 2% of batches
        get_attn = samples and (np.random.rand() < 0.02 or
                                (fold == 'test' and testing))
        with torch.no_grad():
            output, loss, alpha = model(data,
                                        target,
                                        desc_data=desc_data,
                                        get_attention=get_attn)

        output = torch.sigmoid(output)  # F.sigmoid is deprecated
        output = output.data.cpu().numpy()
        losses.append(loss.item())
        target_data = target.data.cpu().numpy()
        if get_attn and samples:
            interpret.save_samples(data,
                                   output,
                                   target_data,
                                   alpha,
                                   window_size,
                                   epoch,
                                   tp_file,
                                   fp_file,
                                   dicts=dicts)

        #save predictions, target, hadm ids
        yhat_raw.append(output)
        output = np.round(output)
        y.append(target_data)
        yhat.append(output)
        hids.extend(hadm_ids)

    #close files if needed
    if samples:
        tp_file.close()
        fp_file.close()

    y = np.concatenate(y, axis=0)
    yhat = np.concatenate(yhat, axis=0)
    yhat_raw = np.concatenate(yhat_raw, axis=0)

    #write the predictions
    preds_file = persistence.write_preds(yhat, model_dir, hids, fold, ind2c,
                                         yhat_raw)
    #get metrics
    k = 5 if num_labels == 50 else [8, 15]
    metrics = evaluation.all_metrics(yhat, y, k=k, yhat_raw=yhat_raw)
    evaluation.print_metrics(metrics)
    metrics['loss_%s' % fold] = np.mean(losses)
    return metrics
Example #4
def test(model, epoch, batch_size, data_path, fold, gpu, dicts, samples,
         model_dir, testing, debug):
    """
        Testing loop.
        Returns metrics
    """
    filename = data_path.replace('train', fold)
    print('file for evaluation: %s' % filename)

    #    num_labels = tools.get_num_labels(Y, version)

    #initialize stuff for saving attention samples
    if samples:
        tp_file = open('%s/tp_%s_examples_%d.txt' % (model_dir, fold, epoch),
                       'w')
        fp_file = open('%s/fp_%s_examples_%d.txt' % (model_dir, fold, epoch),
                       'w')
        window_size = model.conv.weight.data.size()[2]

    y, yhat, yhat_raw, hids, losses = [], [], [], [], []

    #    ind2w, w2ind, ind2c, c2ind = dicts[0], dicts[1], dicts[2], dicts[3]
    ind2w, w2ind = dicts[0], dicts[1]

    #    desc_embed = model.lmbda > 0
    #    if desc_embed and len(code_inds) > 0:
    #        unseen_code_vecs(model, code_inds, dicts)

    model.eval()
    gen = datasets.data_generator(filename, dicts, batch_size)
    for batch_idx, tup in tqdm(enumerate(gen)):
        if debug and batch_idx > 50:
            break

        # data, target, hadm_ids, _, descs = tup
        data, target, hadm_ids = tup
        # Variable(..., volatile=True) is legacy PyTorch; plain tensors suffice
        data = torch.LongTensor(data)
        target = torch.FloatTensor(target)
        if gpu:
            data = data.cuda()
            target = target.cuda()

        model.zero_grad()

        #        if desc_embed:
        #            desc_data = descs
        #        else:
        #            desc_data = None

        #        get_attn = samples and (np.random.rand() < 0.02 or (fold == 'test' and testing))

        #        output, loss, alpha = model(data, target, desc_data=desc_data, get_attention=get_attn)
        with torch.no_grad():
            output, loss, alpha = model(data, target)

        output = output.data.cpu().numpy()
        losses.append(loss.item())  # loss.data[0] fails on 0-dim tensors in modern PyTorch
        target_data = target.data.cpu().numpy()

        #        if get_attn and samples:
        #            interpret.save_samples(data, output, target_data, alpha, window_size, epoch, tp_file, fp_file, freq_params[0], dicts=dicts)

        #save predictions, target, hadm ids
        yhat_raw.append(output)  # NEED TO KNOW FORM OF OUTPUT
        output = np.round(output)
        y.append(target_data)
        yhat.append(output)
        hids.extend(hadm_ids)

    if samples:
        tp_file.close()
        fp_file.close()

    y = np.concatenate(y, axis=0)
    yhat = np.concatenate(yhat, axis=0)
    yhat_raw = np.concatenate(yhat_raw, axis=0)

    print("y shape: " + str(y.shape))
    print("yhat shape: " + str(yhat.shape))

    #write the predictions
    #   preds_file = persistence.write_preds(yhat, model_dir, hids, fold, ind2c, yhat_raw)
    preds_file = persistence.write_preds(yhat, model_dir, hids, fold, yhat_raw)

    #get metrics
    #    k = 5 if num_labels == 50 else 8

    #    metrics = evaluation.all_metrics(yhat, y, k=k, yhat_raw=yhat_raw)
    metrics = evaluation.all_metrics(yhat, y, yhat_raw=yhat_raw)
    evaluation.print_metrics(metrics)
    metrics['loss_%s' % fold] = np.mean(losses)
    return metrics
Example #5
def main(Y, train_fname, dev_fname, vocab_file, version, n):
    n = int(n)

    #need to handle really large text fields
    csv.field_size_limit(sys.maxsize)

    #get lookups from non-BOW data
    if '_bows' in train_fname:
        data_path = train_fname.replace('_bows', '')
    else:
        data_path = train_fname
    dicts = datasets.load_lookups(data_path,
                                  vocab_file=vocab_file,
                                  Y=Y,
                                  version=version)
    w2ind, ind2c, c2ind = dicts['w2ind'], dicts['ind2c'], dicts['c2ind']

    X, yy_tr, hids_tr = read_bows(Y, train_fname, c2ind, version)
    X_dv, yy_dv, hids_dv = read_bows(Y, dev_fname, c2ind, version)

    print("X.shape: " + str(X.shape))
    print("yy_tr.shape: " + str(yy_tr.shape))
    print("X_dv.shape: " + str(X_dv.shape))
    print("yy_dv.shape: " + str(yy_dv.shape))

    #deal with labels that don't have any positive examples
    #drop empty columns from yy. keep track of which columns kept
    #predict on test data with those columns. guess 0 on the others
    labels_with_examples = yy_tr.sum(axis=0).nonzero()[0]
    yy = yy_tr[:, labels_with_examples]

    # build the classifier
    clf = OneVsRestClassifier(LogisticRegression(C=C,
                                                 max_iter=MAX_ITER,
                                                 solver='sag'),
                              n_jobs=-1)

    # train
    print("training...")
    clf.fit(X, yy)

    #predict
    print("predicting...")
    yhat = clf.predict(X_dv)
    yhat_raw = clf.predict_proba(X_dv)

    #deal with labels that don't have positive training examples
    print("reshaping output to deal with labels missing from train set")
    labels_with_examples = set(labels_with_examples)
    yhat_full = np.zeros(yy_dv.shape)
    yhat_full_raw = np.zeros(yy_dv.shape)
    j = 0
    for i in range(yhat_full.shape[1]):
        if i in labels_with_examples:
            yhat_full[:, i] = yhat[:, j]
            yhat_full_raw[:, i] = yhat_raw[:, j]
            j += 1

    #evaluate
    metrics, fpr, tpr = evaluation.all_metrics(yhat_full,
                                               yy_dv,
                                               k=[8, 15],
                                               yhat_raw=yhat_full_raw)
    evaluation.print_metrics(metrics)

    #save metric history, model, params
    print("saving predictions")
    model_dir = os.path.join(
        MODEL_DIR,
        '_'.join(["log_reg",
                  time.strftime('%b_%d_%H:%M', time.localtime())]))
    os.mkdir(model_dir)
    preds_file = tools.write_preds(yhat_full, model_dir, hids_dv, 'test',
                                   yhat_full_raw)

    print("sanity check on train")
    yhat_tr = clf.predict(X)
    yhat_tr_raw = clf.predict_proba(X)

    #reshape output again
    yhat_tr_full = np.zeros(yy_tr.shape)
    yhat_tr_full_raw = np.zeros(yy_tr.shape)
    j = 0
    for i in range(yhat_tr_full.shape[1]):
        if i in labels_with_examples:
            yhat_tr_full[:, i] = yhat_tr[:, j]
            yhat_tr_full_raw[:, i] = yhat_tr_raw[:, j]
            j += 1

    #evaluate
    metrics_tr, fpr_tr, tpr_tr = evaluation.all_metrics(
        yhat_tr_full, yy_tr, k=[8, 15], yhat_raw=yhat_tr_full_raw)
    evaluation.print_metrics(metrics_tr)

    if n > 0:
        print("generating top important ngrams")
        if 'bows' in dev_fname:
            dev_fname = dev_fname.replace('_bows', '')
        print("calculating top ngrams using file %s" % dev_fname)
        calculate_top_ngrams(dev_fname, clf, c2ind, w2ind,
                             labels_with_examples, n)

    #Commenting this out because the models are huge (11G for mimic3 full)
    #print("saving model")
    #with open("%s/model.pkl" % model_dir, 'wb') as f:
    #    pickle.dump(clf, f)

    print("saving metrics")
    metrics_hist = defaultdict(list)
    metrics_hist_tr = defaultdict(list)
    for name in metrics.keys():
        metrics_hist[name].append(metrics[name])
    for name in metrics_tr.keys():
        metrics_hist_tr[name].append(metrics_tr[name])
    metrics_hist_all = (metrics_hist, metrics_hist, metrics_hist_tr)
    persistence.save_metrics(metrics_hist_all, model_dir)
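A note on the reshaping loops in this example: because labels_with_examples starts out as the sorted index array returned by nonzero(), each column-by-column copy can be collapsed into one fancy-indexing assignment. A self-contained sketch with synthetic stand-ins for the example's variables:

import numpy as np

# synthetic stand-ins shaped like Example #5's variables
yy_tr = (np.random.rand(100, 20) < 0.2).astype(int)   # train label matrix
yy_dv = (np.random.rand(30, 20) < 0.2).astype(int)    # dev label matrix
kept_cols = yy_tr.sum(axis=0).nonzero()[0]            # labels with positives
yhat = (np.random.rand(30, len(kept_cols)) < 0.5).astype(int)
yhat_raw = np.random.rand(30, len(kept_cols))

# one fancy-indexing assignment replaces the column-by-column loop;
# columns without training examples stay 0, matching the example's logic
yhat_full = np.zeros(yy_dv.shape)
yhat_full_raw = np.zeros(yy_dv.shape)
yhat_full[:, kept_cols] = yhat
yhat_full_raw[:, kept_cols] = yhat_raw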
Example #6
# assumes golds/preds dicts keyed by hadm_id, plus model_dir, num_labels,
# ind2c, and Y defined in the surrounding script
with open('%s/code_scores_test.json' % model_dir, 'r') as f:
    scores = json.load(f)

hadm_ids = sorted(set(golds.keys()).intersection(set(preds.keys())))
yhat = np.zeros((len(hadm_ids), num_labels))
yhat_raw = np.zeros((len(hadm_ids), num_labels))
y = np.zeros((len(hadm_ids), num_labels))

for i, hadm_id in tqdm(enumerate(hadm_ids)):
    yhat_inds = [1 if j in preds[hadm_id] else 0 for j in range(num_labels)]
    yhat_raw_inds = [scores[hadm_id].get(ind2c[j], 0) for j in range(num_labels)]
    gold_inds = [1 if j in golds[hadm_id] else 0 for j in range(num_labels)]
    yhat[i] = yhat_inds
    yhat_raw[i] = yhat_raw_inds
    y[i] = gold_inds

metrics = evaluation.all_metrics(yhat, y)
k = 5 if Y == 50 else 8
prec_at_k = evaluation.precision_at_k(yhat_raw, y, k)
metrics['prec_at_%d' % k] = prec_at_k
rec_at_k = evaluation.recall_at_k(yhat_raw, y, k)
metrics['rec_at_%d' % k] = rec_at_k

evaluation.print_metrics(metrics)
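The two ranking metrics this snippet tacks on are simple to state: precision@k is the mean fraction of each instance's k highest-scoring labels that are gold, and recall@k is the mean fraction of an instance's gold labels recovered in that top k. A generic sketch (the library's exact tie-breaking may differ):

import numpy as np

def precision_at_k(yhat_raw, y, k):
    # indices of the k highest-scoring labels per instance
    topk = np.argsort(yhat_raw, axis=1)[:, -k:]
    return np.take_along_axis(y, topk, axis=1).mean()

def recall_at_k(yhat_raw, y, k):
    topk = np.argsort(yhat_raw, axis=1)[:, -k:]
    hits = np.take_along_axis(y, topk, axis=1).sum(axis=1)
    return (hits / np.maximum(y.sum(axis=1), 1)).mean()  # guard empty gold sets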
Example #7
def test(model, Y, epoch, dataset, batch_size, embed_desc, fold, gpu, dicts,
         model_dir):
    """
        Testing loop.
        Returns metrics
    """

    print('file for evaluation: %s' % fold)

    docs, attention, y, yhat, yhat_raw, hids, losses = [], [], [], [], [], [], []

    y_coarse, yhat_coarse, yhat_coarse_raw = [], [], []

    ind2w, w2ind = dicts['ind2w'], dicts['w2ind']
    ind2c, c2ind = dicts['ind2c'], dicts['c2ind']
    desc = dicts['desc']

    model.eval()

    gen = DataLoader(dataset,
                     batch_size=batch_size,
                     shuffle=False,
                     num_workers=num_workers,
                     collate_fn=collate)

    desc_data = desc
    if embed_desc and gpu:
        desc_data = desc_data.cuda()

    t = tqdm(gen, total=len(gen), ncols=0, file=sys.stdout)
    for batch_idx, tup in enumerate(t):
        data, target, target_coarse, hadm_ids, data_text = tup

        if gpu:
            data = data.cuda()
            target = target.cuda()
            target_coarse = target_coarse.cuda()

        model.zero_grad()

        if model.hier:
            output, loss, alpha = model(data,
                                        target,
                                        target_coarse,
                                        desc_data=desc_data)
        else:
            output, loss, alpha = model(data, target, desc_data=desc_data)

        if model.hier:
            output, output_coarse = output
            output_coarse = output_coarse.data.cpu().numpy()
            alpha, alpha_coarse = alpha
        else:
            output_coarse = np.zeros([len(output), len(dicts['ind2c_coarse'])])
            for i, y_hat_raw_ in enumerate(output.data.cpu().numpy()):
                if len(np.nonzero(np.round(y_hat_raw_))[0]) == 0:
                    continue
                codes = [
                    str(dicts['ind2c'][ind])
                    for ind in np.nonzero(np.round(y_hat_raw_))[0]
                ]
                codes_coarse = set(str(code).split('.')[0] for code in codes)
                codes_coarse_idx = [
                    dicts['c2ind_coarse'][code_coarse]
                    for code_coarse in codes_coarse
                ]
                output_coarse[i, codes_coarse_idx] = 1

        target_coarse_data = target_coarse.data.cpu().numpy()
        y_coarse.append(target_coarse_data)
        yhat_coarse_raw.append(output_coarse)
        yhat_coarse.append(np.round(output_coarse))

        losses.append(loss.item())
        target_data = target.data.cpu().numpy()

        del data, loss

        #if fold == 'test':
        ##alpha, _ = torch.max(torch.round(output).unsqueeze(-1).expand_as(alpha) * alpha, 1)
        ##alpha = (torch.round(output).byte() | target.byte()).unsqueeze(-1).expand_as(alpha).type('torch.cuda.FloatTensor') * alpha
        #    alpha = [a for a in [a_m for a_m in alpha.data.cpu().numpy()]]
        #else:
        #    alpha = []

        del target

        output = output.data.cpu().numpy()

        #save predictions, target, hadm ids
        yhat_raw.append(output)
        yhat.append(np.round(output))
        y.append(target_data)

        hids.extend(hadm_ids)
        docs.extend(data_text)
        code_cols = [dicts['c2ind'][c] for c in persistence.get_codes()]
        attention.extend(alpha[:, code_cols].cpu())

        t.set_postfix(loss=np.mean(losses))

    k = 5 if len(ind2c) == 50 else [8, 15]

    y_coarse = np.concatenate(y_coarse, axis=0)
    yhat_coarse = np.concatenate(yhat_coarse, axis=0)
    yhat_coarse_raw = np.concatenate(yhat_coarse_raw, axis=0)
    metrics_coarse, _, _ = evaluation.all_metrics(yhat_coarse,
                                                  y_coarse,
                                                  k=k,
                                                  yhat_raw=yhat_coarse_raw,
                                                  level='coarse')
    evaluation.print_metrics(metrics_coarse, level='coarse')

    y = np.concatenate(y, axis=0)
    yhat = np.concatenate(yhat, axis=0)
    yhat_raw = np.concatenate(yhat_raw, axis=0)

    #get metrics
    metrics, metrics_codes, metrics_inst = evaluation.all_metrics(
        yhat, y, k=k, yhat_raw=yhat_raw, level='fine')
    evaluation.print_metrics(metrics, level='fine')
    metrics['loss'] = np.mean(losses)
    metrics.update(metrics_coarse)

    #write the predictions
    if fold == 'test':
        persistence.write_preds(hids, docs, attention, y, yhat, yhat_raw,
                                metrics_inst, model_dir, fold, ind2c, c2ind,
                                dicts['desc_plain'])

    return metrics, metrics_codes, metrics_inst, hids
Example #8
def test(args, model, Y, epoch, data_path, fold, gpu, version, code_inds,
         dicts, samples, model_dir, testing):
    """
        Testing loop.
        Returns metrics
    """
    filename = data_path.replace('train', fold)
    print('file for evaluation: %s' % filename)
    num_labels = len(dicts['ind2c'])

    #initialize stuff for saving attention samples
    if samples:
        tp_file = open('%s/tp_%s_examples_%d.txt' % (model_dir, fold, epoch),
                       'w')
        fp_file = open('%s/fp_%s_examples_%d.txt' % (model_dir, fold, epoch),
                       'w')
        window_size = model.conv.weight.data.size()[2]

    y, yhat, yhat_raw, hids, losses = [], [], [], [], []
    ind2w, w2ind = dicts['ind2w'], dicts['w2ind']
    ind2c, c2ind = dicts['ind2c'], dicts['c2ind']

    desc_embed = model.lmbda > 0
    if desc_embed and len(code_inds) > 0:
        unseen_code_vecs(model, code_inds, dicts, gpu)

    if args.model == 'bert':
        if args.redefined_tokenizer:
            bert_tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path,
                                                           do_lower_case=True)
        else:
            bert_tokenizer = BertTokenizer.from_pretrained(
                './pretrained_weights/bert-base-uncased-vocab.txt',
                do_lower_case=True)
    elif args.model == 'biobert':
        if args.redefined_tokenizer:
            bert_tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path,
                                                           do_lower_case=False)
        else:
            bert_tokenizer = BertTokenizer.from_pretrained(
                './pretrained_weights/biobert_pretrain_output_all_notes_150000/vocab.txt',
                do_lower_case=False)
    elif args.model == 'bert-tiny':
        if args.redefined_tokenizer:
            bert_tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path,
                                                           do_lower_case=True)
        else:
            bert_tokenizer = BertTokenizer.from_pretrained(
                './pretrained_weights/bert-tiny-uncased-vocab.txt',
                do_lower_case=True)
    else:
        bert_tokenizer = None

    model.eval()
    gen = datasets.data_generator(filename,
                                  dicts,
                                  1,
                                  num_labels,
                                  version=version,
                                  desc_embed=desc_embed,
                                  bert_tokenizer=bert_tokenizer,
                                  test=True,
                                  max_seq_length=args.max_sequence_length)
    for batch_idx, tup in tqdm(enumerate(gen)):
        data, target, hadm_ids, _, descs = tup
        data, target = torch.LongTensor(data), torch.FloatTensor(target)
        if gpu:
            data = data.cuda()
            target = target.cuda()

        if desc_embed:
            desc_data = descs
        else:
            desc_data = None

        if args.model in ['bert', 'biobert', 'bert-tiny']:
            token_type_ids = (data > 0).long() * 0
            attention_mask = (data > 0).long()
            position_ids = torch.arange(data.size(1)).expand(
                data.size(0), data.size(1))
            if gpu:
                position_ids = position_ids.cuda()
            position_ids = position_ids * (data > 0).long()
        else:
            attention_mask = (data > 0).long()
            token_type_ids = None
            position_ids = None

        if args.model in BERT_MODEL_LIST:
            with torch.no_grad():
                output, loss = model(input_ids=data,
                                     token_type_ids=token_type_ids,
                                     attention_mask=attention_mask,
                                     position_ids=position_ids,
                                     labels=target,
                                     desc_data=desc_data,
                                     pos_labels=None)
            output = torch.sigmoid(output)
            output = output.data.cpu().numpy()
        else:
            # get_attn must be set before the forward pass that uses it;
            # sample attention for 2% of batches
            get_attn = samples and (np.random.rand() < 0.02 or
                                    (fold == 'test' and testing))

            with torch.no_grad():
                output, loss, alpha = model(data,
                                            target,
                                            desc_data=desc_data,
                                            get_attention=get_attn)

            output = torch.sigmoid(output)
            output = output.data.cpu().numpy()

            if get_attn and samples:
                interpret.save_samples(data,
                                       output,
                                       target.data.cpu().numpy(),
                                       alpha,
                                       window_size,
                                       epoch,
                                       tp_file,
                                       fp_file,
                                       dicts=dicts)

        losses.append(loss.item())
        target_data = target.data.cpu().numpy()

        #save predictions, target, hadm ids
        yhat_raw.append(output)
        output = np.round(output)
        y.append(target_data)
        yhat.append(output)
        hids.extend(hadm_ids)

    # close files if needed
    if samples:
        tp_file.close()
        fp_file.close()

    y = np.concatenate(y, axis=0)
    yhat = np.concatenate(yhat, axis=0)
    yhat_raw = np.concatenate(yhat_raw, axis=0)

    #write the predictions
    preds_file = persistence.write_preds(yhat, model_dir, hids, fold, ind2c,
                                         yhat_raw)
    #get metrics
    k = 5 if num_labels == 50 else [8, 15]
    metrics = evaluation.all_metrics(yhat, y, k=k, yhat_raw=yhat_raw)
    evaluation.print_metrics(metrics)
    metrics['loss_%s' % fold] = np.mean(losses)
    return metrics
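A closing note on Example #8: the BERT-style auxiliary inputs are all derived from the zero-padding convention (token id 0 means padding), so the block that builds them reduces to a few masked tensor ops. A sketch on a toy batch:

import torch

data = torch.tensor([[101, 2023, 3793, 102, 0, 0]])  # toy padded id batch
pad_mask = (data > 0).long()             # 1 for real tokens, 0 for padding
attention_mask = pad_mask
token_type_ids = torch.zeros_like(data)  # single-segment input
position_ids = (torch.arange(data.size(1))
                .expand(data.size(0), -1) * pad_mask)  # zeroed at pad positions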