Example #1
def main(args):
    feat, case_ids = load_features(args.src)
    lab = load_labels(args.labsrc)

    feat = drop_high_cor(feat, cor_thresh=0.8)
    print('Features after high cor drop')
    print(feat.head())

    run_tsne(feat, lab)
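
The helpers used above (`load_features`, `load_labels`, `drop_high_cor`, `run_tsne`) are defined elsewhere in the project. A minimal sketch of `drop_high_cor`, assuming it drops one column from every pair whose absolute correlation exceeds the threshold:

import numpy as np

def drop_high_cor(df, cor_thresh=0.8):
    """Drop one column from each pair correlated above cor_thresh (sketch)."""
    corr = df.corr().abs()
    # Inspect each pair only once via the upper triangle of the correlation matrix
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    to_drop = [c for c in upper.columns if (upper[c] > cor_thresh).any()]
    return df.drop(columns=to_drop)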
Example #2
def main(args):
    scores = pd.read_csv(args.scores_src, index_col=None, header=0)
    scores.drop(scores.columns[-1], inplace=True, axis=1)
    scores_caseids = scores['Surgical Number']
    scores_caseids = np.array([translate_sn2hash(x) for x in scores_caseids])
    drop_rows = np.squeeze(scores.index.values[scores_caseids == 'drop_me'])
    print('Dropping: ', drop_rows)
    scores['case_id'] = scores_caseids
    scores.drop(drop_rows, inplace=True)
    print(scores.head())
    print(scores.shape)

    features = pd.read_csv(args.feature_src, index_col=0)
    print('Features')
    print(features.head())
    print(features.shape)

    caseids = features['case_id'].values
    features.drop('case_id', axis=1, inplace=True)
    features = drop_high_cor(features, 0.8)
    remaining_features = features.columns

    indices = []
    feature_case_mean = []
    for cid in np.unique(caseids):
        cid_idx = caseids == cid
        f = features.loc[cid_idx, :].values
        fmean = np.mean(f, axis=0)
        print('{}:'.format(cid), fmean.shape)
        feature_case_mean.append(np.expand_dims(fmean, axis=0))
        indices.append(cid)
    
    features = pd.DataFrame(np.concatenate(feature_case_mean, axis=0), columns=remaining_features)
    features['case_id'] = indices
    print('Features grouped by case')
    print(features.head())
    print(features.shape)

    matching_indices = np.intersect1d(features['case_id'], scores['case_id'])
    print('Matched indices:', matching_indices, len(matching_indices))

    # Drop rows from features and scores -- then sort them
    drop_rows = [x for x,c in \
        zip(features.index.values, features['case_id']) if c not in matching_indices]
    features.drop(drop_rows, axis=0, inplace=True)
    print('FEATURES BEFORE SORTING\n', features.head())
    features.sort_values(by='case_id', inplace=True)
    sorted_caseids_features = features['case_id'].values
    features.drop('case_id', axis=1, inplace=True)
    print('FEATURES AFTER SORTING\n', features.head())
    features = features.transform(lambda x: (x - np.mean(x)) / np.std(x))
    print(features.shape)

    drop_rows = [x for x,c in \
        zip(scores.index.values, scores['case_id'].values) if c not in matching_indices]
    scores.drop(drop_rows, axis=0, inplace=True)
    # shuffle columns
    print('SCORES BEFORE SORTING\n', scores.head())
    scores.sort_values(by='case_id', inplace=True)
    sorted_caseids_scores = scores['case_id'].values
    to_drop = ['case_id', 'caseid', 'Disease Stage', 'sample name', 'Surgical Number']
    scores.drop(to_drop, axis=1, inplace=True)
    print('SCORES AFTER SORTING\n', scores.head())
    print(scores.shape)

    for fid, sid in zip(sorted_caseids_features, sorted_caseids_scores):
        print(fid, sid)
        assert fid == sid

    fig = plt.figure(figsize=(2,2), dpi=300)

    logfile = os.path.join(args.dst, 'qvalues.csv')
    comparison_ids = []
    pvalues = []
    for c in features.columns:
        cx = features[c].values
        for s in scores.columns:
            sy = scores[s].values
            try:
                corr = spearmanr(cx, sy)
                pcorr = pearsonr(cx, sy)
            except Exception:
                print('Failed at {} x {}'.format(c, s))
                print('cx: {} sy: {}'.format(cx.shape, sy.shape))
                # Skip this pair so stale/undefined corr values are not reused
                continue
            comparison_ids.append('{}_{}'.format(c, s))
            pvalues.append(corr.pvalue)
            if corr.pvalue < 0.001:
                outstr = '*{}\t{}\tr={:3.3f}\tp={:3.3f}\tpr={:3.3f}\tpp={:3.3f}'.format(
                    c, s, corr.correlation, corr.pvalue, pcorr[0], pcorr[1])
                plt.clf()
                plt.scatter(cx, sy)
                plt.title('sr={:3.3f} sp={:3.3f}\npr={:3.3f} pp={:3.3f}'.format(
                        corr.correlation, corr.pvalue,
                        pcorr[0], pcorr[1],
                        ))
                plt.xlabel(c)
                plt.ylabel(s)
                plt.savefig(os.path.join(args.dst, '{}_{}.png'.format(c, s)), bbox_inches='tight')
            else:
                outstr = ' {}\t{}\tr={:3.3f}\tp={:3.3f}\tpr={:3.3f}\tpp={:3.3f}'.format(
                    c, s, corr.correlation, corr.pvalue, pcorr[0], pcorr[1])

            print(outstr)

    _,  qvalues,  _, _ = multipletests(pvalues, alpha=0.01, method='fdr_bh')

    qdf = pd.DataFrame({'q': qvalues, 'p': pvalues}, index=comparison_ids)
    qdf.sort_values('q', inplace=True)
    qdf.to_csv(logfile)
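
`translate_sn2hash` is project-specific; the usage above only requires that it maps a 'Surgical Number' to the hashed case_id used in the feature table (elsewhere these ids are md5 hex digests of the case name) and returns the sentinel 'drop_me' for unmatched entries. A hypothetical sketch, assuming a lookup table `SN2HASH` built elsewhere:

import hashlib

# Hypothetical surgical-number -> hashed case_id mapping; the real table is project data,
# e.g. {'SN 123': hashlib.md5(b'case-123').hexdigest(), ...}
SN2HASH = {}

def translate_sn2hash(sn):
    """Return the hashed case_id for a surgical number, or 'drop_me' if unknown (sketch)."""
    return SN2HASH.get(sn, 'drop_me')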
Example #3
def main(args):
    feat = pd.read_csv(args.feature_src, index_col=None)
    case_ids = feat['case_id']
    tile_ids = feat['tile_id']
    stages = feat['stage_str']
    feat.drop(['case_id', 'tile_id', 'stage_str'], axis=1, inplace=True)
    feat.drop([c for c in feat.columns if 'Unnamed' in c],
              axis=1,
              inplace=True)

    feat = feat.sample(frac=args.pct)
    case_ids = case_ids.loc[feat.index]
    tile_ids = tile_ids.loc[feat.index]
    stages = stages.loc[feat.index]
    print(feat.shape)
    print(case_ids.shape)
    print(tile_ids.shape)

    # `usecols` is assumed to be a module-level boolean column mask (see the
    # reject-mask handling in Example #8)
    feat = feat.loc[:, usecols]

    print('Dropping nan, inf and high corr')
    feat = drop_high_cor(feat, 0.8)
    feat = feat.transform(lambda x: (x - np.mean(x)) / np.std(x))
    feat = drop_nan_inf(feat)
    feat = drop_var(feat, 0.5)
    print(feat.shape)
    print(feat.head())

    if args.average == 'tile':
        print('Average by tile')
        feat = feat.groupby(by=tile_ids).mean()
        stages = stages.groupby(by=tile_ids).max()
        print(feat.shape)
    elif args.average == 'case':
        print('Average by case')
        feat = feat.groupby(by=case_ids).mean()
        stages = stages.groupby(by=case_ids).max()
        print(feat.shape)

    col_p = sns.color_palette('deep', 2)
    col_colors = [col_p[int('ae' in x)] for x in feat.columns]

    row_p = sns.color_palette('muted', 4)
    row_colors = []
    print(np.unique(stages.values))
    for s in stages.values:
        if s in m0_strs:
            row_colors.append(row_p[0])
        elif s in m1_strs:
            row_colors.append(row_p[1])
        elif 'NEPC' in s:
            row_colors.append(row_p[2])
        else:
            row_colors.append(row_p[3])

    print('col_colors', len(col_colors))
    print('row_colors', len(row_colors))

    # projected = TruncatedSVD(n_components=10).fit_transform(feat.values)
    # projected = PCA(n_components=10).fit_transform(feat.values)

    sns.clustermap(feat.values,
                   metric=args.metric,
                   standard_scale=1,
                   col_colors=col_colors,
                   row_colors=row_colors)
    plt.show()
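
`drop_nan_inf` and `drop_var` follow the same helper pattern as `drop_high_cor`. A minimal sketch, assuming they drop columns with non-finite values and columns with variance below a threshold respectively (the 0.5 default is a guess; Example #9 calls `drop_var` with no threshold):

import numpy as np

def drop_nan_inf(df):
    """Drop columns that contain any NaN or +/-inf values (sketch)."""
    finite = np.isfinite(df.values).all(axis=0)
    return df.loc[:, finite]

def drop_var(df, var_thresh=0.5):
    """Drop columns whose variance falls below var_thresh (sketch)."""
    keep = df.var(axis=0) >= var_thresh
    return df.loc[:, keep.values]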
Example #4
def main(args):
    feat = pd.read_csv(args.src, index_col=0, header=0)
    labels = pd.read_csv(args.labsrc, index_col=0, header=0, sep='\t')
    # print(feat.head())
    # print(labels.head())

    case_ids = labels['case_id'].values
    tile_ids = labels.index.values
    stages = labels['stage_str'].values

    feat = drop_high_cor(feat, 0.8)
    print('Features after high cor drop')
    # print(feat.shape)
    # print(feat.head())

    feat = feat.transform(lambda x: (x - np.mean(x)) / np.std(x))
    print('Features after zscore')
    # print(feat.shape)
    # print(feat.head())

    feat = feat.fillna(value=0)
    # feat = drop_nan_inf(feat)
    # print('Features after dropping nan and infs')
    # print(feat.shape)
    # print(feat.head())

    ((nepc_f, nepc_lab), (m0_f, m0_lab), (m0p_f, m0p_lab),
     (m1_f, m1_lab)) = split_sets(feat, labels)
    del feat

    if args.filter_stats:
        remove_cols = filter_stats(nepc_f, m0_f)
        nepc_f.drop(remove_cols, inplace=True, axis=1)
        m0_f.drop(remove_cols, inplace=True, axis=1)
        m0p_f.drop(remove_cols, inplace=True, axis=1)
        m1_f.drop(remove_cols, inplace=True, axis=1)

    train_x, train_y = make_training(m0_f, nepc_f)

    print('train_x', train_x.shape)
    print('train_y', train_y.shape)
    print('m1_f', m1_f.shape)
    # model = ElasticNet(alpha=1e-3, max_iter=10000).fit(train_x, train_y)
    # model = ElasticNetCV(cv=25).fit(train_x, train_y)
    # model = ElasticNetCV(alphas=np.arange(1e-5, 1e-1, 20),
    #   cv=10, max_iter=20000, n_jobs=-1).fit(train_x, train_y)

    model = RandomForestRegressor(oob_score=True,
                                  max_features='sqrt',
                                  max_depth=20,
                                  n_estimators=50,
                                  n_jobs=-1).fit(train_x, train_y)

    with open('feature_importance.txt', 'w+') as f:
        for v, coef in zip(train_x.columns, model.feature_importances_):
            f.write('{}\t{}\n'.format(v, coef))

    if args.aggr_fn == 'max':
        aggr_fn = np.max
    elif args.aggr_fn == 'mean':
        aggr_fn = np.mean
    """ Predict the M1 cases and gather by max and mean """
    yhat_m1 = model.predict(m1_f)
    case_aggr = []
    m1_case_numbers = []
    m1_case_vect = m1_lab['case_id'].values
    for uc in np.unique(m1_case_vect):
        yx = yhat_m1[m1_case_vect == uc]
        case_aggr.append(aggr_fn(yx))
        case_num = int(uc.split('-')[1])
        m1_case_numbers.append(case_num)
    m1_case_aggr = np.array(case_aggr)
    m1_case_numbers = np.array(m1_case_numbers)
    """ Predict M0P cases """
    yhat_m0p = model.predict(m0p_f)
    case_aggr = []
    m0p_case_numbers = []
    m0p_case_vect = m0p_lab['case_id'].values
    for uc in np.unique(m0p_case_vect):
        yx = yhat_m0p[m0p_case_vect == uc]
        case_aggr.append(aggr_fn(yx))
        case_num = int(uc.split('-')[1])
        m0p_case_numbers.append(case_num)
    m0p_case_aggr = np.array(case_aggr)
    m0p_case_numbers = np.array(m0p_case_numbers)
    """ Check on the training data """
    # yhat_train = []
    # # Just do m0 and nepc separately
    # for cid in np.unique(m0_lab['case_id'].values):
    #   feat_case, feat_other = split_case(m0_f, m0_lab, cid)
    #   feat_split = pd.concat([feat_other, nepc_f])
    #   y_split = [0]*feat_other.shape[0] + [1]*nepc_f.shape[0]
    #   model = RandomForestRegressor(n_estimators=100, n_jobs=-1).fit(feat_split, y_split)
    #   yh = model.predict(feat_case)
    #   print(cid, yh)
    #   yhat_train += list(yh)
    # for cid in np.unique(nepc_lab['case_id'].values):
    #   feat_case, feat_other = split_case(nepc_f, nepc_lab, cid)
    #   feat_split = pd.concat([m0_f, feat_other])
    #   y_split = [0]*m0_f.shape[0] + [1]*feat_other.shape[0]
    #   model = RandomForestRegressor(n_estimators=100, n_jobs=-1).fit(feat_split, y_split)
    #   yh = model.predict(feat_case)
    #   print(cid, yh)
    #   yhat_train += list(yh)
    # yhat_train = np.asarray(yhat_train)
    # print(yhat_train.shape)

    m0_cases = m0_lab['case_id'].values
    nepc_cases = nepc_lab['case_id'].values
    train_case_vect = np.concatenate([m0_cases, nepc_cases])
    # yhat_train = model.predict(train_x)
    yhat_train = model.oob_prediction_
    train_aggr, train_case_y = [], []
    for uc in np.unique(train_case_vect):
        idx = train_case_vect == uc
        train_aggr.append(aggr_fn(yhat_train[idx]))
        train_case_y.append(train_y[idx][0])
    train_aggr = np.array(train_aggr)
    train_case_y = np.array(train_case_y)
    """ Do some statistical tests """
    dotest = mannwhitneyu
    # test_args = {'equal_var': True}
    test_args = {}
    test_m0_m1 = dotest(yhat_train[train_y == 0], yhat_m1, **test_args)
    test_m0_m0p = dotest(yhat_train[train_y == 0], yhat_m0p, **test_args)
    test_m0_nepc = dotest(yhat_train[train_y == 0], yhat_train[train_y == 1],
                          **test_args)
    test_nepc_m1 = dotest(yhat_train[train_y == 1], yhat_m1, **test_args)
    print('Tiles M0 vs M1', test_m0_m1)
    print('Tiles M0 vs M0P', test_m0_m0p)
    print('Tiles M0 vs NEPC', test_m0_nepc)
    print('Tiles NEPC vs M1', test_nepc_m1)

    test_m0_m1 = dotest(train_aggr[train_case_y == 0], m1_case_aggr,
                        **test_args)
    test_m0_m0p = dotest(train_aggr[train_case_y == 0], m0p_case_aggr,
                         **test_args)
    test_m0_nepc = dotest(train_aggr[train_case_y == 0],
                          train_aggr[train_case_y == 1], **test_args)
    test_nepc_m1 = dotest(train_aggr[train_case_y == 1], m1_case_aggr,
                          **test_args)
    print('aggr M0 vs M1', test_m0_m1)
    print('aggr M0 vs M0P', test_m0_m0p)
    print('aggr M0 vs NEPC', test_m0_nepc)
    print('aggr NEPC vs M1', test_nepc_m1)

    print(
        '------------------------------------------------------------------------------------'
    )
    gene_scores = pd.read_csv('../data/signature_scores_beltram.csv',
                              index_col=None,
                              header=0,
                              sep=',')
    gene_score_caseid = []
    drop_rows = []
    matching_scores = []
    matching_indices = []
    for i, (idx, sn) in enumerate(
            zip(gene_scores.index.values,
                gene_scores['Surgical Number'].values)):
        try:
            x = int(sn.split(' ')[-1])
            if x in m1_case_numbers:
                gene_score_caseid.append(x)
                matching_indices.append(idx)
                matching_scores.append(m1_case_aggr[m1_case_numbers == x][0])
            elif x in m0p_case_numbers:
                gene_score_caseid.append(x)
                matching_indices.append(idx)
                matching_scores.append(m0p_case_aggr[m0p_case_numbers == x][0])
            else:
                drop_rows.append(idx)
        except (ValueError, AttributeError):
            # Non-numeric or missing surgical number
            drop_rows.append(idx)

    gene_scores.drop(drop_rows, inplace=True)
    gene_scores['NEPC Score'] = pd.Series(matching_scores,
                                          index=matching_indices)

    # if args.save_scores:
    # gene_scores.to_csv('../signature_scores_nepc_scores_nuclei_mean.csv')

    label_cols = ['caseid', 'Disease Stage', 'sample name', 'Surgical Number']
    gene_scores.drop(label_cols, inplace=True, axis=1)

    plt.figure(figsize=(5, 5), dpi=300)
    sns.pairplot(gene_scores, kind='reg')
    if args.dry_run:
        pass
    else:
        plt.savefig('gene_scores_nepc_score_{}_tile.png'.format(args.aggr_fn),
                    bbox_inches='tight')

    test_cols = [x for x in gene_scores.columns if x != 'NEPC Score']
    scores = gene_scores['NEPC Score'].values
    print(
        '------------------------------------------------------------------------------------'
    )
    for c in test_cols:
        ctest = spearmanr(scores, gene_scores[c].values)
        print('spearman {:40}: {:3.5f} p={:3.5f}'.format(
            c, ctest.correlation, ctest.pvalue))
        ctest = pearsonr(scores, gene_scores[c].values)
        print('pearson  {:40}: {:3.5f} p={:3.5f}'.format(
            c, ctest[0], ctest[1]))

    print(
        '------------------------------------------------------------------------------------'
    )
    if args.boxplot:
        f, (ax_box,
            ax_hist) = plt.subplots(2,
                                    sharex=True,
                                    gridspec_kw={"height_ratios": (.35, .65)})
        plt_m0 = train_aggr[train_case_y == 0]
        plt_nepc = train_aggr[train_case_y == 1]
        plt_m1 = m1_case_aggr
        plt_m0p = m0p_case_aggr
        sns.distplot(
            plt_m0,
            bins=25,
            norm_hist=True,
            kde=True,
            label='M0',
            ax=ax_hist,
        )
        sns.distplot(
            plt_nepc,
            bins=25,
            norm_hist=True,
            kde=True,
            label='NEPC',
            ax=ax_hist,
        )
        sns.distplot(
            plt_m1,
            kde=True,
            norm_hist=True,
            bins=25,
            label='M1',
            ax=ax_hist,
        )
        sns.distplot(
            plt_m0p,
            kde=True,
            norm_hist=True,
            bins=25,
            label='M0-P',
            ax=ax_hist,
        )
        ax_hist.set_xlabel('Score')
        ax_hist.set_ylabel('Frequency')
        concat_scores = np.concatenate([plt_m0, plt_nepc, plt_m1, plt_m0p])
        concat_labels = np.array(['M0'] * len(plt_m0) +
                                 ['NEPC'] * len(plt_nepc) +
                                 ['M1'] * len(plt_m1) + ['M0P'] * len(plt_m0p))
        plt_df = pd.DataFrame({'Set': concat_labels, 'Score': concat_scores})

        # fig = plt.figure(figsize=(2,2), dpi=300)
        sns.boxplot(y='Set', x='Score', data=plt_df, ax=ax_box)
        sns.stripplot(y='Set',
                      x='Score',
                      data=plt_df,
                      size=2.5,
                      jitter=True,
                      linewidth=0.5,
                      ax=ax_box)
        # ax_box.set_ylabel('')
        # ax_box.set_xlabel('')
        # plt.show()
        if args.dry_run:
            pass
        else:
            plt.savefig('NEPC_score_{}_tile.png'.format(args.aggr_fn),
                        bbox_inches='tight')
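
`split_sets` and `make_training` are also project helpers. A sketch under the assumption that `split_sets` partitions rows by the label table's `stage_str` (NEPC, M0, M0-with-progression, M1; the stage-string lists are placeholders here) and `make_training` stacks M0 (label 0) against NEPC (label 1), matching the commented-out LOOCV code above:

import numpy as np
import pandas as pd

# Assumed stage-string groups; the real lists live elsewhere in the project.
nepc_strs = ['NEPC']
m0_strs = ['M0']
m0p_strs = ['M0P']
m1_strs = ['M1']

def split_sets(feat, labels):
    """Split features/labels into (NEPC, M0, M0P, M1) blocks by stage_str (sketch)."""
    out = []
    for group in (nepc_strs, m0_strs, m0p_strs, m1_strs):
        mask = labels['stage_str'].isin(group).values
        out.append((feat.loc[mask, :], labels.loc[mask, :]))
    return out

def make_training(m0_f, nepc_f):
    """Concatenate M0 (y=0) and NEPC (y=1) tiles into one training set (sketch)."""
    train_x = pd.concat([m0_f, nepc_f], axis=0)
    train_y = np.array([0] * m0_f.shape[0] + [1] * nepc_f.shape[0])
    return train_x, train_y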
Example #5
def main(args):
    data = pd.read_csv(args.src, index_col=0, memory_map=True)
    lab = pd.read_csv(args.lab, index_col=0)
    print('DATA')
    print(data.shape)
    print('LAB')
    print(lab.shape)
    print(lab.head())

    data = data.sample(frac=args.pct)
    print(data.shape)
    # print(data.head())

    # Grab the id columns
    case_ids = data['case_id'].values
    tile_ids = data['tile_id'].values
    data.drop(['case_id', 'tile_id'], inplace=True, axis=1)
    print(data.shape)
    # print(data.head())

    data = drop_high_cor(data, cor_thresh=0.7)
    print('Features after high cor drop')
    print(data.head())

    lab_case_uid = np.array(
        [hashlib.md5(x.encode()).hexdigest() for x in lab['case_id'].values])
    is_nepc = np.zeros_like(case_ids, dtype=bool)
    not_nepc = np.zeros_like(case_ids, dtype=bool)
    for t_id in np.unique(case_ids):
        t_idx = case_ids == t_id
        print('{}: {} {}'.format(t_id, t_idx.shape, t_idx.sum()))

        assert t_id in lab_case_uid
        t_label = lab.loc[lab_case_uid == t_id].values
        t_label = t_label[0, -3]

        if t_label == 'NEPC':
            is_nepc[t_idx] = 1
        else:
            not_nepc[t_idx] = 1

    nepc_case_feat = data.loc[is_nepc, :].values
    adeno_case_feat = data.loc[not_nepc, :].values

    # nepc_case_feat = nepc_case_feat.sample(n=args.nsample).values
    # adeno_case_feat = adeno_case_feat.sample(n=args.nsample).values

    print('NEPC features:')
    print(nepc_case_feat.shape)
    print('Adeno features:')
    print(adeno_case_feat.shape)

    for c in range(nepc_case_feat.shape[1]):
        nepc_ = nepc_case_feat[:, c]
        adeno_ = adeno_case_feat[:, c]
        tt = ttest_ind(nepc_, adeno_)
        print('{}\t{:3.3f}\t{:3.3f}'.format(c, tt[0], tt[1]))
        if tt[1] < args.thresh:
            plt.clf()
            # df = pd.DataFrame({'NEPC': nepc_,
            #                    'Adeno': adeno_})
            sns.distplot(nepc_, label='NEPC')
            sns.distplot(adeno_, label='Adeno')
            plt.legend(frameon=True)
            plt.title('{}\np={}'.format(c, tt[1]))

            saveto = os.path.join(args.dst, '{}.png'.format(c))
            plt.savefig(saveto, bbox_inches='tight')
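
Several of these examples, including the next one, rely on `load_features` and `load_labels`, which are not shown. A minimal sketch, assuming the feature CSV carries a `case_id` column that is popped off (and optionally z-scored) and the label table is tab-separated and indexed by tile, as in Example #4:

import pandas as pd

def load_features(src, zscore=False):
    """Read the tile-level feature CSV and pop off case_id (sketch)."""
    feat = pd.read_csv(src, index_col=0, header=0)
    case_ids = feat['case_id']
    feat = feat.drop('case_id', axis=1)
    if zscore:
        feat = feat.transform(lambda x: (x - x.mean()) / x.std())
    return feat, case_ids

def load_labels(labsrc):
    """Read the tile-level label table (case_id, stage_str, ...) (sketch)."""
    return pd.read_csv(labsrc, index_col=0, header=0, sep='\t')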
def main(args):
    feat, case_ids = load_features(args.src, zscore=True)
    lab = load_labels(args.labsrc)

    feat = drop_high_cor(feat, cor_thresh=0.8)
    print('Features after high cor drop')

    # train_x, train_y, test_x, test_y = holdout_cases(feat, lab)
    ((nepc_f, nepc_lab), (m0_f, m0_lab), (m0p_f, m0p_lab),
     (m1_f, m1_lab)) = split_sets(feat, lab)
    del feat

    # Split out non-small-cell NEPC (`scnepc` is assumed to be a module-level
    # list of small-cell NEPC case ids):
    nepc_is_sc = np.array([x in scnepc for x in nepc_lab['case_id'].values])
    nepc_not_sc = np.array(
        [x not in scnepc for x in nepc_lab['case_id'].values])
    nepc_f_sc = nepc_f.loc[nepc_is_sc, :]
    nepc_lab_sc = nepc_lab.loc[nepc_is_sc, :]

    nepc_f_not_sc = nepc_f.loc[nepc_not_sc, :]
    nepc_lab_not_sc = nepc_lab.loc[nepc_not_sc, :]
    del nepc_f, nepc_lab

    print('NEPC SC lab')
    print(nepc_lab_sc.head())
    print(nepc_lab_sc.shape)
    print('NEPC not SC lab')
    print(nepc_lab_not_sc.head())
    print(nepc_lab_not_sc.shape)

    if args.filter_stats:
        remove_cols = filter_stats(nepc_f_sc, m0_f)
        nepc_f_sc.drop(remove_cols, inplace=True, axis=1)
        nepc_f_not_sc.drop(remove_cols, inplace=True, axis=1)
        m0_f.drop(remove_cols, inplace=True, axis=1)
        m0p_f.drop(remove_cols, inplace=True, axis=1)
        m1_f.drop(remove_cols, inplace=True, axis=1)

    train_x, train_y = make_training(m0_f, nepc_f_sc)
    train_lab = pd.concat([m0_lab, nepc_lab_sc], axis=0)
    print('train lab')
    print(train_lab.head())
    print(train_lab.shape)

    # model = ElasticNet(alpha=1e-3, max_iter=50000).fit(train_x, train_y)
    # model = ElasticNetCV(cv=25).fit(train_x, train_y)
    # model = ElasticNetCV(alphas=np.arange(1e-5, 1e-1, 20),
    #   cv=10, max_iter=10000, n_jobs=-1).fit(train_x, train_y)

    model = RandomForestRegressor(oob_score=True,
                                  max_depth=25,
                                  n_estimators=100,
                                  n_jobs=-1).fit(train_x, train_y)

    with open('feature_importance.txt', 'w+') as f:
        for v, coef in zip(train_x.columns, model.feature_importances_):
            f.write('{}\t{}\n'.format(v, coef))

    if args.aggr_fn == 'max':
        aggr_fn = np.max
    elif args.aggr_fn == 'mean':
        aggr_fn = np.mean

    # """ Get M0 case numbers """
    # m0_case_numbers = []
    # m0_case_vect = m1_lab['case_id'].values
    # print('M0 Cases:')
    # for uc in np.unique(m0_case_vect):
    #   case_num = int(uc.plist('-')[1])
    #   m0_case_numbers.append(case_num)
    """ Predict the M1 cases and gather by mean """
    yhat_m1 = model.predict(m1_f)
    case_aggr = []
    m1_case_numbers = []
    m1_case_vect = m1_lab['case_id'].values
    for uc in np.unique(m1_case_vect):
        yx = yhat_m1[m1_case_vect == uc]
        case_aggr.append(aggr_fn(yx))
        case_num = int(uc.split('-')[1])
        m1_case_numbers.append(case_num)
    m1_case_aggr = np.array(case_aggr)
    m1_case_numbers = np.array(m1_case_numbers)

    # Print out
    m1_lab['NEPC_score'] = yhat_m1
    print('m1 lab')
    print(m1_lab.head())
    """ Predict M0P cases """
    yhat_m0p = model.predict(m0p_f)
    case_aggr = []
    m0p_case_numbers = []
    m0p_case_vect = m0p_lab['case_id'].values
    for uc in np.unique(m0p_case_vect):
        yx = yhat_m0p[m0p_case_vect == uc]
        case_aggr.append(aggr_fn(yx))
        case_num = int(uc.split('-')[1])
        m0p_case_numbers.append(case_num)
    m0p_case_aggr = np.array(case_aggr)
    m0p_case_numbers = np.array(m0p_case_numbers)

    # Print out
    m0p_lab['NEPC_score'] = yhat_m0p
    print('m0p lab')
    print(m0p_lab.head())
    """ Predict NEPC not SC cases """
    yhat_nepc_not_sc = model.predict(nepc_f_not_sc)
    case_aggr = []
    nepc_not_sc_case_numbers = []
    nepc_not_sc_case_vect = nepc_lab_not_sc['case_id'].values
    for uc in np.unique(nepc_not_sc_case_vect):
        yx = yhat_nepc_not_sc[nepc_not_sc_case_vect == uc]
        case_aggr.append(aggr_fn(yx))
    nepc_not_sc_case_aggr = np.array(case_aggr)
    nepc_not_sc_case_numbers = np.array(nepc_not_sc_case_numbers)

    # Print out
    nepc_lab_not_sc['NEPC_score'] = yhat_nepc_not_sc
    print('NEPC not sc lab')
    print(nepc_lab_not_sc.head())
    """ Check on training data
  Run a LOOCV on the training data """

    # yhat_train = []
    # # Just do m0 and nepc separately
    # for cid in np.unique(m0_lab['case_id'].values):
    #   feat_case, feat_other = split_case(m0_f, m0_lab, cid)
    #   feat_split = pd.concat([feat_other, nepc_f])
    #   y_split = [0]*feat_other.shape[0] + [1]*nepc_f.shape[0]
    #   model = RandomForestRegressor(n_estimators=100, n_jobs=-1).fit(feat_split, y_split)
    #   yh = model.predict(feat_case)
    #   print(cid, yh)
    #   yhat_train += list(yh)
    # for cid in np.unique(nepc_lab['case_id'].values):
    #   feat_case, feat_other = split_case(nepc_f, nepc_lab, cid)
    #   feat_split = pd.concat([m0_f, feat_other])
    #   y_split = [0]*m0_f.shape[0] + [1]*feat_other.shape[0]
    #   model = RandomForestRegressor(n_estimators=100, n_jobs=-1).fit(feat_split, y_split)
    #   yh = model.predict(feat_case)
    #   print(cid, yh)
    #   yhat_train += list(yh)
    # yhat_train = np.asarray(yhat_train)
    # print(yhat_train.shape)

    m0_cases = m0_lab['case_id'].values
    nepc_cases = nepc_lab_sc['case_id'].values
    train_case_vect = np.concatenate([m0_cases, nepc_cases])
    # yhat_train = model.predict(train_x)
    yhat_train = model.oob_prediction_
    train_aggr, train_case_y = [], []
    for uc in np.unique(train_case_vect):
        idx = train_case_vect == uc
        train_aggr.append(aggr_fn(yhat_train[idx]))
        train_case_y.append(train_y[idx][0])
    train_aggr = np.array(train_aggr)
    train_case_y = np.array(train_case_y)

    # Print out
    train_lab['NEPC_score'] = yhat_train
    print('train lab')
    print(train_lab.head())

    score_lab = pd.concat([m1_lab, m0p_lab, train_lab], axis=0)
    print(score_lab.shape)
    score_lab.to_csv('tile_paths_with_NEPC_score.csv')
    """ write out scores """
    with open('nepc_case_scores.txt', 'w+') as f:
        for mop, mop_score in zip(np.unique(m0p_case_vect), m0p_case_aggr):
            s = '{}\t{}\n'.format(mop, mop_score)
            f.write(s)

        for mop, mop_score in zip(np.unique(m1_case_vect), m1_case_aggr):
            s = '{}\t{}\n'.format(mop, mop_score)
            f.write(s)

        for mop, mop_score in zip(np.unique(train_case_vect), train_aggr):
            s = '{}\t{}\n'.format(mop, mop_score)
            f.write(s)
    """ Do some statistical tests """
    dotest = mannwhitneyu
    # test_args = {'equal_var': True}
    test_args = {}
    test_m0_m1 = dotest(yhat_train[train_y == 0], yhat_m1, **test_args)
    test_m0_m0p = dotest(yhat_train[train_y == 0], yhat_m0p, **test_args)
    test_m0_nepc = dotest(yhat_train[train_y == 0], yhat_train[train_y == 1],
                          **test_args)
    test_nepc_m1 = dotest(yhat_train[train_y == 1], yhat_m1, **test_args)
    test_m0_nepc_not_sc = dotest(yhat_train[train_y == 0], yhat_nepc_not_sc,
                                 **test_args)
    test_nepc_sc_nepc_not_sc = dotest(yhat_train[train_y == 1],
                                      yhat_nepc_not_sc, **test_args)
    print('Tiles M0 vs M1', test_m0_m1)
    print('Tiles M0 vs M0P', test_m0_m0p)
    print('Tiles M0 vs NEPC SC', test_m0_nepc)
    print('Tiles M0 vs NEPC NOT SC', test_m0_nepc_not_sc)
    print('Tiles NEPC vs M1', test_nepc_m1)
    print('Tiles NEPC SC vs NEPC NOT SC', test_nepc_sc_nepc_not_sc)

    test_m0_m1 = dotest(train_aggr[train_case_y == 0], m1_case_aggr,
                        **test_args)
    test_m0_m0p = dotest(train_aggr[train_case_y == 0], m0p_case_aggr,
                         **test_args)
    test_m0_nepc = dotest(train_aggr[train_case_y == 0],
                          train_aggr[train_case_y == 1], **test_args)
    test_nepc_m1 = dotest(train_aggr[train_case_y == 1], m1_case_aggr,
                          **test_args)
    test_m0_nepc_not_sc = dotest(train_aggr[train_case_y == 0],
                                 nepc_not_sc_case_aggr, **test_args)
    test_nepc_sc_nepc_not_sc = dotest(train_aggr[train_case_y == 1],
                                      nepc_not_sc_case_aggr, **test_args)
    print('aggr M0 vs M1', test_m0_m1)
    print('aggr M0 vs M0P', test_m0_m0p)
    print('aggr M0 vs NEPC SC', test_m0_nepc)
    print('aggr M0 vs NEPC NOT SC', test_m0_nepc_not_sc)
    print('aggr NEPC vs M1', test_nepc_m1)
    print('aggr NEPC SC vs NEPC NOT SC', test_nepc_sc_nepc_not_sc)

    print(
        '------------------------------------------------------------------------------------'
    )
    if args.genescore:
        gene_scores = pd.read_csv('../data/signature_scores_beltram.csv',
                                  index_col=None,
                                  header=0,
                                  sep=',')
        gene_score_caseid = []
        drop_rows = []
        matching_scores = []
        matching_indices = []
        for i, (idx, sn) in enumerate(
                zip(gene_scores.index.values,
                    gene_scores['Surgical Number'].values)):
            try:
                x = int(sn.split(' ')[-1])
                if x in m1_case_numbers:
                    # print('M1 matched SN {}'.format(x))
                    gene_score_caseid.append(x)
                    matching_indices.append(idx)
                    matching_scores.append(
                        m1_case_aggr[m1_case_numbers == x][0])
                # if x in m0_case_numbers:
                #   print('M0 matched SN {}'.format(x))
                #   gene_score_caseid.append(x)
                #   matching_indices.append(idx)
                #   matching_scores.append(m1_case_mean[m1_case_numbers==x][0])
                elif x in m0p_case_numbers:
                    # print('M0P matched SN {}'.format(x))
                    gene_score_caseid.append(x)
                    matching_indices.append(idx)
                    matching_scores.append(
                        m0p_case_aggr[m0p_case_numbers == x][0])
                else:
                    drop_rows.append(idx)
            except (ValueError, AttributeError):
                # Non-numeric or missing surgical number
                drop_rows.append(idx)
                print(sn)

        gene_scores.drop(drop_rows, inplace=True)
        print(gene_scores.shape)
        gene_scores['NEPC Score'] = pd.Series(matching_scores,
                                              index=matching_indices)

        # if args.save_scores:
        # gene_scores.to_csv('../data/signature_scores_nepc_scores_mean.csv')

        label_cols = [
            'caseid', 'Disease Stage', 'sample name', 'Surgical Number'
        ]
        gene_scores.drop(label_cols, inplace=True, axis=1)

        # plt.figure(figsize=(5,5), dpi=300)
        # sns.pairplot(gene_scores, kind='reg')
        # plt.savefig('gene_scores_nepc_score_{}.png'.format(args.aggr_fn), bbox_inches='tight')

        test_cols = [x for x in gene_scores.columns if x != 'NEPC Score']
        scores = gene_scores['NEPC Score'].values
        for c in test_cols:
            try:
                ctest = spearmanr(scores, gene_scores[c].values)
                print('spearman {:40}: {:3.5f} p={:3.5f}'.format(
                    c, ctest.correlation, ctest.pvalue))
                ctest = pearsonr(scores, gene_scores[c].values)
                print('pearson  {:40}: {:3.5f} p={:3.5f}'.format(
                    c, ctest[0], ctest[1]))
            except Exception:
                print('Test column {} failed'.format(c))

    print(
        '------------------------------------------------------------------------------------'
    )
    if args.boxplot:
        f, (ax_box,
            ax_hist) = plt.subplots(2,
                                    sharex=True,
                                    gridspec_kw={"height_ratios": (.35, .65)})
        plt_m0 = train_aggr[train_case_y == 0]
        plt_nepc_sc = train_aggr[train_case_y == 1]
        plt_nepc_not_sc = nepc_not_sc_case_aggr
        plt_m1 = m1_case_aggr
        plt_m0p = m0p_case_aggr

        auc_ = roc_auc_score(y_true=train_case_y, y_score=train_aggr)
        print('M0 NEPC SC AUC = ', auc_)

        m0m1 = np.concatenate([plt_m0, plt_m1])
        m0m1_y = np.array([0] * len(plt_m0) + [1] * len(plt_m1))
        auc_ = roc_auc_score(y_true=m0m1_y, y_score=m0m1)
        print('M0 M1 AUC = ', auc_)

        m0m0p = np.concatenate([plt_m0, plt_m0p])
        m0m0p_y = np.array([0] * len(plt_m0) + [1] * len(plt_m0p))
        auc_ = roc_auc_score(y_true=m0m0p_y, y_score=m0m0p)
        print('M0 M0P AUC = ', auc_)

        m0nepc_not_sc = np.concatenate([plt_m0, plt_nepc_not_sc])
        m0nepc_not_sc_y = np.array([0] * len(plt_m0) +
                                   [1] * len(plt_nepc_not_sc))
        auc_ = roc_auc_score(y_true=m0nepc_not_sc_y, y_score=m0nepc_not_sc)
        print('M0 NEPC not SC AUC = ', auc_)

        sns.distplot(
            plt_m0,
            bins=25,
            norm_hist=True,
            kde=True,
            label='M0',
            ax=ax_hist,
        )
        sns.distplot(
            plt_nepc_sc,
            bins=25,
            norm_hist=True,
            kde=True,
            label='NEPC SC',
            ax=ax_hist,
        )
        sns.distplot(
            plt_m1,
            kde=True,
            norm_hist=True,
            bins=25,
            label='M1',
            ax=ax_hist,
        )
        sns.distplot(
            plt_m0p,
            kde=True,
            norm_hist=True,
            bins=25,
            label='M0-P',
            ax=ax_hist,
        )
        sns.distplot(
            plt_nepc_not_sc,
            kde=True,
            norm_hist=True,
            bins=25,
            label='NEPC not SC',
            ax=ax_hist,
        )
        ax_hist.set_xlabel('Score')
        ax_hist.set_ylabel('Frequency')
        concat_scores = np.concatenate(
            [plt_m0, plt_nepc_sc, plt_m1, plt_m0p, plt_nepc_not_sc])
        concat_labels = np.array(['M0'] * len(plt_m0) +
                                 ['NEPC SC'] * len(plt_nepc_sc) +
                                 ['M1'] * len(plt_m1) +
                                 ['M0P'] * len(plt_m0p) +
                                 ['NEPC not SC'] * len(plt_nepc_not_sc))

        plt_df = pd.DataFrame({'Set': concat_labels, 'Score': concat_scores})

        # fig = plt.figure(figsize=(2,2), dpi=300)
        sns.boxplot(y='Set', x='Score', data=plt_df, ax=ax_box)
        sns.stripplot(y='Set',
                      x='Score',
                      data=plt_df,
                      size=2.5,
                      jitter=True,
                      linewidth=0.5,
                      ax=ax_box)
        # ax_box.set_ylabel('')
        # ax_box.set_xlabel('')
        # plt.show()
        plt.savefig('NEPC_score_{}.png'.format(args.aggr_fn),
                    bbox_inches='tight')
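
The predict-then-aggregate-by-case loop appears several times above; a small helper capturing that pattern (the 'prefix-number' case-id format is taken from the split('-') usage in the code) could look like:

import numpy as np

def aggregate_by_case(yhat, case_vect, aggr_fn=np.mean):
    """Aggregate tile-level predictions into one score per case (sketch)."""
    case_scores, case_numbers = [], []
    for uc in np.unique(case_vect):
        case_scores.append(aggr_fn(yhat[case_vect == uc]))
        # Case ids are assumed to look like 'prefix-123'; keep the numeric part
        case_numbers.append(int(uc.split('-')[1]))
    return np.array(case_scores), np.array(case_numbers)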
Example #7
def main(args):
    data = pd.read_csv(args.src, index_col=0, memory_map=True)
    lab = pd.read_csv(args.lab)
    print(data.shape)
    print(lab.shape)
    print(lab.head())

    data = data.sample(frac=args.pct)
    print(data.shape)
    print(data.head())

    # Grab the id columns
    case_id = data['case_id']
    tile_id = data['tile_id']
    data.drop(['case_id', 'tile_id'], inplace=True, axis=1)
    print(data.shape)
    print(data.head())

    if args.ae_only:
        to_drop = [x for x in data.columns if 'ae' not in x]
        data.drop(to_drop, axis=1, inplace=True)

    if args.hc_only:
        to_drop = [x for x in data.columns if 'hc' not in x]
        data.drop(to_drop, axis=1, inplace=True)

    data = data.transform(lambda x: (x - np.mean(x)) / np.std(x))

    isinfs = np.sum(np.isinf(data.values), axis=0)
    print('isinfs', isinfs.shape)
    isnans = np.sum(np.isnan(data.values), axis=0)
    print('isnans', isnans.shape)
    print(np.argwhere(isinfs))
    print(np.argwhere(isnans))
    # data = data.dropna(axis='index')
    inf_cols = data.columns.values[np.squeeze(np.argwhere(isinfs))]
    nan_cols = data.columns.values[np.squeeze(np.argwhere(isnans))]
    print('inf_cols', inf_cols)
    print('nan_cols', nan_cols)
    data.drop(inf_cols, axis=1, inplace=True)
    data.drop(nan_cols, axis=1, inplace=True)
    print(data.shape)

    # Drop correlated columns
    data = drop_high_cor(data, 0.7)

    if args.average:
        print('Averaging features')
        if args.average_by == 'case':
            print('by: case')
            data = data.groupby(by=case_id, group_keys=True).mean()
            lab = lab.groupby('case_id').max()
        elif args.average_by == 'tile':
            print('by: tile')
            data = data.groupby(by=tile_id, group_keys=True).mean()
            lab = lab.groupby('tile_id').max()
        else:
            pass

        print(data.shape)
        print(data.head())
        print(lab.head())

        is_nepc = []
        for x, t in zip(lab['stage_str'].values, lab.index.values):
            if t in data.index:
                is_nepc.append(x == 'NEPC')
        is_nepc = np.array(is_nepc)

        print(is_nepc.shape)
    else:
        # NOTE: is_nepc is only built in the averaging branch above; the
        # per-tile path would need its own label lookup before plot_embedded.
        pass

    emb = MulticoreTSNE(n_jobs=-1).fit_transform(data)
    # emb = umap.UMAP().fit_transform(data)

    plot_embedded(emb, is_nepc)
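
`plot_embedded` is not shown; a minimal sketch, assuming it scatters the 2-D embedding colored by the boolean NEPC indicator:

import matplotlib.pyplot as plt

def plot_embedded(emb, is_nepc, dst=None):
    """Scatter a 2-D embedding, coloring NEPC vs non-NEPC points (sketch)."""
    plt.figure(figsize=(4, 4), dpi=180)
    plt.scatter(emb[~is_nepc, 0], emb[~is_nepc, 1], s=3, label='Adeno')
    plt.scatter(emb[is_nepc, 0], emb[is_nepc, 1], s=3, label='NEPC')
    plt.legend(frameon=True)
    if dst is not None:
        plt.savefig(dst, bbox_inches='tight')
    else:
        plt.show()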
Example #8
def main(args):
    feat = pd.read_csv(args.feature_src, index_col=0)
    lab  = pd.read_csv(args.label_src)
    case_ids = feat['case_id']
    tile_ids = feat.index
    stages   = lab['stage_str']
    feat.drop(['case_id'], axis=1, inplace=True)
    # feat.drop([c for c in feat.columns if 'Unnamed' in c], axis=1, inplace=True)

    # case_ids = case_ids.loc[feat.index]
    # tile_ids = tile_ids.loc[feat.index]
    # stages   = stages.loc[feat.index]
    print(feat.shape)
    print(case_ids.shape)
    print(tile_ids.shape)
    print(stages.shape)

    print('Dropping nan, inf and high corr')
    feat = drop_high_cor(feat, 0.8)
    feat = feat.transform(lambda x: (x - np.mean(x)) / np.std(x))

    if os.path.exists(args.reject_feats):
        usecols = np.invert(np.load(args.reject_feats))
        print('Rejecting features', args.reject_feats, usecols.shape, np.sum(usecols))
        feat = feat.loc[:, usecols]
    else:
        # Without a reject mask, usecols is undefined; keep all columns
        print('No reject mask found at {}; keeping all features'.format(args.reject_feats))

    feat = drop_nan_inf(feat)
    feat = drop_var(feat, 0.5)
    print(feat.shape)
    print(feat.head())

    if args.average:
        print('Average by case')
        feat = feat.groupby(by=case_ids.values).mean()
        stages   = stages.groupby(by=case_ids.values).max()

    print(feat.shape)
    print(stages.shape)

    row_p = sns.color_palette('muted', 3)
    row_colors = []
    print(np.unique(stages.values))
    for s in stages.values:
        if s in m0_strs:
            row_colors.append(row_p[0])
        elif s in m1_strs:
            row_colors.append(row_p[1])
        elif 'NEPC' in s:
            row_colors.append(row_p[2])
        else:
            row_colors.append(row_p[1])
    
    print('row_colors', len(row_colors))

    # projected = TruncatedSVD(n_components=10).fit_transform(feat.values)
    # projected = PCA(n_components=10).fit_transform(feat.values)

    sns.clustermap(feat.values, 
                   metric=args.metric, 
                   standard_scale=1,
                   row_colors=row_colors)

    plt.savefig(args.dst)
Example #9
def train(args):
    feat = pd.read_csv(args.src, index_col=0, header=0)
    print(feat.head())
    print(feat.shape)
    labels = pd.read_csv(args.labsrc, sep='\t')
    print(labels.shape)

    yvect = get_y(feat['case_id'], labels)
    print(yvect.shape)

    # Drop rows that come from cases we want to exclude
    usable_data = yvect != 4
    yvect = yvect[usable_data]
    print(yvect.shape)

    feat = feat.loc[usable_data, :]
    nuclei_case_ids = feat['case_id']
    nuclei_tile_ids = feat['tile_id']
    feat.drop(['case_id', 'tile_id'], axis=1, inplace=True)
    print('dropped label cols', feat.shape)

    # drop_cols = [x for x in feat.columns if 'hc' not in x]
    # feat.drop(drop_cols, inplace=True, axis=1)
    # print('dropped chosen cols', feat.shape)

    # Drop columns of features
    feat = drop_var(feat)
    print('dropped low var', feat.shape)

    feat = drop_high_cor(feat, 0.8)
    print('dropped corr', feat.shape)

    feat = drop_nan_inf(feat)
    print('dropped nan inf', feat.shape)

    feat = feat.transform(lambda x: (x - np.mean(x)) / np.std(x))
    print(feat.head())
    print(feat.shape)

    # Split off M1
    m1rows = yvect == 2
    nepc_not_sc_rows = yvect == 5
    m0nepc_rows = yvect < 2
    yvect_m0nepc = yvect[m0nepc_rows]
    feat_m0nepc = feat.loc[m0nepc_rows, :]
    feat_m1 = feat.loc[m1rows, :]
    feat_nepc_not_sc = feat.loc[nepc_not_sc_rows, :]
    del feat, yvect

    train_idx, test_idx = train_test_split(np.arange(len(yvect_m0nepc)))
    train_x = feat_m0nepc.iloc[train_idx, :]
    train_y = yvect_m0nepc[train_idx]
    test_x = feat_m0nepc.iloc[test_idx, :]
    test_y = yvect_m0nepc[test_idx]
    print(train_x.shape)
    print(test_x.shape)
    model = RandomForestRegressor(max_depth=35,
                                  max_features='sqrt',
                                  n_estimators=200,
                                  n_jobs=-1).fit(train_x, train_y)

    #ypred = model.predict(test_x)
    #print(ypred.shape)
    #print(ypred.mean())
    #print(ypred)

    #m1pred = model.predict(feat_m1)
    #nepc_not_sc_pred = model.predict(feat_nepc_not_sc)

    #plt_m0 = ypred[test_y == 0]
    #plt_nepc = ypred[test_y == 1]
    #plt_m1 = m1pred
    #plt_nepc_not_sc = nepc_not_sc_pred
    #do_boxplot(plt_m0, plt_nepc, plt_m1, plt_m0p, plt_nepc_not_sc, args.figout)

    dump(model, args.save)
    np.save('nucleus_classifier_features.npy', train_x.columns.values)
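
`dump` here is presumably `joblib.dump`. A matching load-side sketch (the file names are the ones written above; the helper itself is an assumption) would be:

import numpy as np
from joblib import load

def load_nucleus_classifier(model_path, feature_list='nucleus_classifier_features.npy'):
    """Reload the saved regressor and the column order it was trained with (sketch)."""
    model = load(model_path)
    # The column names were saved as an object array, so allow_pickle is needed
    feature_names = np.load(feature_list, allow_pickle=True)
    return model, feature_names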
def main(args):
    feat = pd.read_csv(args.src, index_col=0, header=0)
    labels = pd.read_csv(args.labsrc, index_col=0, header=0, sep='\t')

    feat.drop('case_id', axis=1, inplace=True)

    use_rows = feat['n_score'].values != 0
    feat = feat.iloc[use_rows, :]
    labels = labels.iloc[use_rows, :]
    print('using tables:', feat.shape, labels.shape)

    case_ids = labels['case_id'].values
    tile_ids = labels.index.values
    stages = labels['stage_str'].values

    feat = drop_high_cor(feat, 0.8)
    print('Features after high cor drop')

    feat = feat.transform(lambda x: (x - np.mean(x)) / np.std(x))
    print('Features after zscore')

    feat = drop_nan_inf(feat)
    print('Features after dropping nan and infs')

    ((nepc_f, nepc_lab), (m0_f, m0_lab), (m0p_f, m0p_lab),
     (m1_f, m1_lab)) = split_sets(feat, labels)
    del feat

    if args.filter_stats:
        remove_cols = filter_stats(nepc_f, m0_f)
        nepc_f.drop(remove_cols, inplace=True, axis=1)
        m0_f.drop(remove_cols, inplace=True, axis=1)
        m0p_f.drop(remove_cols, inplace=True, axis=1)
        m1_f.drop(remove_cols, inplace=True, axis=1)

    train_x, train_y = make_training(m0_f, nepc_f)
    print('train_x', train_x.shape)
    print('train_y', train_y.shape)
    print('m1_f', m1_f.shape)

    model = RandomForestRegressor(oob_score=True,
                                  max_depth=20,
                                  max_features='sqrt',
                                  n_estimators=150,
                                  n_jobs=-1).fit(train_x, train_y)

    with open('feature_importance.txt', 'w+') as f:
        for v, coef in zip(train_x.columns, model.feature_importances_):
            f.write('{}\t{}\n'.format(v, coef))

    if args.aggr_fn == 'max':
        aggr_fn = np.max
    elif args.aggr_fn == 'mean':
        aggr_fn = np.mean
    """ Predict the M1 cases and gather by max and mean """
    yhat_m1 = model.predict(m1_f)
    case_aggr = []
    m1_case_numbers = []
    m1_case_vect = m1_lab['case_id'].values
    for uc in np.unique(m1_case_vect):
        yx = yhat_m1[m1_case_vect == uc]
        case_aggr.append(aggr_fn(yx))
        case_num = int(uc.split('-')[1])
        m1_case_numbers.append(case_num)
    m1_case_aggr = np.array(case_aggr)
    m1_case_numbers = np.array(m1_case_numbers)
    """ Predict M0P cases """
    yhat_m0p = model.predict(m0p_f)
    case_aggr = []
    m0p_case_numbers = []
    m0p_case_vect = m0p_lab['case_id'].values
    for uc in np.unique(m0p_case_vect):
        yx = yhat_m0p[m0p_case_vect == uc]
        case_aggr.append(aggr_fn(yx))
        case_num = int(uc.split('-')[1])
        m0p_case_numbers.append(case_num)
    m0p_case_aggr = np.array(case_aggr)
    m0p_case_numbers = np.array(m0p_case_numbers)
    """ Check on the training data """
    m0_cases = m0_lab['case_id'].values
    nepc_cases = nepc_lab['case_id'].values
    train_case_vect = np.concatenate([m0_cases, nepc_cases])
    # yhat_train = model.predict(train_x)
    yhat_train = model.oob_prediction_
    train_aggr, train_case_y = [], []
    for uc in np.unique(train_case_vect):
        idx = train_case_vect == uc
        train_aggr.append(aggr_fn(yhat_train[idx]))
        train_case_y.append(train_y[idx][0])
    train_aggr = np.array(train_aggr)
    train_case_y = np.array(train_case_y)
    """ write out scores """
    with open('nepc_case_scores.txt', 'w+') as f:
        for mop, mop_score in zip(np.unique(m0p_case_vect), m0p_case_aggr):
            s = '{}\t{}\n'.format(mop, mop_score)
            f.write(s)

        for mop, mop_score in zip(np.unique(m1_case_vect), m1_case_aggr):
            s = '{}\t{}\n'.format(mop, mop_score)
            f.write(s)

        for mop, mop_score in zip(np.unique(train_case_vect), train_aggr):
            s = '{}\t{}\n'.format(mop, mop_score)
            f.write(s)
    """ Do some statistical tests """
    dotest = mannwhitneyu
    # test_args = {'equal_var': True}
    test_args = {}
    test_m0_m1 = dotest(yhat_train[train_y == 0], yhat_m1, **test_args)
    test_m0_m0p = dotest(yhat_train[train_y == 0], yhat_m0p, **test_args)
    test_m0_nepc = dotest(yhat_train[train_y == 0], yhat_train[train_y == 1],
                          **test_args)
    test_nepc_m1 = dotest(yhat_train[train_y == 1], yhat_m1, **test_args)
    print('Tiles M0 vs M1', test_m0_m1)
    print('Tiles M0 vs M0P', test_m0_m0p)
    print('Tiles M0 vs NEPC', test_m0_nepc)
    print('Tiles NEPC vs M1', test_nepc_m1)

    test_m0_m1 = dotest(train_aggr[train_case_y == 0], m1_case_aggr,
                        **test_args)
    test_m0_m0p = dotest(train_aggr[train_case_y == 0], m0p_case_aggr,
                         **test_args)
    test_m0_nepc = dotest(train_aggr[train_case_y == 0],
                          train_aggr[train_case_y == 1], **test_args)
    test_nepc_m1 = dotest(train_aggr[train_case_y == 1], m1_case_aggr,
                          **test_args)
    print('aggr M0 vs M1', test_m0_m1)
    print('aggr M0 vs M0P', test_m0_m0p)
    print('aggr M0 vs NEPC', test_m0_nepc)
    print('aggr NEPC vs M1', test_nepc_m1)
    """ ROC - AUC """
    print(
        '------------------------------------------------------------------------------------'
    )
    m0nepc_ypred = np.concatenate(
        [train_aggr[train_case_y == 0], train_aggr[train_case_y == 1]])
    m0nepc_ytrue = np.array([0] * np.sum(train_case_y == 0) +
                            [1] * np.sum(train_case_y == 1))
    m0m1_ypred = np.concatenate([train_aggr[train_case_y == 0], m1_case_aggr])
    m0m1_ytrue = np.array([0] * np.sum(train_case_y == 0) +
                          [1] * len(m1_case_aggr))
    m0m0p_ypred = np.concatenate(
        [train_aggr[train_case_y == 0], m0p_case_aggr])
    m0m0p_ytrue = np.array([0] * np.sum(train_case_y == 0) +
                           [1] * len(m0p_case_aggr))
    print('m0nepc_ypred', m0nepc_ypred.shape, m0nepc_ytrue.shape)
    print('m0m1_ypred', m0m1_ypred.shape, m0m1_ytrue.shape)
    print('m0m0p_ypred', m0m0p_ypred.shape, m0m0p_ytrue.shape)

    auc_ = roc_auc_score(y_true=m0nepc_ytrue, y_score=m0nepc_ypred)
    print('M0 - NEPC AUC = ', auc_)

    auc_ = roc_auc_score(y_true=m0m1_ytrue, y_score=m0m1_ypred)
    print('M0 - M1 AUC = ', auc_)

    auc_ = roc_auc_score(y_true=m0m0p_ytrue, y_score=m0m0p_ypred)
    print('M0 - M0P AUC = ', auc_)

    print(
        '------------------------------------------------------------------------------------'
    )
    gene_scores = pd.read_csv('../data/signature_scores_beltram.csv',
                              index_col=None,
                              header=0,
                              sep=',')
    gene_score_caseid = []
    drop_rows = []
    matching_scores = []
    matching_indices = []
    for i, (idx, sn) in enumerate(
            zip(gene_scores.index.values,
                gene_scores['Surgical Number'].values)):
        try:
            x = int(sn.split(' ')[-1])
            if x in m1_case_numbers:
                gene_score_caseid.append(x)
                matching_indices.append(idx)
                matching_scores.append(m1_case_aggr[m1_case_numbers == x][0])
            elif x in m0p_case_numbers:
                gene_score_caseid.append(x)
                matching_indices.append(idx)
                matching_scores.append(m0p_case_aggr[m0p_case_numbers == x][0])
            else:
                drop_rows.append(idx)
        except (ValueError, AttributeError):
            # Non-numeric or missing surgical number
            drop_rows.append(idx)

    gene_scores.drop(drop_rows, inplace=True)

    label_cols = ['caseid', 'Disease Stage', 'sample name', 'Surgical Number']
    gene_scores.drop(label_cols, inplace=True, axis=1)
    gene_scores['NEPC Score'] = pd.Series(matching_scores,
                                          index=matching_indices)

    plt.figure(figsize=(5, 5), dpi=300)
    sns.pairplot(gene_scores, kind='reg')
    plt.savefig('gene_scores_nepc_score_{}_tile.png'.format(args.aggr_fn),
                bbox_inches='tight')

    test_cols = [x for x in gene_scores.columns if x != 'NEPC Score']
    scores = gene_scores['NEPC Score'].values
    print(
        '------------------------------------------------------------------------------------'
    )
    for c in test_cols:
        ctest = spearmanr(scores, gene_scores[c].values)
        print('spearman {:40}: {:3.5f} p={:3.5f}'.format(
            c, ctest.correlation, ctest.pvalue))
        ctest = pearsonr(scores, gene_scores[c].values)
        print('pearson  {:40}: {:3.5f} p={:3.5f}'.format(
            c, ctest[0], ctest[1]))

    print(
        '------------------------------------------------------------------------------------'
    )
    if args.boxplot:
        f, (ax_box,
            ax_hist) = plt.subplots(2,
                                    sharex=True,
                                    gridspec_kw={"height_ratios": (.35, .65)})
        plt_m0 = train_aggr[train_case_y == 0]
        plt_nepc = train_aggr[train_case_y == 1]
        plt_m1 = m1_case_aggr
        plt_m0p = m0p_case_aggr
        sns.distplot(
            plt_m0,
            bins=25,
            norm_hist=True,
            kde=True,
            label='M0',
            ax=ax_hist,
        )
        sns.distplot(
            plt_nepc,
            bins=25,
            norm_hist=True,
            kde=True,
            label='NEPC',
            ax=ax_hist,
        )
        sns.distplot(
            plt_m1,
            kde=True,
            norm_hist=True,
            bins=25,
            label='M1',
            ax=ax_hist,
        )
        sns.distplot(
            plt_m0p,
            kde=True,
            norm_hist=True,
            bins=25,
            label='M0-P',
            ax=ax_hist,
        )
        ax_hist.set_xlabel('Score')
        ax_hist.set_ylabel('Frequency')
        concat_scores = np.concatenate([plt_m0, plt_nepc, plt_m1, plt_m0p])
        concat_labels = np.array(['M0'] * len(plt_m0) +
                                 ['NEPC'] * len(plt_nepc) +
                                 ['M1'] * len(plt_m1) + ['M0P'] * len(plt_m0p))
        plt_df = pd.DataFrame({'Set': concat_labels, 'Score': concat_scores})

        sns.boxplot(y='Set', x='Score', data=plt_df, ax=ax_box)
        sns.stripplot(y='Set',
                      x='Score',
                      data=plt_df,
                      size=2.5,
                      jitter=True,
                      linewidth=0.5,
                      ax=ax_box)
        plt.savefig('NEPC_score_{}_tile.png'.format(args.aggr_fn),
                    bbox_inches='tight')
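
`filter_stats` is the remaining undefined helper in these examples; the calls only require that it return a list of column names to drop given the NEPC and M0 feature tables. A hypothetical sketch, assuming it flags columns that show no significant univariate difference between the two groups (the direction and threshold of the filter are assumptions):

from scipy.stats import ttest_ind

def filter_stats(group_a, group_b, p_thresh=0.05):
    """Return columns to drop: those with no significant group difference (sketch)."""
    remove_cols = []
    for c in group_a.columns:
        _, p = ttest_ind(group_a[c].values, group_b[c].values)
        if p >= p_thresh:
            remove_cols.append(c)
    return remove_cols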
Example #11
def main(args):
    feat, case_ids = load_features(args.src, zscore=True)
    lab = load_labels(args.labsrc)

    feat = drop_high_cor(feat, cor_thresh=0.8)
    print('Features after high cor drop')
    print(feat.head())

    is_nepc = np.array([x in nepc_strs for x in lab['stage_str']])
    is_adeno = np.array([x in adeno_strs for x in lab['stage_str']])
    is_m0 = np.array([x in m0_strs for x in lab['stage_str']])
    is_m0p = np.array([x in m0p_strs for x in lab['stage_str']])
    is_m1 = np.array([x in m1_strs for x in lab['stage_str']])

    nepc_case_feat = feat.loc[is_nepc, :]
    nepc_lab = lab.loc[is_nepc, :]
    adeno_case_feat = feat.loc[is_adeno, :]
    adeno_lab = lab.loc[is_adeno, :]
    m0_case_feat = feat.loc[is_m0, :]
    m0_lab = lab.loc[is_m0, :]
    m0p_case_feat = feat.loc[is_m0p, :]
    m0p_lab = lab.loc[is_m0p, :]
    m1_case_feat = feat.loc[is_m1, :]
    m1_lab = lab.loc[is_m1, :]

    if args.reduce_case:
        nepc_case_feat = nepc_case_feat.groupby(nepc_lab['case_id']).mean()
        adeno_case_feat = adeno_case_feat.groupby(adeno_lab['case_id']).mean()
        m0_case_feat = m0_case_feat.groupby(m0_lab['case_id']).mean()
        m0p_case_feat = m0p_case_feat.groupby(m0p_lab['case_id']).mean()
        m1_case_feat = m1_case_feat.groupby(m1_lab['case_id']).mean()

    print('NEPC features:', nepc_case_feat.shape)
    print('Adeno features:', adeno_case_feat.shape)
    print('M0 features:', m0_case_feat.shape)
    print('M0p features:', m0p_case_feat.shape)
    print('M1 features:', m1_case_feat.shape)

    nepc_adeno_p = []
    m0_m1_p = []
    m0_m0p_p = []
    for c in nepc_case_feat.columns:
        nepc_ = nepc_case_feat[c].values
        adeno_ = adeno_case_feat[c].values
        m0_ = m0_case_feat[c].values
        m0p_ = m0p_case_feat[c].values
        m1_ = m1_case_feat[c].values

        tt_nepc_adeno = ttest_ind(nepc_, adeno_)
        tt_m0_m1 = ttest_ind(m0_, m1_)
        tt_m0_m0p = ttest_ind(m0_, m0p_)

        nepc_adeno_p.append(tt_nepc_adeno[1])
        m0_m1_p.append(tt_m0_m1[1])
        m0_m0p_p.append(tt_m0_m0p[1])

    nepc_adeno_reject, nepc_adeno_q, _, _ = multipletests(nepc_adeno_p,
                                                          alpha=0.01,
                                                          method='fdr_bh')
    m0_m1_reject, m0_m1_q, _, _ = multipletests(m0_m1_p,
                                                alpha=0.01,
                                                method='fdr_bh')
    m0_m0p_reject, m0_m0p_q, _, _ = multipletests(m0_m0p_p,
                                                  alpha=0.01,
                                                  method='fdr_bh')

    print('Rejecting {} '.format(np.sum(nepc_adeno_reject)))
    print('Rejecting {} '.format(np.sum(m0_m1_reject)))
    print('Rejecting {} '.format(np.sum(m0_m0p_reject)))

    np.save('nepc_adeno_reject.npy', np.array(nepc_adeno_reject))
    np.save('m0_m1_reject.npy', np.array(m0_m1_reject))
    np.save('m0_m0p_reject.npy', np.array(m0_m0p_reject))

    for i, c in enumerate(nepc_case_feat.columns):
        print('plotting feature ', c)
        # Plot whenever at least one of the three tests failed to reject the null
        tt = not (nepc_adeno_reject[i] and m0_m1_reject[i] and m0_m0p_reject[i])

        if tt:
            nepc_ = nepc_case_feat[c].values
            adeno_ = adeno_case_feat[c].values
            m0_ = m0_case_feat[c].values
            m0p_ = m0p_case_feat[c].values
            m1_ = m1_case_feat[c].values

            plt.clf()
            sns.distplot(nepc_, label='NEPC')
            sns.distplot(adeno_, label='Adeno')
            sns.distplot(m0_, label='M0')
            sns.distplot(m0p_, label='M0-P')
            sns.distplot(m1_, label='M1')

            plt.legend(frameon=True)
            plt.title('{}\nnepc q={:.3E}\nm1 q={:.3E}\nm0p q={:.3E}'.format(
                c, nepc_adeno_q[i], m0_m1_q[i], m0_m0p_q[i]))

            saveto = os.path.join(args.dst, '{}.png'.format(c))
            plt.savefig(saveto, bbox_inches='tight')
        else:
            print('skipping feature ', c)
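
The saved `*_reject.npy` masks are what Example #8 consumes through `--reject_feats`. Roughly (the flag name is the one already used above; the wiring shown here is just an illustration):

import numpy as np

# Produced here:                 np.save('m0_m1_reject.npy', np.array(m0_m1_reject))
# Consumed in Example #8 as:     usecols = np.invert(np.load(args.reject_feats))
mask = np.load('m0_m1_reject.npy')   # True where the feature's null hypothesis was rejected
usecols = np.invert(mask)            # Example #8 keeps only the non-rejected columns
print('keeping {} of {} features'.format(usecols.sum(), usecols.size))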