Code Example #1
import os
from operator import itemgetter

import numpy as np

# project-local module (assumed importable from the working directory)
import datasetIO

def main():

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/candidate_features/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/nonredundant_features'
    if not os.path.exists(results_folder):
        os.mkdir(results_folder)

    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:

        #        # just work with hpatissuesmrna for testing/debugging the pipeline
        #        if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned':
        #            print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True)
        #            continue

        # check if another python instance is already working on this dataset
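        # note: this exists-then-write pattern is only a best-effort lock;
        # two instances could pass the check at the same time. An atomic
        # variant would open the progress file with mode='xt' instead.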
        if os.path.exists('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(
                dataset_info['abbreviation']),
                  flush=True)
            continue

        # log start of processing
        with open('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']),
                  flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))

        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])
        gene_atb.columnmeta['isrowstat'] = gene_atb.columnmeta[
            'isrowstat'].astype('int64').astype('bool')

        # decide feature similarity metric
        print('deciding feature similarity metric...', flush=True)
        if ('standardized' in dataset_info['abbreviation']
                or 'cleaned' in dataset_info['abbreviation']
            ) and (gene_atb.matrix == 0).sum() / gene_atb.size <= 0.5:
            # dataset is many-valued and filled-in
            print('    dataset is many-valued and filled-in...', flush=True)
            print('    using spearman for similarity...', flush=True)
            dataset_info['feature_similarity_metric'] = 'spearman'
            dataset_info['feature_similarity_threshold'] = np.sqrt(0.5)
        else:
            # dataset is binary or tertiary or sparse
            print('    dataset is binary, tertiary, or sparse...', flush=True)
            print('    using cosine for similarity...', flush=True)
            dataset_info['feature_similarity_metric'] = 'cosine'
            dataset_info['feature_similarity_threshold'] = np.sqrt(0.5)
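            # sqrt(0.5) ~= 0.707: features are grouped when the squared
            # similarity exceeds 0.5, i.e. (for correlation) when they share
            # more than half their variance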

        # calculate feature similarity
        print('calculating feature similarity...', flush=True)
        atb_atb = gene_atb.tosimilarity(
            axis=1, metric=dataset_info['feature_similarity_metric'])

        # prioritize feature groups
        print('prioritizing feature groups...', flush=True)
        are_similar_features = np.abs(
            atb_atb.matrix) > dataset_info['feature_similarity_threshold']
        feature_group_size = are_similar_features.sum(1).astype('float64')
        feature_group_score = (np.abs(
            atb_atb.matrix) * are_similar_features).sum(1) / feature_group_size
        feature_priority = np.zeros(gene_atb.shape[1], dtype='float64')
        feature_priority[gene_atb.columnlabels == 'mean'] = 1.0
        feature_priority[gene_atb.columnlabels == 'stdv'] = 0.5
        feature_infos = list(
            zip(np.arange(gene_atb.shape[1], dtype='int64'),
                gene_atb.columnlabels.copy(), feature_group_size.copy(),
                feature_priority.copy(), feature_group_score.copy()))
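        # list.sort is stable, so after these three passes features are
        # ordered primarily by group size, then by priority, then by group
        # score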
        feature_infos.sort(key=itemgetter(4), reverse=True)
        feature_infos.sort(key=itemgetter(3), reverse=True)
        feature_infos.sort(key=itemgetter(2), reverse=True)
        #        for feature_info in feature_infos:
        #            print('{0:1.3g}, {1}, {2:1.3g}, {3:1.3g}, {4:1.3g}'.format(feature_info[0], feature_info[1], feature_info[2], feature_info[3], feature_info[4]))
        sorted_feature_indices = np.array(
            [feature_info[0] for feature_info in feature_infos], dtype='int64')
        atb_atb.reorder(sorted_feature_indices, axis=0)
        atb_atb.reorder(sorted_feature_indices, axis=1)
        gene_atb.reorder(sorted_feature_indices, axis=1)
        are_similar_features = are_similar_features[
            sorted_feature_indices, :][:, sorted_feature_indices]

        # group similar features
        print('grouping similar features...', flush=True)
        tobediscarded = np.zeros(gene_atb.shape[1], dtype='bool')
        gene_atb.columnmeta['similar_features'] = np.full(gene_atb.shape[1],
                                                          '',
                                                          dtype='object')
        gene_atb.columnmeta['preferred_rowstat'] = np.full(gene_atb.shape[1],
                                                           '',
                                                           dtype='object')
        rowstats = gene_atb.columnlabels[gene_atb.columnmeta['isrowstat']]
        with open('{0}/{1}_feature_groups.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            for i, feature in enumerate(gene_atb.columnlabels):
                if ~tobediscarded[i]:
                    # find similar features
                    print('    finding features similar to feature "{0}"...'.
                          format(feature),
                          flush=True)
                    similarity_hit = are_similar_features[i, :]
                    similarity_hit = np.logical_and(
                        similarity_hit, ~tobediscarded)  # just what's new
                    similarity_hit[:i] = False
                    similar_features = gene_atb.columnlabels[similarity_hit]
                    similarity_values = atb_atb.matrix[i, similarity_hit]
                    rowstat_is_in_group = np.in1d(rowstats, similar_features)
                    gene_atb.columnmeta['similar_features'][i] = '|'.join(
                        similar_features.tolist())
                    if rowstat_is_in_group.any():
                        # replace feature with summary stat
                        gene_atb.columnmeta['preferred_rowstat'][i] = rowstats[
                            rowstat_is_in_group.nonzero()[0][0]]
                        gene_atb.matrix[:, i] = gene_atb.select(
                            [], gene_atb.columnmeta['preferred_rowstat'][i])
                        print(
                            '        replacing feature "{0}" with summary stat "{1}"...'
                            .format(
                                feature,
                                gene_atb.columnmeta['preferred_rowstat'][i]),
                            flush=True)
                    elif similarity_hit.sum() > 1:
                        # replace feature with group average
                        print(
                            '        replacing feature "{0}" with average of {1!s} features...'
                            .format(feature, similarity_hit.sum()),
                            flush=True)
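                        # similarity values may be negative, so anticorrelated
                        # features contribute with negative weight and are
                        # effectively sign-flipped before averaging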
                        feature_weight = atb_atb.matrix[i, similarity_hit]
                        feature_weight = feature_weight / np.sum(
                            np.abs(feature_weight))
                        gene_atb.matrix[:, i] = (
                            gene_atb.matrix[:, similarity_hit] *
                            (feature_weight.reshape(1, -1))).sum(1)
                    else:
                        print('        no similar features...', flush=True)
                    fw.write('\t'.join([
                        '{0}|{1:1.6g}'.format(f, v)
                        for f, v in zip(similar_features, similarity_values)
                    ]) + '\n')
                    similarity_hit[i] = False
                    tobediscarded = np.logical_or(tobediscarded,
                                                  similarity_hit)

        # discard features absorbed into group features
        print('discarding features absorbed into group features...',
              flush=True)
        if tobediscarded.any():
            # discard features
            print('    discarding {0!s} features. {1!s} features remaining...'.
                  format(tobediscarded.sum(), (~tobediscarded).sum()),
                  flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print('    no features to discard. {0!s} features remaining...'.
                  format(gene_atb.shape[1]),
                  flush=True)

        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print('    nothing to save...', flush=True)
        else:
            # save nonredundant features
            print('    saving {0!s} nonredundant features...'.format(
                gene_atb.shape[1]),
                  flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(
                results_folder, dataset_info['abbreviation'])
            dataset_info['nonredundant_genes'] = gene_atb.shape[0]
            dataset_info['nonredundant_features'] = gene_atb.shape[1]
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo(
                '{0}/dataset_info.txt'.format(results_folder), dataset_info)

    print('done.', flush=True)
Code Example #2
import copy
import os

import numpy as np

# project-local modules (assumed importable); note this 'dataclasses' is the
# project's datamatrix module, which shadows the standard-library name
import dataclasses
import datasetIO

def main():

    # load class examples
    print('loading class examples...', flush=True)
    class_examples_folder = 'targets/pharmaprojects'
    class_examples = {
        'positive':
        datasetIO.load_examples(
            '{0}/positive.txt'.format(class_examples_folder)),
        'negative':
        datasetIO.load_examples(
            '{0}/negative.txt'.format(class_examples_folder)),
        'unknown':
        datasetIO.load_examples(
            '{0}/unknown.txt'.format(class_examples_folder))
    }

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/harmonizome/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/candidate_features'
    if not os.path.exists(results_folder):
        os.mkdir(results_folder)

    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:

        #        # just work with hpatissuesmrna for testing/debugging the pipeline
        #        if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned':
        #            print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True)
        #            continue

        # check if another python instance is already working on this dataset
        if os.path.exists('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(
                dataset_info['abbreviation']),
                  flush=True)
            continue

        # log start of processing
        with open('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']),
                  flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))

        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])
        dataset_info['original_genes'] = gene_atb.shape[0]
        dataset_info['original_features'] = gene_atb.shape[1]

        # decide feature normalization
        print('deciding feature normalization...', flush=True)
        if ('standardized' in dataset_info['abbreviation']
                or 'cleaned' in dataset_info['abbreviation']
            ) and (gene_atb.matrix == 0).sum() / gene_atb.size <= 0.5:
            # dataset is many-valued and filled-in
            print('    dataset is many-valued and filled-in...', flush=True)
            print('    z-scoring features...', flush=True)
            dataset_info['feature_normalization'] = 'z-score'
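            # note: a constant feature makes sdv 0 and turns its whole
            # column into NaN (0/0); the labelled-example NaN filter below
            # discards such features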
            mnv = np.nanmean(gene_atb.matrix, axis=0, keepdims=True)
            sdv = np.nanstd(gene_atb.matrix, axis=0, keepdims=True)
            gene_atb.matrix = (gene_atb.matrix - mnv) / sdv
            gene_atb.columnmeta['mean'] = mnv.reshape(-1)
            gene_atb.columnmeta['stdv'] = sdv.reshape(-1)
        else:
            # dataset is binary or tertiary or sparse
            print('    dataset is binary, tertiary, or sparse...', flush=True)
            print('    no feature normalization...', flush=True)
            dataset_info['feature_normalization'] = 'none'

        # assign class labels to genes
        print('assigning class labels to genes...', flush=True)
        gene_atb.rowmeta['class'] = np.full(gene_atb.shape[0],
                                            'unknown',
                                            dtype='object')
        gene_atb.rowmeta['class'][np.in1d(
            gene_atb.rowlabels, list(class_examples['positive']))] = 'positive'
        gene_atb.rowmeta['class'][np.in1d(
            gene_atb.rowlabels, list(class_examples['negative']))] = 'negative'

        # add dataset mean and stdv as features
        print('adding dataset mean and stdv as features...', flush=True)
        gene_stat = dataclasses.datamatrix(
            rowname=gene_atb.rowname,
            rowlabels=gene_atb.rowlabels.copy(),
            rowmeta=copy.deepcopy(gene_atb.rowmeta),
            columnname=gene_atb.columnname,
            columnlabels=np.array(['mean', 'stdv'], dtype='object'),
            columnmeta={},
            matrixname=gene_atb.matrixname,
            matrix=np.append(gene_atb.matrix.mean(1, keepdims=True),
                             gene_atb.matrix.std(1, keepdims=True), 1))
        gene_atb.append(gene_stat, 1)
        gene_atb.columnmeta['isrowstat'] = np.in1d(gene_atb.columnlabels,
                                                   gene_stat.columnlabels)
        del gene_stat

        # identify features with little information about labelled examples
        print(
            'identifying features with little information about labelled examples...',
            flush=True)
        isunknown = gene_atb.rowmeta['class'] == 'unknown'
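        # discard features that, among the labelled examples, have fewer
        # than 3 nonzero values, fewer than 3 values other than 1, or any
        # NaN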
        tobediscarded = np.logical_or.reduce(
            ((gene_atb.matrix[~isunknown, :] != 0).sum(axis=0) < 3,
             (gene_atb.matrix[~isunknown, :] != 1).sum(axis=0) < 3,
             np.isnan(gene_atb.matrix[~isunknown, :]).any(axis=0)))
        if tobediscarded.any():
            # discard features
            print('    discarding {0!s} features. {1!s} features remaining...'.
                  format(tobediscarded.sum(), (~tobediscarded).sum()),
                  flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print('    no features to discard. {0!s} features remaining...'.
                  format(gene_atb.shape[1]),
                  flush=True)

        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print('    nothing to save...', flush=True)
        else:
            # save candidate features
            print('    saving {0!s} candidate features...'.format(
                gene_atb.shape[1]),
                  flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(
                results_folder, dataset_info['abbreviation'])
            dataset_info['candidate_genes'] = gene_atb.shape[0]
            dataset_info['candidate_features'] = gene_atb.shape[1]
            dataset_info['positive_examples'] = (
                gene_atb.rowmeta['class'] == 'positive').sum()
            dataset_info['negative_examples'] = (
                gene_atb.rowmeta['class'] == 'negative').sum()
            dataset_info['unknown_examples'] = (
                gene_atb.rowmeta['class'] == 'unknown').sum()
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo(
                '{0}/dataset_info.txt'.format(results_folder), dataset_info)

    print('done.', flush=True)
Code Example #3
import copy
import os

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# project-local modules (assumed importable); this 'dataclasses' is the
# project's datamatrix module, which shadows the standard-library name
import dataclasses
import datasetIO
import modelevaluation

def main(validation_rep=0, validation_fold=0):

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/merged_features/rep{0!s}_fold{1!s}/dataset_info.txt'.format(
        validation_rep, validation_fold)
    dataset_info = datasetIO.load_datasetinfo(dataset_info_path)[0]

    # load validation examples
    print('loading validation examples...', flush=True)
    validation_examples_path = 'targets/validation_examples/rep{0!s}_fold{1!s}.txt'.format(
        validation_rep, validation_fold)
    with open(validation_examples_path,
              mode='rt',
              encoding='utf-8',
              errors='surrogateescape') as fr:
        validation_examples = fr.read().split('\n')

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/useful_features/rep{0!s}_fold{1!s}'.format(
        validation_rep, validation_fold)
    results_folder_parts = results_folder.split('/')
    for i in range(len(results_folder_parts)):
        results_folder_part = '/'.join(results_folder_parts[:i + 1])
        if not os.path.exists(results_folder_part):
            os.mkdir(results_folder_part)
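    # note: this incremental mkdir loop is equivalent to
    # os.makedirs(results_folder, exist_ok=True)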

    # load dataset
    print('loading dataset {0}...'.format(dataset_info['abbreviation']),
          flush=True)
    gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])

    # specify cross-validation parameters
    print('specifying cross-validation parameters...', flush=True)
    reps = 20
    folds = 5
    rf_trees = 1000
    include_logistic_regression = True
    skf = StratifiedKFold(n_splits=folds, shuffle=True)
    print('    reps: {0!s}'.format(reps))
    print('    folds: {0!s}'.format(folds))

    # initialize models
    print('initializing models...', flush=True)
    rfmodel = RandomForestClassifier(n_estimators=rf_trees,
                                     oob_score=False,
                                     n_jobs=-1,
                                     class_weight='balanced')
    print(rfmodel)
    lrmodel = LogisticRegression(penalty='l2',
                                 dual=False,
                                 tol=0.0001,
                                 C=1e3,
                                 fit_intercept=True,
                                 intercept_scaling=1e3,
                                 class_weight='balanced',
                                 random_state=None,
                                 solver='liblinear',
                                 max_iter=100,
                                 multi_class='ovr',
                                 verbose=0,
                                 warm_start=False,
                                 n_jobs=1)
    print(lrmodel)
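    # C=1e3 and intercept_scaling=1e3 make the L2 penalty very weak, so this
    # is close to an unregularized logistic regression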

    # initialize data matrices for collecting model feature importances and cross-validation performance stats
    print(
        'initializing data matrices for collecting model feature importances and cross-validation performance stats...',
        flush=True)
    classifier_stats = np.array([
        'p', 'n', 'ap', 'an', 'pp', 'pn', 'tp', 'fp', 'tn', 'fn', 'tpr', 'fpr',
        'auroc', 'fnr', 'tnr', 'mcr', 'acc', 'fdr', 'ppv', 'auprc', 'fomr',
        'npv', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1s', 'mcc',
        'fnlp'
    ],
                                dtype='object')
    sm = dataclasses.datamatrix(
        rowname='classifier_performance_stat',
        rowlabels=classifier_stats.copy(),
        rowmeta={},
        columnname='model',
        columnlabels=np.array(['M' + str(x) for x in range(gene_atb.shape[1])],
                              dtype='object'),
        columnmeta={
            'num_features': np.zeros(gene_atb.shape[1], dtype='int64'),
            'features': np.full(gene_atb.shape[1], '', dtype='object'),
            'oob_score': np.zeros(gene_atb.shape[1], dtype='float64')
        },
        matrixname='crossvalidation_classifier_performance_stats_vs_models',
        matrix=np.zeros((classifier_stats.size, gene_atb.shape[1]),
                        dtype='float64'))
    stat_model_rf_mean = copy.deepcopy(sm)
    stat_model_rf_stdv = copy.deepcopy(sm)
    stat_model_lr_mean = copy.deepcopy(sm)
    stat_model_lr_stdv = copy.deepcopy(sm)
    del sm
    fm = dataclasses.datamatrix(
        rowname=gene_atb.columnname,
        rowlabels=gene_atb.columnlabels.copy(),
        rowmeta=copy.deepcopy(gene_atb.columnmeta),
        columnname='model',
        columnlabels=np.array(['M' + str(x) for x in range(gene_atb.shape[1])],
                              dtype='object'),
        columnmeta={
            'num_features': np.zeros(gene_atb.shape[1], dtype='int64'),
            'features': np.full(gene_atb.shape[1], '', dtype='object'),
            'oob_score': np.zeros(gene_atb.shape[1], dtype='float64')
        },
        matrixname='model_feature_importances',
        matrix=np.zeros((gene_atb.shape[1], gene_atb.shape[1]),
                        dtype='float64'))
    feature_model_rf = copy.deepcopy(fm)
    feature_model_lr = copy.deepcopy(fm)
    del fm

    # exclude validation and unlabeled examples from cross-validation loop
    print(
        'excluding validation and unlabeled examples from cross-validation loop...',
        flush=True)
    isvalidation = np.in1d(gene_atb.rowlabels, validation_examples)
    isunknown = gene_atb.rowmeta['class'] == 'unknown'
    istraintest = ~np.logical_or(isvalidation, isunknown)
    Y = (gene_atb.rowmeta['class'][istraintest] == 'positive')
    #X = gene_atb.matrix[istraintest,:]

    # perform incremental feature elimination with cross-validation
    print(
        'performing incremental feature elimination with cross-validation...',
        flush=True)
    for i in range(gene_atb.shape[1]):
        print('    features: {0!s}...'.format(gene_atb.shape[1] - i),
              flush=True)
        if i == 0:
            hit_rf = np.ones(gene_atb.shape[1], dtype='bool')
            hit_lr = np.ones(gene_atb.shape[1], dtype='bool')
        else:
            # keep every feature whose mean importance in the previous model
            # exceeded the smallest nonzero importance, i.e. drop the least
            # important feature each iteration
            prev_fi = feature_model_rf.matrix[:, i - 1]
            hit_rf = prev_fi > prev_fi[prev_fi > 0].min()
            #hit_lr = feature_model_lr.matrix[:,i-1] > feature_model_lr.matrix[feature_model_lr.matrix[:,i-1] > 0,i-1].min()
            hit_lr = hit_rf
        X_rf = gene_atb.matrix[istraintest, :][:, hit_rf]
        X_lr = gene_atb.matrix[istraintest, :][:, hit_lr]
        stat_rep_rf = np.zeros((classifier_stats.size, reps), dtype='float64')
        stat_rep_lr = np.zeros((classifier_stats.size, reps), dtype='float64')
        fi_rep_rf = np.zeros((X_rf.shape[1], reps), dtype='float64')
        fi_rep_lr = np.zeros((X_lr.shape[1], reps), dtype='float64')
        for rep in range(reps):
            print('        rep {0!s} of {1!s}...'.format(rep + 1, reps),
                  flush=True)
            Ptest_rf = np.zeros(Y.size, dtype='float64')
            Ptest_lr = np.zeros(Y.size, dtype='float64')
            fi_fold_rf = np.zeros((X_rf.shape[1], folds), dtype='float64')
            fi_fold_lr = np.zeros((X_lr.shape[1], folds), dtype='float64')
            for fold, (train_indices,
                       test_indices) in enumerate(skf.split(X_rf, Y)):
                print('            fold {0!s} of {1!s}...'.format(
                    fold + 1, folds),
                      flush=True)
                Y_train = Y[train_indices]
                X_rf_train = X_rf[train_indices]
                X_lr_train = X_lr[train_indices]
                #Y_test = Y[test_indices]
                X_rf_test = X_rf[test_indices]
                X_lr_test = X_lr[test_indices]
                rfmodel.fit(X_rf_train, Y_train)
                Ptest_rf[test_indices] = rfmodel.predict_proba(
                    X_rf_test)[:, rfmodel.classes_ == 1].reshape(-1)
                fi_fold_rf[:, fold] = rfmodel.feature_importances_
                lrmodel.fit(X_lr_train, Y_train)
                Ptest_lr[test_indices] = lrmodel.predict_proba(
                    X_lr_test)[:, lrmodel.classes_ == 1].reshape(-1)
                fi_fold_lr[:, fold] = np.abs(lrmodel.coef_.reshape(-1))
            fi_rep_rf[:, rep] = fi_fold_rf.mean(1)
            stat_cut = modelevaluation.get_classifier_performance_stats(
                Y=Y,
                P=Ptest_rf,
                classifier_stats=classifier_stats,
                plot_curves=False,
                get_priority_cutoffs=True)
            stat_rep_rf[:, rep] = stat_cut.matrix[:, stat_cut.columnmeta[
                'p50_cutoff']].reshape(-1)
            fi_rep_lr[:, rep] = fi_fold_lr.mean(1)
            stat_cut = modelevaluation.get_classifier_performance_stats(
                Y=Y,
                P=Ptest_lr,
                classifier_stats=classifier_stats,
                plot_curves=False,
                get_priority_cutoffs=True)
            stat_rep_lr[:, rep] = stat_cut.matrix[:, stat_cut.columnmeta[
                'p50_cutoff']].reshape(-1)
        feature_model_rf.matrix[hit_rf, i] = fi_rep_rf.mean(1)
        feature_model_rf.columnmeta['num_features'][i] = gene_atb.shape[1] - i
        feature_model_rf.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_rf].tolist())
        stat_model_rf_mean.matrix[:, i] = stat_rep_rf.mean(1)
        stat_model_rf_mean.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_rf_mean.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_rf].tolist())
        stat_model_rf_stdv.matrix[:, i] = stat_rep_rf.std(1)
        stat_model_rf_stdv.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_rf_stdv.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_rf].tolist())
        feature_model_lr.matrix[hit_lr, i] = fi_rep_lr.mean(1)
        feature_model_lr.columnmeta['num_features'][i] = gene_atb.shape[1] - i
        feature_model_lr.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_lr].tolist())
        stat_model_lr_mean.matrix[:, i] = stat_rep_lr.mean(1)
        stat_model_lr_mean.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_lr_mean.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_lr].tolist())
        stat_model_lr_stdv.matrix[:, i] = stat_rep_lr.std(1)
        stat_model_lr_stdv.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_lr_stdv.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_lr].tolist())

    # concatenate data matrices with model feature importances
    print('concatenating data matrices with model feature importances...',
          flush=True)
    feature_model_rf.columnlabels += '_rf'
    feature_model_rf.columnmeta['model_type'] = np.full(
        feature_model_rf.shape[1], 'random_forest', dtype='object')
    feature_model_lr.columnlabels += '_lr'
    feature_model_lr.columnmeta['model_type'] = np.full(
        feature_model_lr.shape[1], 'logistic_regression', dtype='object')
    feature_model_rf.append(feature_model_lr, 1)
    feature_model = feature_model_rf
    del feature_model_rf, feature_model_lr

    # concatenate data matrices with model cross-validation performance stats
    print(
        'concatenating data matrices with model cross-validation performance stats...',
        flush=True)
    stat_model_rf_mean.rowlabels += '_mean'
    stat_model_rf_stdv.rowlabels += '_stdv'
    stat_model_rf_mean.append(stat_model_rf_stdv, 0)
    stat_model_rf_mean.columnlabels += '_rf'
    stat_model_rf_mean.columnmeta['model_type'] = np.full(
        stat_model_rf_mean.shape[1], 'random_forest', dtype='object')
    stat_model_lr_mean.rowlabels += '_mean'
    stat_model_lr_stdv.rowlabels += '_stdv'
    stat_model_lr_mean.append(stat_model_lr_stdv, 0)
    stat_model_lr_mean.columnlabels += '_lr'
    stat_model_lr_mean.columnmeta['model_type'] = np.full(
        stat_model_lr_mean.shape[1], 'logistic_regression', dtype='object')
    stat_model_rf_mean.append(stat_model_lr_mean, 1)
    stat_model = stat_model_rf_mean
    del stat_model_rf_mean

    # select simplest model (fewest features) with auroc and auprc within 95% of max
    print(
        'selecting simplest model (fewest features) with auroc and auprc within 95% of max...',
        flush=True)
    model_scores = 0.5 * (stat_model.select('auroc_mean', []) +
                          stat_model.select('auprc_mean', []))
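    # model columns run from most to fewest features within each model-type
    # block, so taking the last qualifying index selects the simplest model
    # that still scores within 95% of the maximum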
    if include_logistic_regression:
        selected_model_index = np.where(
            model_scores >= 0.95 * model_scores.max())[0][-1]
    else:
        selected_model_index = np.where(
            np.logical_and(
                model_scores >=
                0.95 * model_scores[stat_model.columnmeta['model_type'] ==
                                    'random_forest'].max(),
                stat_model.columnmeta['model_type'] == 'random_forest'))[0][-1]
    selected_model_name = stat_model.columnlabels[selected_model_index]
    selected_model_features = feature_model.rowlabels[
        feature_model.matrix[:, selected_model_index] != 0]
    selected_model_type = stat_model.columnmeta['model_type'][
        selected_model_index]
    selected_model = rfmodel if selected_model_type == 'random_forest' else lrmodel
    gene_atb = gene_atb.tolabels(columnlabels=selected_model_features)
    feature_model_selected = feature_model.tolabels(
        columnlabels=selected_model_name)
    stat_model_selected = stat_model.tolabels(columnlabels=selected_model_name)
    print('    selected_model_name: {0}'.format(selected_model_name),
          flush=True)
    print('    selected_model_features: {0}'.format(
        '|'.join(selected_model_features)),
          flush=True)

    # iterate over selected features to rebuild design matrix
    print('iterating over selected features to rebuild design matrix...',
          flush=True)
    for i, (selected_feature, dataset_abbreviation) in enumerate(
            zip(gene_atb.columnlabels,
                gene_atb.columnmeta['dataset_abbreviation'])):

        # load dataset
        print('    loading dataset {0}...'.format(dataset_abbreviation),
              flush=True)
        dataset_path = 'datasets/generalizable_features/rep{0!s}_fold{1!s}/{2}.txt.gz'.format(
            validation_rep, validation_fold, dataset_abbreviation)
        gene_atb_i = datasetIO.load_datamatrix(dataset_path)
        gene_atb_i.columnmeta[
            'generalizability_pvalues_corrected'] = gene_atb_i.columnmeta[
                'generalizability_pvalues_corrected'].astype('float64')
        gene_atb_i.columnmeta['dataset_abbreviation'] = np.full(
            gene_atb_i.shape[1], dataset_abbreviation, dtype='object')
        gene_atb_i.columnmeta[
            'dataset_feature'] = gene_atb_i.columnlabels.copy()
        gene_atb_i.columnlabels += '_' + dataset_abbreviation
        gene_atb_i.rowname = 'GeneSym'
        gene_atb_i.columnname = 'Feature'
        if dataset_abbreviation == 'gtextissue_cleaned':
            gene_atb_i.discard(gene_atb_i.rowlabels == 'C12ORF55',
                               0)  # pesky duplicate row
        print(gene_atb_i)

        # select feature
        print('    selecting feature {0}...'.format(selected_feature),
              flush=True)
        gene_atb_i.discard(gene_atb_i.columnlabels != selected_feature, 1)

        # merge dataset
        print('    merging dataset...', flush=True)
        if i == 0:
            gene_atb_selected = copy.deepcopy(gene_atb_i)
            gene_atb_selected.matrixname = 'merged_target_features'
            print('        first dataset, no merge...', flush=True)
        else:
            common_genes = np.intersect1d(gene_atb_selected.rowlabels,
                                          gene_atb_i.rowlabels)
            gene_atb_selected = gene_atb_selected.tolabels(
                rowlabels=common_genes)
            gene_atb_i = gene_atb_i.tolabels(rowlabels=common_genes)
            gene_atb_selected.append(gene_atb_i, 1)
            print('        common_genes: {0!s}...'.format(common_genes.size),
                  flush=True)

    # normalize features
    print('normalizing features...', flush=True)
    gene_atb_selected.columnmeta['min'] = gene_atb_selected.matrix.min(0)
    gene_atb_selected.columnmeta['max'] = gene_atb_selected.matrix.max(0)
    gene_atb_selected.matrix = (
        gene_atb_selected.matrix - gene_atb_selected.columnmeta['min'].reshape(
            1, -1)) / (gene_atb_selected.columnmeta['max'].reshape(1, -1) -
                       gene_atb_selected.columnmeta['min'].reshape(1, -1))

    # update metadata
    print('updating metadata...', flush=True)
    assert (gene_atb.columnlabels == gene_atb_selected.columnlabels).all()
    for field, values in gene_atb.columnmeta.items():
        if field not in gene_atb_selected.columnmeta:
            gene_atb_selected.columnmeta[field] = values
    print('old_num_genes:{0!s}\tnew_num_genes:{1!s}'.format(
        gene_atb.shape[0], gene_atb_selected.shape[0]),
          flush=True)
    del gene_atb

    # refit selected model
    print('refitting selected model...', flush=True)
    isvalidation = np.in1d(gene_atb_selected.rowlabels, validation_examples)
    isunknown = gene_atb_selected.rowmeta['class'] == 'unknown'
    istraintest = ~np.logical_or(isvalidation, isunknown)
    selected_model.fit(
        gene_atb_selected.matrix[istraintest, :],
        gene_atb_selected.rowmeta['class'][istraintest] == 'positive')

    # get predictions for validation and unlabelled examples
    print('getting predictions for validation and unlabelled examples...',
          flush=True)
    gene_model_selected = dataclasses.datamatrix(
        rowname=gene_atb_selected.rowname,
        rowlabels=gene_atb_selected.rowlabels.copy(),
        rowmeta=copy.deepcopy(gene_atb_selected.rowmeta),
        columnname=stat_model_selected.columnname,
        columnlabels=stat_model_selected.columnlabels.copy(),
        columnmeta=copy.deepcopy(stat_model_selected.columnmeta),
        matrixname=
        'success_probabilities_for_validation_and_unlabelled_examples',
        matrix=selected_model.predict_proba(
            gene_atb_selected.matrix)[:, selected_model.classes_ == 1])
    gene_model_selected.discard(istraintest, 0)
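    # dropping the train/test rows leaves predictions only for the
    # validation and unlabelled examples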

    # save results
    print('saving {0!s} useful features and model results...'.format(
        gene_atb_selected.shape[1]),
          flush=True)
    dataset_info['path'] = '{0}/{1}.txt.gz'.format(
        results_folder, dataset_info['abbreviation'])
    dataset_info['selected_model_name'] = selected_model_name
    dataset_info['selected_model_features'] = '|'.join(selected_model_features)
    dataset_info['selected_model_type'] = selected_model_type
    dataset_info['crossvalidation_reps'] = reps
    dataset_info['crossvalidation_folds'] = folds
    dataset_info['rf_trees'] = rf_trees
    dataset_info['include_logistic_regression'] = include_logistic_regression
    for stat_name, stat_values in zip(stat_model_selected.rowlabels,
                                      stat_model_selected.matrix):
        dataset_info[stat_name] = stat_values.item()
    datasetIO.save_datamatrix(dataset_info['path'], gene_atb_selected)
    datasetIO.save_datamatrix('{0}/stat_model.txt.gz'.format(results_folder),
                              stat_model)
    datasetIO.save_datamatrix(
        '{0}/feature_model.txt.gz'.format(results_folder), feature_model)
    datasetIO.save_datamatrix(
        '{0}/stat_model_selected.txt.gz'.format(results_folder),
        stat_model_selected)
    datasetIO.save_datamatrix(
        '{0}/feature_model_selected.txt.gz'.format(results_folder),
        feature_model_selected)
    datasetIO.save_datamatrix(
        '{0}/gene_model_selected.txt.gz'.format(results_folder),
        gene_model_selected)
    datasetIO.append_datasetinfo('{0}/dataset_info.txt'.format(results_folder),
                                 dataset_info)

    print('done.', flush=True)
Code Example #4
import os

import numpy as np

# project-local modules (assumed importable from the working directory)
import datasetIO
import featureselection

def main(validation_rep=0, validation_fold=0):

    # load target clusters
    print('loading target cluster assignments...', flush=True)
    target_cluster_path = 'targets/clusters/gene_cluster_byfamily.pickle'
    gene_cluster = datasetIO.load_clusterassignments(target_cluster_path)

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/nonredundant_features/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)

    # load validation examples
    print('loading validation examples...', flush=True)
    validation_examples_path = 'targets/validation_examples/rep{0!s}_fold{1!s}.txt'.format(
        validation_rep, validation_fold)
    with open(validation_examples_path,
              mode='rt',
              encoding='utf-8',
              errors='surrogateescape') as fr:
        validation_examples = fr.read().split('\n')

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/generalizable_features/rep{0!s}_fold{1!s}'.format(
        validation_rep, validation_fold)
    results_folder_parts = results_folder.split('/')
    for i in range(len(results_folder_parts)):
        results_folder_part = '/'.join(results_folder_parts[:i + 1])
        if not os.path.exists(results_folder_part):
            os.mkdir(results_folder_part)

    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:

        #        # just work with hpatissuesmrna for testing/debugging the pipeline
        #        if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned':
        #            print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True)
        #            continue

        # check if another python instance is already working on this dataset
        if os.path.exists('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(
                dataset_info['abbreviation']),
                  flush=True)
            continue

        # log start of processing
        with open('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']),
                  flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))

        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])

        # specify feature generalizability test parameters
        print('specifying feature generalizability test parameters...',
              flush=True)
        dataset_info[
            'feature_generalizability_test_function'] = featureselection.univariate_grouppreserved_permtest
        dataset_info[
            'feature_generalizability_test_permutations'] = 10000  # 100000
        dataset_info[
            'feature_generalizability_test_targetclusterpath'] = target_cluster_path
        dataset_info[
            'multiple_hypothesis_testing_correction_function'] = featureselection.multiple_hypothesis_testing_correction
        dataset_info[
            'multiple_hypothesis_testing_correction_method'] = 'fdr_by'
        dataset_info['multiple_hypothesis_testing_correction_threshold'] = 0.05
        print('   feature_generalizability_test_function: {0}'.format(
            dataset_info['feature_generalizability_test_function']),
              flush=True)
        print('   feature_generalizability_test_permutations: {0!s}'.format(
            dataset_info['feature_generalizability_test_permutations']),
              flush=True)
        print('   feature_generalizability_test_targetclusterpath: {0}'.format(
            dataset_info['feature_generalizability_test_targetclusterpath']),
              flush=True)
        print('   multiple_hypothesis_testing_correction_function: {0}'.format(
            dataset_info['multiple_hypothesis_testing_correction_function']),
              flush=True)
        print('   multiple_hypothesis_testing_correction_method: {0}'.format(
            dataset_info['multiple_hypothesis_testing_correction_method']),
              flush=True)
        print('   multiple_hypothesis_testing_correction_threshold: {0!s}'.
              format(dataset_info[
                  'multiple_hypothesis_testing_correction_threshold']),
              flush=True)

        # exclude validation and unlabeled examples from significance calculation
        print(
            'excluding validation and unlabeled examples from significance calculation...',
            flush=True)
        isvalidation = np.in1d(gene_atb.rowlabels, validation_examples)
        isunknown = gene_atb.rowmeta['class'] == 'unknown'
        istraintest = ~np.logical_or(isvalidation, isunknown)

        # compute feature generalizability with multiple hypothesis testing correction
        print(
            'computing feature generalizability with multiple hypothesis testing correction...',
            flush=True)
        gene_atb.rowmeta['cluster'] = np.array([
            gene_cluster[g] if g in gene_cluster else -1
            for g in gene_atb.rowlabels
        ],
                                               dtype='int64')
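        # the group-preserved permutation test (a project-local routine)
        # presumably permutes class labels only within gene clusters, so
        # correlated, related genes do not inflate significance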
        gene_atb.columnmeta[
            'generalizability_test_statistic_values'], gene_atb.columnmeta[
                'generalizability_pvalues'] = dataset_info[
                    'feature_generalizability_test_function'](
                        X=gene_atb.matrix[istraintest, :],
                        Y=(gene_atb.rowmeta['class'][istraintest] == 'positive'
                           ),
                        G=gene_atb.rowmeta['cluster'][istraintest],
                        numperm=dataset_info[
                            'feature_generalizability_test_permutations'])
        gene_atb.columnmeta['is_generalizable'], gene_atb.columnmeta[
            'generalizability_pvalues_corrected'] = dataset_info[
                'multiple_hypothesis_testing_correction_function'](
                    gene_atb.columnmeta['generalizability_pvalues'],
                    alpha=dataset_info[
                        'multiple_hypothesis_testing_correction_threshold'],
                    method=dataset_info[
                        'multiple_hypothesis_testing_correction_method'])
        gene_atb.columnmeta['generalizability_correlation_sign'] = np.sign(
            gene_atb.columnmeta['generalizability_test_statistic_values'])
        if (gene_atb.columnmeta['generalizability_pvalues'] <
                1 / dataset_info['feature_generalizability_test_permutations']
            ).any():
            print(
                '    warning: not enough permutations to establish all pvalues...',
                flush=True)
        tobediscarded = np.logical_or(
            np.isnan(gene_atb.columnmeta['generalizability_pvalues']),
            np.isnan(
                gene_atb.columnmeta['generalizability_pvalues_corrected']))
        if tobediscarded.any():
            gene_atb.discard(tobediscarded, axis=1)

        # prioritize features
        print('prioritizing features...', flush=True)
        sortedindices = np.argsort(
            gene_atb.columnmeta['generalizability_pvalues_corrected'])
        gene_atb.reorder(sortedindices, axis=1)

        # save feature generalizability info
        print('saving feature generalizability info...', flush=True)
        with open('{0}/{1}_feature_generalizability_info.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            writelist = [
                'dataset', 'abbreviation', 'feature',
                'generalizability_test_statistic', 'generalizability_pvalue',
                'generalizability_pvalue_corrected', 'is_generalizable',
                'generalizability_correlation_sign', 'preferred_rowstat',
                'similar_features'
            ]
            fw.write('\t'.join(writelist) + '\n')
            for j, feature in enumerate(gene_atb.columnlabels):
                writelist = [
                    dataset_info['name'], dataset_info['abbreviation'],
                    feature, '{0:1.5g}'.format(gene_atb.columnmeta[
                        'generalizability_test_statistic_values'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.columnmeta['generalizability_pvalues'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.
                        columnmeta['generalizability_pvalues_corrected'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.columnmeta['is_generalizable'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.
                        columnmeta['generalizability_correlation_sign'][j]),
                    gene_atb.columnmeta['preferred_rowstat'][j],
                    gene_atb.columnmeta['similar_features'][j]
                ]
                fw.write('\t'.join(writelist) + '\n')

        # discard features that are not generalizable
        print('discarding features that are not generalizable...', flush=True)
        tobediscarded = ~gene_atb.columnmeta['is_generalizable']
        if tobediscarded.any():
            # discard features
            print('    discarding {0!s} features. {1!s} features remaining...'.
                  format(tobediscarded.sum(), (~tobediscarded).sum()),
                  flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print('    no features to discard. {0!s} features remaining...'.
                  format(gene_atb.shape[1]),
                  flush=True)

        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print('    nothing to save...', flush=True)
        else:
            # save generalizable features
            print('    saving {0!s} generalizable features...'.format(
                gene_atb.shape[1]),
                  flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(
                results_folder, dataset_info['abbreviation'])
            dataset_info['generalizable_genes'] = gene_atb.shape[0]
            dataset_info['generalizable_features'] = gene_atb.shape[1]
            dataset_info[
                'feature_generalizability_test_function'] = 'featureselection.univariate_grouppreserved_permtest'
            dataset_info[
                'multiple_hypothesis_testing_correction_function'] = 'featureselection.multiple_hypothesis_testing_correction'
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo(
                '{0}/dataset_info.txt'.format(results_folder), dataset_info)

    print('done.', flush=True)
Code Example #5
import copy
import os
from operator import itemgetter

import numpy as np

# project-local module (assumed importable from the working directory)
import datasetIO

def main(validation_rep=0, validation_fold=0):

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/generalizable_features/rep{0!s}_fold{1!s}/dataset_info.txt'.format(
        validation_rep, validation_fold)
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/merged_features/rep{0!s}_fold{1!s}'.format(
        validation_rep, validation_fold)
    results_folder_parts = results_folder.split('/')
    for i in range(len(results_folder_parts)):
        results_folder_part = '/'.join(results_folder_parts[:i + 1])
        if not os.path.exists(results_folder_part):
            os.mkdir(results_folder_part)

    # exclude mouse and small datasets
    print('excluding mouse datasets and datasets with few genes...',
          flush=True)
    dataset_infos = [
        dataset_info for dataset_info in dataset_infos
        if 'mouse' not in dataset_info['abbreviation']
        and int(dataset_info['generalizable_genes']) > 1900
    ]

    # exclude brain atlas datasets unless they're the only choice
    not_brainatlas = [
        'brainatlas' not in dataset_info['abbreviation']
        for dataset_info in dataset_infos
    ]
    if sum(not_brainatlas) > 0:
        print('excluding brain atlas datasets...', flush=True)
        dataset_infos = [
            dataset_info
            for dataset_info, nba in zip(dataset_infos, not_brainatlas) if nba
        ]

    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for i, dataset_info in enumerate(dataset_infos):

        # load dataset
        print('loading dataset {0}...'.format(dataset_info['abbreviation']),
              flush=True)
        gene_atb_i = datasetIO.load_datamatrix(
            datasetpath=dataset_info['path'])
        gene_atb_i.columnmeta[
            'generalizability_pvalues_corrected'] = gene_atb_i.columnmeta[
                'generalizability_pvalues_corrected'].astype('float64')
        gene_atb_i.columnmeta['dataset_abbreviation'] = np.full(
            gene_atb_i.shape[1], dataset_info['abbreviation'], dtype='object')
        gene_atb_i.columnmeta[
            'dataset_feature'] = gene_atb_i.columnlabels.copy()
        gene_atb_i.columnlabels += '_' + dataset_info['abbreviation']
        gene_atb_i.rowname = 'GeneSym'
        gene_atb_i.columnname = 'Feature'
        if dataset_info['abbreviation'] == 'gtextissue_cleaned':
            gene_atb_i.discard(gene_atb_i.rowlabels == 'C12ORF55',
                               0)  # pesky duplicate row
        print(gene_atb_i)

        # merge dataset
        print('merging dataset...', flush=True)
        if i == 0:
            gene_atb = copy.deepcopy(gene_atb_i)
            gene_atb.matrixname = 'merged_target_features'
            print('    first dataset, no merge...', flush=True)
        else:
            common_genes = np.intersect1d(gene_atb.rowlabels,
                                          gene_atb_i.rowlabels)
            gene_atb = gene_atb.tolabels(rowlabels=common_genes)
            gene_atb_i = gene_atb_i.tolabels(rowlabels=common_genes)
            gene_atb.append(gene_atb_i, 1)
            print('    common_genes: {0!s}...'.format(common_genes.size),
                  flush=True)

    # specify merged dataset info
    print('specifying merged dataset info...', flush=True)
    dataset_info = {
        'abbreviation': 'merged',
        'name': 'Merged Generalizable Target Features',
        'path': '{0}/{1}.txt.gz'.format(results_folder, 'merged'),
        'feature_normalization': 'min-max',
        'feature_similarity_metric': 'cosine',
        'feature_similarity_threshold': np.sqrt(0.5),
        'genes': gene_atb.shape[0],
        'features': gene_atb.shape[1],
        'positives': (gene_atb.rowmeta['class'] == 'positive').sum(),
        'negatives': (gene_atb.rowmeta['class'] == 'negative').sum(),
        'unknowns': (gene_atb.rowmeta['class'] == 'unknown').sum()
    }
    for field, entry in dataset_info.items():
        print('    {0}: {1!s}'.format(field, entry), flush=True)

    # normalize features
    print('normalizing features...', flush=True)
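    # note: min-max scaling divides by zero for constant features; the
    # merged generalizable features are assumed non-constant here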
    gene_atb.columnmeta['min'] = gene_atb.matrix.min(0)
    gene_atb.columnmeta['max'] = gene_atb.matrix.max(0)
    gene_atb.matrix = (gene_atb.matrix - gene_atb.columnmeta['min'].reshape(
        1, -1)) / (gene_atb.columnmeta['max'].reshape(1, -1) -
                   gene_atb.columnmeta['min'].reshape(1, -1))

    # prioritize features
    print('prioritizing features by generalizability_pvalues_corrected...',
          flush=True)
    sortedindices = np.argsort(
        gene_atb.columnmeta['generalizability_pvalues_corrected'])
    gene_atb.reorder(sortedindices, axis=1)

    # calculate feature similarity
    print('calculating feature similarity...', flush=True)
    atb_atb = gene_atb.tosimilarity(
        axis=1, metric=dataset_info['feature_similarity_metric'])

    # prioritize feature groups
    print('prioritizing feature groups...', flush=True)
    are_similar_features = np.abs(
        atb_atb.matrix) > dataset_info['feature_similarity_threshold']
    feature_group_size = are_similar_features.sum(1).astype('float64')
    feature_group_score = (np.abs(atb_atb.matrix) *
                           are_similar_features).sum(1) / feature_group_size
    feature_priority = np.zeros(gene_atb.shape[1], dtype='float64')
    feature_priority[gene_atb.columnmeta['dataset_feature'] == 'mean'] = 1.0
    feature_priority[gene_atb.columnmeta['dataset_feature'] == 'stdv'] = 0.5
    feature_infos = list(
        zip(np.arange(gene_atb.shape[1], dtype='int64'),
            gene_atb.columnlabels.copy(), feature_group_size.copy(),
            feature_priority.copy(), feature_group_score.copy()))
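    # list.sort is stable, so the three passes below order features
    # primarily by group size, then priority, then group score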
    feature_infos.sort(key=itemgetter(4), reverse=True)
    feature_infos.sort(key=itemgetter(3), reverse=True)
    feature_infos.sort(key=itemgetter(2), reverse=True)
    #        for feature_info in feature_infos:
    #            print('{0:1.3g}, {1}, {2:1.3g}, {3:1.3g}, {4:1.3g}'.format(feature_info[0], feature_info[1], feature_info[2], feature_info[3], feature_info[4]))
    sorted_feature_indices = np.array(
        [feature_info[0] for feature_info in feature_infos], dtype='int64')
    atb_atb.reorder(sorted_feature_indices, axis=0)
    atb_atb.reorder(sorted_feature_indices, axis=1)
    gene_atb.reorder(sorted_feature_indices, axis=1)
    are_similar_features = are_similar_features[
        sorted_feature_indices, :][:, sorted_feature_indices]

    # group similar features
    print('grouping similar features...', flush=True)
    tobediscarded = np.zeros(gene_atb.shape[1], dtype='bool')
    gene_atb.columnmeta['similar_features'] = np.full(gene_atb.shape[1],
                                                      '',
                                                      dtype='object')
    with open('{0}/{1}_feature_groups.txt'.format(
            results_folder, dataset_info['abbreviation']),
              mode='wt',
              encoding='utf-8',
              errors='surrogateescape') as fw:
        for i, feature in enumerate(gene_atb.columnlabels):
            if ~tobediscarded[i]:
                # find similar features
                print(
                    '    finding features similar to feature "{0}"...'.format(
                        feature),
                    flush=True)
                similarity_hit = are_similar_features[i, :]
                similarity_hit = np.logical_and(
                    similarity_hit, ~tobediscarded)  # just what's new
                similarity_hit[:i] = False
                similar_features = gene_atb.columnlabels[similarity_hit]
                similarity_values = atb_atb.matrix[i, similarity_hit]
                generalizability_pvalues_corrected = gene_atb.columnmeta[
                    'generalizability_pvalues_corrected'][similarity_hit]
                si = np.argsort(generalizability_pvalues_corrected)
                similar_features = similar_features[si]
                similarity_values = similarity_values[si]
                generalizability_pvalues_corrected = generalizability_pvalues_corrected[
                    si]
                print(
                    '        similar_feature, similarity_value, generalizability_pvalue_corrected',
                    flush=True)
                for similar_feature, similarity_value, generalizability_pvalue_corrected in zip(
                        similar_features, similarity_values,
                        generalizability_pvalues_corrected):
                    print('        {0}, {1:1.3g}, {2:1.3g}'.format(
                        similar_feature, similarity_value,
                        generalizability_pvalue_corrected),
                          flush=True)
                # replace feature with best similar feature
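                # similar_features is sorted by corrected generalizability
                # p-value, so similar_features[0] is the strongest group member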
                j = np.where(
                    gene_atb.columnlabels == similar_features[0])[0][0]
                gene_atb.columnmeta['similar_features'][j] = '|'.join(
                    similar_features.tolist())
                print(
                    '        replacing feature "{0}" with best similar feature "{1}"...'
                    .format(feature, gene_atb.columnlabels[j]),
                    flush=True)
                gene_atb.matrix[:, i] = gene_atb.matrix[:, j]
                gene_atb.columnlabels[i] = gene_atb.columnlabels[j]
                for field in gene_atb.columnmeta.keys():
                    gene_atb.columnmeta[field][i] = gene_atb.columnmeta[field][
                        j]
                fw.write('\t'.join([
                    '{0}|{1:1.6g}|{2:1.6g}'.format(f, s, p)
                    for f, s, p in zip(similar_features, similarity_values,
                                       generalizability_pvalues_corrected)
                ]) + '\n')
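                # keep column i as the group representative and flag the
                # remaining group members for discard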
                similarity_hit[i] = False
                tobediscarded = np.logical_or(tobediscarded, similarity_hit)

    # discard features absorbed into group features
    print('discarding features absorbed into group features...', flush=True)
    if tobediscarded.any():
        # discard features
        print('    discarding {0!s} features. {1!s} features remaining...'.
              format(tobediscarded.sum(), (~tobediscarded).sum()),
              flush=True)
        gene_atb.discard(tobediscarded, axis=1)
    else:
        # keep all features
        print('    no features to discard. {0!s} features remaining...'.format(
            gene_atb.shape[1]),
              flush=True)

    # save if dataset has content
    print('saving if dataset has content...', flush=True)
    if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
        # no content
        print('    nothing to save...', flush=True)
    else:
        # save merged nonredundant features
        print('    saving {0!s} merged nonredundant features...'.format(
            gene_atb.shape[1]),
              flush=True)
        dataset_info['nonredundant_genes'] = gene_atb.shape[0]
        dataset_info['nonredundant_features'] = gene_atb.shape[1]
        datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
        datasetIO.append_datasetinfo(
            '{0}/dataset_info.txt'.format(results_folder), dataset_info)

    print('done.', flush=True)
Code example #6
def main():

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/nonredundant_features/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/significant_features'
    if not os.path.exists(results_folder):
        os.mkdir(results_folder)

    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:

        #        # just work with hpatissuesmrna for testing/debugging the pipeline
        #        if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned':
        #            print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True)
        #            continue

        # check if another python instance is already working on this dataset
        if os.path.exists('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(
                dataset_info['abbreviation']),
                  flush=True)
            continue

        # log start of processing
        with open('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']),
                  flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))

        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])

        # specify feature significance test parameters
        print('specifying feature significance test parameters...', flush=True)
        dataset_info[
            'feature_significance_test_function'] = featureselection.univariate_permtest
        dataset_info['feature_significance_test_permutations'] = 100000
        dataset_info[
            'multiple_hypothesis_testing_correction_function'] = featureselection.multiple_hypothesis_testing_correction
        dataset_info[
            'multiple_hypothesis_testing_correction_method'] = 'fdr_by'
        dataset_info['multiple_hypothesis_testing_correction_threshold'] = 0.05
        print('    feature_significance_test_function: {0}'.format(
            dataset_info['feature_significance_test_function']),
              flush=True)
        print('    feature_significance_test_permutations: {0!s}'.format(
            dataset_info['feature_significance_test_permutations']),
              flush=True)
        print('    multiple_hypothesis_testing_correction_function: {0}'.format(
            dataset_info['multiple_hypothesis_testing_correction_function']),
              flush=True)
        print('    multiple_hypothesis_testing_correction_method: {0}'.format(
            dataset_info['multiple_hypothesis_testing_correction_method']),
              flush=True)
        print('    multiple_hypothesis_testing_correction_threshold: {0!s}'.
              format(dataset_info[
                  'multiple_hypothesis_testing_correction_threshold']),
              flush=True)

        # compute feature significance with multiple hypothesis testing correction
        print(
            'computing feature significance with multiple hypothesis testing correction...',
            flush=True)
        isunknown = gene_atb.rowmeta['class'] == 'unknown'
        gene_atb.columnmeta['test_statistic_values'], gene_atb.columnmeta[
            'pvalues'] = dataset_info['feature_significance_test_function'](
                X=gene_atb.matrix[~isunknown, :],
                Y=(gene_atb.rowmeta['class'][~isunknown] == 'positive'),
                numperm=dataset_info['feature_significance_test_permutations'])
        gene_atb.columnmeta['is_significant'], gene_atb.columnmeta[
            'pvalues_corrected'] = dataset_info[
                'multiple_hypothesis_testing_correction_function'](
                    gene_atb.columnmeta['pvalues'],
                    alpha=dataset_info[
                        'multiple_hypothesis_testing_correction_threshold'],
                    method=dataset_info[
                        'multiple_hypothesis_testing_correction_method'])
        gene_atb.columnmeta['correlation_sign'] = np.sign(
            gene_atb.columnmeta['test_statistic_values'])
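        # the smallest nonzero p-value a permutation test can report is
        # 1/numperm; anything below that resolution means too few permutations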
        if (gene_atb.columnmeta['pvalues'] < 1 /
                dataset_info['feature_significance_test_permutations']).any():
            print(
                '    warning: not enough permutations to establish all pvalues...',
                flush=True)
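        # drop features whose p-values are undefined (NaN can arise, e.g.,
        # from zero-variance features)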
        tobediscarded = np.logical_or(
            np.isnan(gene_atb.columnmeta['pvalues']),
            np.isnan(gene_atb.columnmeta['pvalues_corrected']))
        if tobediscarded.any():
            gene_atb.discard(tobediscarded, axis=1)

        # prioritize features
        print('prioritizing features...', flush=True)
        sortedindices = np.argsort(gene_atb.columnmeta['pvalues_corrected'])
        gene_atb.reorder(sortedindices, axis=1)

        # save feature significance info
        print('saving feature significance info...', flush=True)
        with open('{0}/{1}_feature_significance_info.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            writelist = [
                'dataset', 'abbreviation', 'feature', 'test_statistic',
                'pvalue', 'pvalue_corrected', 'is_significant',
                'correlation_sign', 'preferred_rowstat', 'similar_features'
            ]
            fw.write('\t'.join(writelist) + '\n')
            for j, feature in enumerate(gene_atb.columnlabels):
                writelist = [
                    dataset_info['name'], dataset_info['abbreviation'],
                    feature, '{0:1.5g}'.format(
                        gene_atb.columnmeta['test_statistic_values'][j]),
                    '{0:1.5g}'.format(gene_atb.columnmeta['pvalues'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.columnmeta['pvalues_corrected'][j]),
                    '{0:1.5g}'.format(
                        # int() so numpy bools format reliably with a 'g' spec
                        int(gene_atb.columnmeta['is_significant'][j])),
                    '{0:1.5g}'.format(
                        gene_atb.columnmeta['correlation_sign'][j]),
                    gene_atb.columnmeta['preferred_rowstat'][j],
                    gene_atb.columnmeta['similar_features'][j]
                ]
                fw.write('\t'.join(writelist) + '\n')

        # discard features that are not significant
        print('discarding features that are not significant...', flush=True)
        tobediscarded = ~gene_atb.columnmeta['is_significant']
        if tobediscarded.any():
            # discard features
            print('    discarding {0!s} features. {1!s} features remaining...'.
                  format(tobediscarded.sum(), (~tobediscarded).sum()),
                  flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print('    no features to discard. {0!s} features remaining...'.
                  format(gene_atb.shape[1]),
                  flush=True)

        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print('    nothing to save...', flush=True)
        else:
            # save significant features
            print('    saving {0!s} significant features...'.format(
                gene_atb.shape[1]),
                  flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(
                results_folder, dataset_info['abbreviation'])
            dataset_info['significant_genes'] = gene_atb.shape[0]
            dataset_info['significant_features'] = gene_atb.shape[1]
            dataset_info[
                'feature_significance_test_function'] = 'featureselection.univariate_permtest'
            dataset_info[
                'multiple_hypothesis_testing_correction_function'] = 'featureselection.multiple_hypothesis_testing_correction'
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo(
                '{0}/dataset_info.txt'.format(results_folder), dataset_info)

    print('done.', flush=True)
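
Note: neither featureselection.univariate_permtest nor
featureselection.multiple_hypothesis_testing_correction is shown in this
listing. The following is a minimal sketch of what they might look like,
reconstructed only from how they are called above. The
difference-of-class-means test statistic and the seed parameter are
assumptions, and the correction is written as a thin wrapper around
statsmodels.stats.multitest.multipletests rather than the original
implementation.

import numpy as np
from statsmodels.stats.multitest import multipletests


def univariate_permtest(X, Y, numperm=100000, seed=None):
    # hypothetical stand-in for featureselection.univariate_permtest
    # X: samples-by-features matrix, Y: boolean class labels per sample
    rng = np.random.RandomState(seed)
    Y = np.asarray(Y, dtype='bool')
    # observed statistic: difference in class means for each feature (column)
    observed = X[Y, :].mean(0) - X[~Y, :].mean(0)
    exceed_counts = np.zeros(X.shape[1], dtype='float64')
    # a plain loop over 100000 permutations is slow; a vectorized or batched
    # implementation would be preferable in practice
    for _ in range(numperm):
        Yperm = rng.permutation(Y)
        permuted = X[Yperm, :].mean(0) - X[~Yperm, :].mean(0)
        exceed_counts += np.abs(permuted) >= np.abs(observed)
    # a count of zero means the true p-value is below the 1/numperm
    # resolution, which the calling code warns about
    pvalues = exceed_counts / numperm
    return observed, pvalues


def multiple_hypothesis_testing_correction(pvalues, alpha=0.05,
                                           method='fdr_by'):
    # hypothetical stand-in wrapping statsmodels' multipletests;
    # returns the reject mask and the corrected p-values
    is_significant, pvalues_corrected = multipletests(
        pvalues, alpha=alpha, method=method)[:2]
    return is_significant, pvalues_corrected

With this stand-in, the smallest reportable nonzero p-value is 1/numperm,
which is consistent with the warning the calling code prints when any
p-value falls below that resolution.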