Code Example #1
File: corr.py Project: xj-xu/science_flask
def corr_main(params):
    """
    This is the main backend function which performs the following steps:
    - takes the user-specified top n variance features
    - calculates the correlation between these features
    - performs correction for multiple testing with the user-specified method
    - saves results, plots the matrices as heatmaps and saves these figures as well
    """

    # --------------------------------------------------------------------------
    # CALCULATE CORRELATIONS
    # --------------------------------------------------------------------------

    # open first dataset
    path = os.path.join(params['output_folder'], params['dataset1'])
    dataset1, sep = open_file(path)
    n, p = dataset1.shape
    # if there's a 2nd dataset, merge them
    if not params['autocorr']:
        path2 = os.path.join(params['output_folder'], params['dataset2'])
        dataset2, sep2 = open_file(path2)
        merged_datasets_df = dataset1.join(dataset2,
                                           how='inner',
                                           lsuffix='_data1',
                                           rsuffix='_data2')
        X = merged_datasets_df.values
    else:
        merged_datasets_df = dataset1
        X = merged_datasets_df.values

    # standardise X
    X = (X - X.mean(0)) / X.std(0)[np.newaxis, :]

    # calculate Spearman rank correlations and corresponding p-values
    r_vals, p_vals = sp.stats.spearmanr(X)

    # correct for multiple testing
    p_vals, p_mask = check_pvals(p_vals, params)

    # delete correlations that did not pass the multi test correction
    r_vals[~p_mask] = 0
    p_vals[~p_mask] = 1

    # --------------------------------------------------------------------------
    # WRITE RESULTS FOR DATA1, DATA2, DATA1-2
    # --------------------------------------------------------------------------

    params = write_results(params, r_vals[:p, :p], p_vals[:p, :p],
                           (dataset1, dataset1), 'dataset1', True)
    if not params['autocorr']:
        params = write_results(params, r_vals[p:, p:], p_vals[p:, p:],
                               (dataset2, dataset2), 'dataset2', True)
        params = write_results(params, r_vals[:p, p:], p_vals[:p, p:],
                               (dataset1, dataset2), 'dataset1_2')

    # if 'corr_done' is already in params (set to False), one of the writing steps failed
    if 'corr_done' not in params:
        params['corr_done'] = True
    return params
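check_pvals is defined elsewhere in the project, so only its call signature is visible here. Below is a minimal sketch of the kind of correction it plausibly performs, using statsmodels' multipletests as a stand-in; the function name and the (corrected p-values, significance mask) return convention are assumptions that mirror how corr_main uses the result, and the params keys match the ones used in Code Example #9.

from statsmodels.stats.multitest import multipletests


def check_pvals_sketch(p_vals, params):
    """Correct a square p-value matrix for multiple testing (sketch)."""
    flat = p_vals.flatten()
    # e.g. method='fdr_bh' and alpha=0.05, both user supplied
    reject, corrected, _, _ = multipletests(flat,
                                            alpha=params['alpha_val'],
                                            method=params['multi_corr_method'])
    # reshape back to the square form the caller slices later
    return corrected.reshape(p_vals.shape), reject.reshape(p_vals.shape)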
Code Example #2
def bin_genomic_elements(params, annotation, data):
    """
    Bins genomic elements based on the supplied _bins.txt file.

    For every element it requires a chromosome string, plus start and end positions.
    """
    study_folder = params['study_folder']
    bin_file = params['bin_file']
    annot_file = params[annotation]
    # open bin and checked annotation file
    bins = pd.read_table(bin_file)
    annot, sep = open_file(os.path.join(study_folder, annot_file))

    # bin GEs
    # extract start of chromosomes
    chromo_starts = {}
    for c in list(np.where(bins.ChromoStart != 0)[0]):
        chromo_starts[bins.Chromosome[c]] = bins.ChromoStart[c]

    # convert relative start and end positions to absolute
    starts = ([chromo_starts[x] for x in annot.Chromosome] + annot.Start).values
    ends = ([chromo_starts[x] for x in annot.Chromosome] + annot.End).values
    starts_binned = np.digitize(starts, bins.Absolute)
    ends_binned = np.digitize(ends, bins.Absolute)

    # check if any genomic element spans more than one bin
    diff_bin_pos = (ends_binned - starts_binned)

    # elements that span exactly two bins
    in_two_bins = np.where(diff_bin_pos == 1)[0]
    absolute_in_between = (bins.Absolute[ends_binned[in_two_bins] - 1]).values
    larger_in_end = in_two_bins[np.where(
        absolute_in_between - starts[in_two_bins] <
        ends[in_two_bins] - absolute_in_between)[0]]
    # in more than two bins - really rare: we just assign the bin in between
    in_more_than_two_bins = np.where(diff_bin_pos > 1)[0]

    # add the final bin annotation to the annotation file
    starts_binned[larger_in_end] = ends_binned[larger_in_end]
    starts_binned[in_more_than_two_bins] += 1
    annot.insert(3, 'Bin', starts_binned)

    # write binned version of the annotation file
    file_name, file_extension = os.path.splitext(annot_file)
    annot_binned = file_name.replace('checked', 'binned') + file_extension
    annot.to_csv(os.path.join(study_folder, annot_binned), sep=sep)

    # rewrite data files to only contain genomic elements that got binned
    data_file = params[data]
    file_name, file_extension = os.path.splitext(data_file)
    data, sep = open_file(os.path.join(study_folder, data_file))
    data_binned = file_name + '_binned' + file_extension
    # columns of the data file are the rows of the annotation file
    data = data[annot.index]
    data.to_csv(os.path.join(study_folder, data_binned), sep=sep)

    return annot_binned, data_binned
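The boundary handling above hinges on np.digitize: an element whose start and end land in different bins has ends_binned - starts_binned >= 1, which is what diff_bin_pos detects. A toy run with made-up bin edges:

import numpy as np

# hypothetical absolute bin edges (bp) and two genomic elements
bin_edges = np.array([0, 1000, 2000, 3000])
starts = np.array([100, 950])  # the 2nd element starts just before an edge...
ends = np.array([200, 1100])   # ...and ends just after it

starts_binned = np.digitize(starts, bin_edges)  # -> array([1, 1])
ends_binned = np.digitize(ends, bin_edges)      # -> array([1, 2])
print(ends_binned - starts_binned)              # -> [0 1]: 2nd spans two bins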
Code Example #3
def genomic_main(params):
    """
    Wrapper for write_vis_Genomic, which writes the JS files for vis_genomic.
    """

    # DATA 1
    # open necessary files
    path = os.path.join(params['study_folder'], params['annotation1'])
    annot1, _ = open_file(path)
    annot = [annot1, annot1]
    path = os.path.join(params['output_folder'], params['r_dataset1'])
    r, _ = open_file(path)
    path = os.path.join(params['output_folder'], params['p_dataset1'])
    p, _ = open_file(path)

    # write network, table files and variables
    bin2label, row2bin, _ = wn(params, 'dataset1', annot, r, True)
    wt(params, 'dataset1', annot1, r, p, row2bin, bin2label, axis=1)
    wt(params, 'dataset1', annot1, r, p, row2bin, bin2label, axis=0)

    if not params['autocorr']:
        # DATA 2
        path = os.path.join(params['study_folder'], params['annotation2'])
        annot2, _ = open_file(path)
        annot = [annot2, annot2]
        path = os.path.join(params['output_folder'], params['r_dataset2'])
        r, _ = open_file(path)
        path = os.path.join(params['output_folder'], params['p_dataset2'])
        p, _ = open_file(path)

        # write network, table files and variables
        _, row2bin, _ = wn(params, 'dataset2', annot, r, True)
        wt(params, 'dataset2', annot2, r, p, row2bin, bin2label, axis=1)
        wt(params, 'dataset2', annot2, r, p, row2bin, bin2label, axis=0)

        # DATA 1_2
        annot = [annot1, annot2]
        path = os.path.join(params['output_folder'], params['r_dataset1_2'])
        r, _ = open_file(path)
        path = os.path.join(params['output_folder'], params['p_dataset1_2'])
        p, _ = open_file(path)

        # write network, table files and variables
        _, row2bin, col2bin = wn(params, 'dataset1_2', annot, r)
        wt(params, 'dataset1_2', annot1, r, p, row2bin, bin2label, axis=1)
        wt(params, 'dataset1_2', annot2, r, p, col2bin, bin2label, axis=0)
    return params
Code Example #4
def top_variance(params):
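    """
    Keeps only the top variance features of the dataset(s), capping their
    number at top_var_ratio times the number of samples.
    """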
    datasets = ['dataset1']
    if not params['autocorr']:
        datasets.append('dataset2')
    # this determines how many top variance features we include: a ratio of 1
    # means we keep as many features as we have samples
    top_var_ratio = 2
    for dataset in datasets:
        path = os.path.join(params['study_folder'], params[dataset])
        X, sep = open_file(path)
        if int(top_var_ratio * X.shape[0]) < X.shape[1]:
            # keep only the top N = top_var_ratio * X.shape[0] var features
            n_keep = int(top_var_ratio * X.shape[0])
            X = X.iloc[:, np.argsort(X.var().values)[-n_keep:]]
            filename, ext = os.path.splitext(params[dataset])
            params[dataset] = filename + '_topvar' + ext
            path = os.path.join(params['output_folder'], params[dataset])
            X.to_csv(path, sep=sep)
    return params
Code Example #5
File: top_var.py Project: xj-xu/science_flask
def top_variance(params):
    """
    Selects the user-defined number of top features with the highest variance
    from the datasets.
    """
    datasets = ['dataset1']
    feat_num = params['feat_num']
    if not params['autocorr']:
        datasets.append('dataset2')
    for dataset in datasets:
        path = os.path.join(params['study_folder'], params[dataset])
        X, sep = open_file(path)
        # keep only the top N var features
        X = X.iloc[:, np.argsort(X.var().values)[-int(feat_num):]]
        filename, ext = os.path.splitext(params[dataset])
        params[dataset] = filename + '_topvar' + ext
        path = os.path.join(params['output_folder'], params[dataset])
        X.to_csv(path, sep=sep)
    params['fs_done'] = True
    return params
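Both versions rely on the same idiom: rank the per-column variances and keep the positions of the largest ones. A self-contained illustration with a toy DataFrame (column names made up):

import numpy as np
import pandas as pd

X = pd.DataFrame({'a': [1, 1, 1],    # zero variance
                  'b': [1, 2, 3],    # variance 1.0
                  'c': [1, 5, 9]})   # variance 16.0
feat_num = 2

# positions of the feat_num highest-variance columns
top = np.argsort(X.var().values)[-feat_num:]
print(X.iloc[:, top].columns.tolist())  # -> ['b', 'c']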
Code Example #6
File: pairplots.py Project: bkbonde/CorrMapper
def generate_pair_plots(params, rs, p_vals, datasets, p):
    """
    Generates a scatter plot for each pair of correlated variables in the correlation matrix.
    """
    # setup plots
    sns.set_style("whitegrid", {"grid.color": ".95"})

    # make folders in output img folder (this will be zipped)
    output_img_folder = params["output_img_folder"]
    dataset1_folder = os.path.join(output_img_folder, 'dataset1')
    if not os.path.exists(dataset1_folder):
        os.makedirs(dataset1_folder)
    if not params['autocorr']:
        dataset2_folder = os.path.join(output_img_folder, 'dataset2')
        if not os.path.exists(dataset2_folder):
            os.makedirs(dataset2_folder)
        dataset1_2_folder = os.path.join(output_img_folder, 'dataset1_2')
        if not os.path.exists(dataset1_2_folder):
            os.makedirs(dataset1_2_folder)

    # if feature selection was run, open the metadata file - we'll colour each point by it
    if params['fs']:
        metadata_name = params['fs_cols']
        path = os.path.join(params['study_folder'], params['metadata_file'])
        y, _ = open_file(path)
        y = y[metadata_name]
        y_ = y.iloc[1:]
        meta_type = y.iloc[0]

        # find intersecting samples, filter down y and add it to X's end
        ind = datasets.index.intersection(y_.index)
        datasets = datasets.loc[ind]
        y_ = y_.loc[ind].values
        datasets.insert(datasets.shape[1], metadata_name, y_)
    else:
        # no fs, we cannot colour the scatter plots by anything
        meta_type = None

    # heatmap col and row names are truncated in utils.recode_rowcol_names() so
    # we need to truncate them here as well so they match the feature names
    threshold = 12

    # loop through the lower triangular part of the R matrix and plot each pair
    lower_tri_ind = np.tril_indices(rs.shape[1], k=-1)
    for i in range(lower_tri_ind[0].shape[0]):
        y_loc = lower_tri_ind[0][i]
        x_loc = lower_tri_ind[1][i]
        r_val = rs[x_loc, y_loc]
        p_val = p_vals[x_loc, y_loc]
        if r_val != 0:
            x_var = datasets.columns[x_loc]
            y_var = datasets.columns[y_loc]
            suptitle = r"$R$:%s, p-value:%s" % ("{:0.3f}".format(r_val),
                                                "{:0.6f}".format(p_val))
            # categorical metadata variable plots
            if meta_type == "cat":
                g = sns.lmplot(x=x_var,
                               y=y_var,
                               hue=metadata_name,
                               data=datasets,
                               size=4,
                               aspect=1,
                               ci=68,
                               scatter_kws={
                                   'alpha': 0.6,
                                   's': 40
                               },
                               line_kws={
                                   'alpha': 0.5,
                                   'linewidth': 1.5
                               })
                g.fig.suptitle(suptitle)
            # continuous metadata variable plots
            elif meta_type == "con":
                cmap = sns.cubehelix_palette(as_cmap=True)
                g, ax = plt.subplots(figsize=(5, 5))
                points = ax.scatter(datasets[x_var],
                                    datasets[y_var],
                                    c=y_,
                                    cmap=cmap)
                clb = g.colorbar(points)
                clb.ax.set_title(metadata_name)
                plt.title(suptitle, loc='left')
                plt.xlabel(x_var)
                plt.ylabel(y_var)
            # no metadata
            else:
                g = sns.lmplot(x=x_var,
                               y=y_var,
                               data=datasets,
                               size=4,
                               aspect=1)
                g.fig.suptitle(suptitle)

            # find out which correlation sub-matrix the plot belongs to
            if x_loc < p and y_loc < p:
                dataset_folder = dataset1_folder
            elif x_loc >= p and y_loc >= p:
                dataset_folder = dataset2_folder
            else:
                dataset_folder = dataset1_2_folder

            # save image into output and analysis folders too
            filename = x_var[:threshold] + "_" + y_var[:threshold] + ".png"
            plt_output_path = os.path.join(dataset_folder, filename)
            g.savefig(plt_output_path)
            # save it into img_folder with an order-independent filename
            filename = get_png_name(x_var[:threshold], y_var[:threshold])
            plt_img_path = os.path.join(params["img_folder"], filename)
            copy(plt_output_path, plt_img_path)
            # close current plotting window so we don't use too much memory
            plt.close()
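The loop above visits each unordered pair of variables exactly once, because np.tril_indices with k=-1 yields the strictly lower triangle; the first returned array holds row indices and the second column indices, which is why y_loc is unpacked first. For a 3x3 matrix:

import numpy as np

rows, cols = np.tril_indices(3, k=-1)
# rows -> array([1, 2, 2]), cols -> array([0, 0, 1])
# i.e. the visited (y_loc, x_loc) pairs are (1, 0), (2, 0) and (2, 1)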
Code Example #7
def filter_genomic(r, p, params, name, sym=False):
    """
    Filters overlapping and distant correlations in genomic datasets.
    """
    # --------------------------------------------------------------------------
    # OPEN AND FILTER ANNOTATION FILES
    # --------------------------------------------------------------------------

    if name == 'dataset1':
        annot1 = annot2 = params['annotation1']
    elif name == 'dataset2':
        annot1 = annot2 = params['annotation2']
    else:
        annot1 = params['annotation1']
        annot2 = params['annotation2']

    path = os.path.join(params['study_folder'], annot1)
    annot1, _ = open_file(path)
    path = os.path.join(params['study_folder'], annot2)
    annot2, _ = open_file(path)

    annot1 = annot1.loc[r.index]
    annot2 = annot2.loc[r.columns]

    # --------------------------------------------------------------------------
    # OPEN BIN AND CHROMO FILE, BUILD DICTS FROM THEM
    # --------------------------------------------------------------------------

    bin_file = params['bin_file']
    chromo_file = ''.join(bin_file.split('__')[:-1]) + '__chromosomes.txt'
    bins = pd.read_table(bin_file)
    chromo = pd.read_table(chromo_file)
    chr_starts = {}
    chr2num = {c: ci for ci, c in enumerate(chromo.Chromosome)}
    for i in bins.index:
        row = bins.loc[i]
        if row.Chromosome not in chr_starts:
            chr_starts[row.Chromosome] = int(row.ChromoStart)

    # --------------------------------------------------------------------------
    # HELPER FUNCTIONS FOR DISCARD AND CONSTRAIN
    # --------------------------------------------------------------------------

    def check_edge(row, col):
        # get location of source and target GE
        c1, s1, e1 = get_location(annot1, row)
        c2, s2, e2 = get_location(annot2, col)

        # do we need to discard overlapping GEs? True means we're good to go.
        if params['discard_overlap']:
            overlap = overlap_test(c1, c2, s1, s2, e1, e2)
        else:
            overlap = True
        # do we need to restrict corrs to a certain distance? True is good.
        if params['constrain_corr']:
            dist = params['constrain_dist']
            constrain = constrain_test(c1, c2, s1, s2, e1, e2, dist)
        else:
            constrain = True

        if overlap and constrain:
            return True
        else:
            return False

    def get_location(annot, row):
        row = annot.loc[row]
        return row.Chromosome, row.Start, row.End

    def overlap_test(c1, c2, s1, s2, e1, e2):
        """
        Check if two genomic elements overlap. If not return True.
        """
        # if they're on different chromosomes they cannot overlap
        if c1 != c2:
            return True
        else:
            # we will assume that s1 is smaller, otherwise swap them
            if s1 > s2:
                s1, s2 = s2, s1
                e1, e2 = e2, e1
            if e1 > s2:
                return False
            else:
                return True

    def constrain_test(c1, c2, s1, s2, e1, e2, dist):
        """
        Check if two genomic elements are within a distance specified by dist
        in Mbp (megabase pairs). If yes, return True.
        """
        dist = int(dist)
        # if dist == 0 GEs need to be on the same chromosome
        if dist == 0:
            if c1 == c2:
                return True
            else:
                return False
        # otherwise dist is given in Mbp, but the bin files use raw bp numbers
        elif dist > 0:
            dist = dist * 1000000  # 1 Mbp = 1,000,000 bp
            # they are on the same chromosome
            if c1 == c2:
                # we will assume s1 is closer to start of c1, if not swap them
                if s1 > s2:
                    s1, s2 = s2, s1
                    e1, e2 = e2, e1
                if e1 + dist > s2:
                    return True
                else:
                    return False

            # they are on different chromosomes (we need their ranked pos)
            elif chr2num[c1] > chr2num[c2]:
                # we will assume c1 comes earlier in the genome; if not, swap them
                c1, c2 = c2, c1
                s1, s2 = s2, s1
                e1, e2 = e2, e1

            # get absolute location of 1st GE's end and 2nd GE's start
            abs_e1 = chr_starts[c1] + e1
            abs_s2 = chr_starts[c2] + s2
            if abs_e1 + dist > abs_s2:
                return True
            else:
                return False

    # --------------------------------------------------------------------------
    # CHECK EDGES IN R FOR OVERLAPPING AND DISTANT CORRS
    # --------------------------------------------------------------------------

    for ri, row in enumerate(r.index):
        for ci, col in enumerate(r.columns):
            # if sym, only use the upper triangle of the r matrix
            if not sym or ci > ri:
                cell = r.loc[row, col]
                if cell != 0:
                    # if the pair didn't pass the check, zero out its cell
                    if not check_edge(row, col):
                        r.loc[row, col] = 0
                        p.loc[row, col] = 1
                        if sym:
                            r.loc[col, row] = 0
                            p.loc[col, row] = 1
    return r, p
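To make the overlap rule concrete, here is overlap_test restated as a standalone function (argument order rearranged for readability), exercised with a few hypothetical intervals; True means the pair may be kept:

def overlaps_ok(c1, c2, s1, e1, s2, e2):
    """Standalone restatement of overlap_test: True means no overlap."""
    if c1 != c2:     # different chromosomes can never overlap
        return True
    if s1 > s2:      # order the two intervals by start position
        s1, s2, e1, e2 = s2, s1, e2, e1
    return e1 <= s2  # keep only if the 1st ends before the 2nd starts

print(overlaps_ok('chr1', 'chr1', 100, 200, 150, 300))  # False: they overlap
print(overlaps_ok('chr1', 'chr1', 100, 200, 250, 300))  # True: disjoint
print(overlaps_ok('chr1', 'chr2', 100, 200, 150, 300))  # True: different chr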
Code Example #8
def do_fs(params):
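    """
    Performs feature selection on the dataset(s) with the user-chosen method,
    using the metadata column named in params['fs_cols'] as the target.
    """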
    # setup basic vars for feature selection
    datasets = ['dataset1']
    if not params['autocorr']:
        datasets.append('dataset2')
    dataset_names = []
    dataset_num = []
    metadata_names = []
    all_selected = []
    le = LabelEncoder()

    # open metadata file, define scaler, fs method
    path = os.path.join(params['study_folder'], params['metadata_file'])
    y, _ = open_file(path)
    y = y[params['fs_cols']]
    ss = StandardScaler()
    method = params['fs_method']

    # perform fs on dataset(s)
    for dataset in datasets:
        path = os.path.join(params['output_folder'], params[dataset])
        X, sep = open_file(path)

        # drop NaNs in metadata column
        y_ = y.iloc[1:].dropna()
        meta_type = y.iloc[0]

        # find intersecting samples and filter down X and y and call fs_()
        ind = X.index.intersection(y_.index)
        X_ = X.loc[ind]
        y_ = y_.loc[ind].values.reshape(-1, 1)

        # scale X to 0 mean and unit variance
        X_ = ss.fit_transform(X_.values)

        if meta_type == 'cat':
            # encode categorical values as numbers
            y_ = le.fit_transform(y_)
            selected = fs_categorical(X_, y_, method)
        else:
            # scale y to 0 mean and unit variance
            y_ = ss.fit_transform(y_)
            # override Boruta and JMI with L1 if continuous
            if method in ['Boruta', 'JMI']:
                selected = fs_continuous(X_, y_, 'L1')
            else:
                selected = fs_continuous(X_, y_, method)

        # we need more than 5 selected features per dataset
        if selected is None:
            selected = []
        if len(selected) <= 5:
            params['fs_done'] = False
            return params

        # saving filtered X into output folder
        filename, ext = os.path.splitext(params[dataset])
        params[dataset] = filename.replace('topvar', 'fs') + ext
        X_sel = X.iloc[:, selected]
        X_sel.to_csv(os.path.join(params['output_folder'], params[dataset]),
                     sep=sep)

        # saving results for selected_features.csv
        dataset_names.append(params[dataset])
        dataset_num.append(dataset)
        metadata_names.append(params['fs_cols'])
        all_selected.append('|'.join(map(str, np.array(selected))))

    # writing selected_features.csv
    results = zip(dataset_names, dataset_num, metadata_names, all_selected)
    cols = ['dataset_name', 'dataset_num', 'metadata_name', 'selected']
    selected_file = os.path.join(params['output_folder'],
                                 'selected_features.csv')
    pd.DataFrame(results, columns=cols).to_csv(selected_file)
    params['fs_done'] = True
    return params
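fs_categorical and fs_continuous are project internals that are not shown on this page. As a rough sketch of the L1 branch of fs_continuous, assuming it returns the integer indices of the selected columns, scikit-learn's Lasso wrapped in SelectFromModel would look like this (the alpha value is a placeholder):

import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso


def fs_continuous_l1_sketch(X, y, alpha=0.01):
    """Hypothetical L1 selection: indices of features with non-zero coefs."""
    selector = SelectFromModel(Lasso(alpha=alpha)).fit(X, y.ravel())
    return np.where(selector.get_support())[0]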
Code Example #9
def corr_main(params):
    """
    This is the main function which performs the following steps:
    - open dataset(s), load selected features, merge datasets
    - perform GLASSO with the huge R package
    - calculate permuted p-values with GPD approximation in parallel
    - correct for multiple testing
    - save r and p value matrices for users
    - save networks from r values for users
    - write variables and datasets for visualisation in JS
    """

    # --------------------------------------------------------------------------
    # CALCULATE GRAPHLASSO AND PERMUTED P-VALS
    # --------------------------------------------------------------------------

    # open first dataset
    path = os.path.join(params['output_folder'], params['dataset1'])
    dataset1, sep = open_file(path)
    n, p = dataset1.shape
    # if there's a 2nd dataset, merge them
    if not params['autocorr']:
        path2 = os.path.join(params['output_folder'], params['dataset2'])
        dataset2, sep2 = open_file(path2)
        # if two features have the same name we need suffixes
        merged_datasets_df = dataset1.join(dataset2, how='inner',
                                           lsuffix='_data1', rsuffix='_data2')
        X = merged_datasets_df.values
    else:
        merged_datasets_df = dataset1
        X = merged_datasets_df.values

    # standardise X
    ss = StandardScaler()
    X = ss.fit_transform(X)

    # perform GLASSO with huge in R
    lambda_threshold = params['lambda_val']
    cov, prec = hugeR.hugeR(X, lambda_threshold)

    # create column ranked X for corr_permutation
    rX = bn.rankdata(X, axis=0)

    # get GPD approximated p-values
    perm_num = 10000
    rs, p_vals, p_mask = cp.gpd_spearman(rX, perm_num=perm_num, prec=prec,
                                         mc_method=params['multi_corr_method'],
                                         mc_alpha=params['alpha_val'])

    # delete correlations that did not pass the multi test correction
    rs[~p_mask] = 0
    p_vals[~p_mask] = 1

    # --------------------------------------------------------------------------
    # CHECK IF GENOMIC FILTERING IS NEEDED
    # --------------------------------------------------------------------------

    # if fs, load metadata column for fold_change calculation later
    if params['fs']:
        path = os.path.join(params['study_folder'], params['metadata_file'])
        y, _ = open_file(path)
        y = y[params['fs_cols']].iloc[1:].dropna()
    else:
        y = None

    # if genomic, check if filtering overlapping and distant corrs needed
    discard_or_constrain = params['discard_overlap'] or params['constrain_corr']
    if params['annotation'] and discard_or_constrain:
        genomic = True
    else:
        genomic = False

    # --------------------------------------------------------------------------
    # GENERATE PAIRWISE PLOTS FOR DATA1, DATA2, DATA1-2
    # --------------------------------------------------------------------------

    generate_pair_plots(params, rs, p_vals, merged_datasets_df, p)

    # --------------------------------------------------------------------------
    # WRITE RESULTS FOR DATA1, DATA2, DATA1-2
    # --------------------------------------------------------------------------

    params = write_results(params, rs[:p, :p], p_vals[:p, :p], genomic,
                           (dataset1, dataset1), 'dataset1', y, True)
    if not params['autocorr']:
        params = write_results(params, rs[p:, p:], p_vals[p:, p:], genomic,
                               (dataset2, dataset2), 'dataset2', y, True)
        params = write_results(params, rs[:p, p:], p_vals[:p, p:], genomic,
                               (dataset1, dataset2), 'dataset1_2', y)

    # if 'corr_done' is already in params (set to False), one of the writing steps failed
    if 'corr_done' not in params:
        params['corr_done'] = True
    return params
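hugeR.hugeR is the project's bridge to the huge R package, so it cannot be reproduced here. As a Python-only stand-in (a different library, not the author's method), scikit-learn's GraphicalLasso estimates the same pair of sparse covariance and precision matrices:

from sklearn.covariance import GraphicalLasso


def glasso_sketch(X, lambda_threshold):
    """Rough stand-in for cov, prec = hugeR.hugeR(X, lambda_threshold)."""
    model = GraphicalLasso(alpha=lambda_threshold).fit(X)
    return model.covariance_, model.precision_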