def corr_main(params):
    """
    This is the main backend function which performs the following steps:
      - takes the user specified, top n variance features
      - calculates the correlation between these features
      - performs correction for multiple testing with the user specified method
      - saves results, plots matrices as heatmaps and saves these figures as well
    """
    # --------------------------------------------------------------------------
    # CALCULATE CORRELATIONS
    # --------------------------------------------------------------------------

    # open first dataset
    path = os.path.join(params['output_folder'], params['dataset1'])
    dataset1, sep = open_file(path)
    n, p = dataset1.shape

    # if there's a 2nd dataset, merge them
    if not params['autocorr']:
        path2 = os.path.join(params['output_folder'], params['dataset2'])
        dataset2, sep2 = open_file(path2)
        merged_datasets_df = dataset1.join(dataset2, how='inner',
                                           lsuffix='_data1', rsuffix='_data2')
        X = merged_datasets_df.values
    else:
        merged_datasets_df = dataset1
        X = merged_datasets_df.values

    # standardise X
    X = (X - X.mean(0)) / X.std(0)[np.newaxis, :]

    # calculate Spearman rank correlations and corresponding p-values
    r_vals, p_vals = sp.stats.spearmanr(X)

    # correct for multiple testing
    p_vals, p_mask = check_pvals(p_vals, params)

    # delete correlations that did not pass the multiple testing correction
    r_vals[~p_mask] = 0
    p_vals[~p_mask] = 1

    # --------------------------------------------------------------------------
    # WRITE RESULTS FOR DATA1, DATA2, DATA1-2
    # --------------------------------------------------------------------------

    params = write_results(params, r_vals[:p, :p], p_vals[:p, :p],
                           (dataset1, dataset1), 'dataset1', True)
    if not params['autocorr']:
        params = write_results(params, r_vals[p:, p:], p_vals[p:, p:],
                               (dataset2, dataset2), 'dataset2', True)
        params = write_results(params, r_vals[:p, p:], p_vals[:p, p:],
                               (dataset1, dataset2), 'dataset1_2')

    # if 'corr_done' is already in params it was set to False by a failed
    # writing step
    if 'corr_done' not in params:
        params['corr_done'] = True
    return params
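# ------------------------------------------------------------------------------
# NOTE: check_pvals() is defined elsewhere in the repo. Below is a minimal
# sketch of what such a multiple-testing correction step could look like; it
# assumes the params keys 'multi_corr_method' and 'alpha_val' (as used by the
# GLASSO pipeline further down) and statsmodels' multipletests. Illustration
# only, not the repo's actual implementation.
# ------------------------------------------------------------------------------
import numpy as np
from statsmodels.stats.multitest import multipletests


def check_pvals_sketch(p_vals, params):
    """
    Correct a square, symmetric matrix of p-values for multiple testing and
    return the corrected matrix plus a boolean mask of the tests that passed.
    """
    p = np.asarray(p_vals, dtype=float)
    n = p.shape[0]
    # only the upper triangle holds unique tests in a symmetric matrix
    iu = np.triu_indices(n, k=1)
    reject, p_corr, _, _ = multipletests(p[iu],
                                         alpha=float(params['alpha_val']),
                                         method=params['multi_corr_method'])
    # write the corrected values and the pass/fail mask back symmetrically
    p_out = np.ones_like(p)
    mask = np.zeros_like(p, dtype=bool)
    p_out[iu] = p_corr
    p_out.T[iu] = p_corr
    mask[iu] = reject
    mask.T[iu] = reject
    return p_out, mask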
def bin_genomic_elements(params, annotation, data):
    """
    Bins genomic elements based on the supplied _bins.txt file. For every
    element it requires a chromosome str, and start and end pos.
    """
    study_folder = params['study_folder']
    bin_file = params['bin_file']
    annot_file = params[annotation]

    # open bin and checked annotation file
    bins = pd.read_table(bin_file)
    annot, sep = open_file(os.path.join(study_folder, annot_file))

    # bin GEs
    # extract start of chromosomes
    chromo_starts = {}
    for c in list(np.where(bins.ChromoStart != 0)[0]):
        chromo_starts[bins.Chromosome[c]] = bins.ChromoStart[c]

    # convert relative start and end positions to absolute
    starts = ([chromo_starts[x] for x in annot.Chromosome] + annot.Start).values
    ends = ([chromo_starts[x] for x in annot.Chromosome] + annot.End).values
    starts_binned = np.digitize(starts, bins.Absolute)
    ends_binned = np.digitize(ends, bins.Absolute)

    # check if any genomic element spans more than one bin
    diff_bin_pos = ends_binned - starts_binned

    # elements spanning exactly two bins: assign the bin that holds the larger
    # part of the element
    in_two_bins = np.where(diff_bin_pos == 1)[0]
    absolute_in_between = (bins.Absolute[ends_binned[in_two_bins] - 1]).values
    larger_in_end = in_two_bins[np.where(
        absolute_in_between - starts[in_two_bins] <
        ends[in_two_bins] - absolute_in_between)[0]]

    # elements spanning more than two bins - really rare: we just assign the
    # bin in between
    in_more_than_two_bins = np.where(diff_bin_pos > 1)[0]

    # add final bin annotation to the annotation file
    starts_binned[larger_in_end] = ends_binned[larger_in_end]
    starts_binned[in_more_than_two_bins] += 1
    annot.insert(3, 'Bin', starts_binned)

    # write binned version of the annotation file
    file_name, file_extension = os.path.splitext(annot_file)
    annot_binned = file_name.replace('checked', 'binned') + file_extension
    annot.to_csv(os.path.join(study_folder, annot_binned), sep=sep)

    # rewrite data files to only contain genomic elements that got binned
    data_file = params[data]
    file_name, file_extension = os.path.splitext(data_file)
    data, sep = open_file(os.path.join(study_folder, data_file))
    data_binned = file_name + '_binned' + file_extension
    # columns of the data file are the rows of the annotation file
    data = data[annot.index]
    data.to_csv(os.path.join(study_folder, data_binned), sep=sep)
    return annot_binned, data_binned
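# ------------------------------------------------------------------------------
# Illustration (not part of the pipeline): how np.digitize maps absolute genomic
# positions onto the bin edges stored in the bins file's 'Absolute' column. The
# numbers below are made up purely for demonstration.
# ------------------------------------------------------------------------------
import numpy as np

bin_edges = np.array([0, 1000, 2000, 3000])    # hypothetical bins.Absolute
positions = np.array([150, 999, 1000, 2500])   # hypothetical absolute starts
print(np.digitize(positions, bin_edges))       # -> [1 1 2 3]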
def genomic_main(params):
    """
    Wrapper for write_vis_Genomic which writes the JS files for the genomic
    visualisation (vis_genomic).
    """
    # DATA 1
    # open necessary files
    path = os.path.join(params['study_folder'], params['annotation1'])
    annot1, _ = open_file(path)
    annot = [annot1, annot1]
    path = os.path.join(params['output_folder'], params['r_dataset1'])
    r, _ = open_file(path)
    path = os.path.join(params['output_folder'], params['p_dataset1'])
    p, _ = open_file(path)

    # write network, table files and variables
    bin2label, row2bin, _ = wn(params, 'dataset1', annot, r, True)
    wt(params, 'dataset1', annot1, r, p, row2bin, bin2label, axis=1)
    wt(params, 'dataset1', annot1, r, p, row2bin, bin2label, axis=0)

    if not params['autocorr']:
        # DATA 2
        path = os.path.join(params['study_folder'], params['annotation2'])
        annot2, _ = open_file(path)
        annot = [annot2, annot2]
        path = os.path.join(params['output_folder'], params['r_dataset2'])
        r, _ = open_file(path)
        path = os.path.join(params['output_folder'], params['p_dataset2'])
        p, _ = open_file(path)

        # write network, table files and variables
        _, row2bin, _ = wn(params, 'dataset2', annot, r, True)
        wt(params, 'dataset2', annot2, r, p, row2bin, bin2label, axis=1)
        wt(params, 'dataset2', annot2, r, p, row2bin, bin2label, axis=0)

        # DATA 1_2
        annot = [annot1, annot2]
        path = os.path.join(params['output_folder'], params['r_dataset1_2'])
        r, _ = open_file(path)
        path = os.path.join(params['output_folder'], params['p_dataset1_2'])
        p, _ = open_file(path)

        # write network, table files and variables
        _, row2bin, col2bin = wn(params, 'dataset1_2', annot, r)
        wt(params, 'dataset1_2', annot1, r, p, row2bin, bin2label, axis=1)
        wt(params, 'dataset1_2', annot2, r, p, col2bin, bin2label, axis=0)
    return params
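# ------------------------------------------------------------------------------
# NOTE: open_file() is a shared utility defined elsewhere in the repo; every
# function here relies on it returning an indexed DataFrame together with the
# separator of the file. A minimal sketch of such a loader (guessing tab vs
# comma from the header line) is shown below, purely as an illustration.
# ------------------------------------------------------------------------------
import pandas as pd


def open_file_sketch(path):
    """Load a delimited text file, guessing its separator; return (df, sep)."""
    with open(path) as f:
        header = f.readline()
    sep = '\t' if header.count('\t') >= header.count(',') else ','
    df = pd.read_csv(path, sep=sep, index_col=0)
    return df, sep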
def top_variance(params):
    datasets = ['dataset1']
    if not params['autocorr']:
        datasets.append('dataset2')

    # this determines how many top variance features we include: a ratio of 1
    # means we keep as many features as we have samples
    top_var_ratio = 2
    for dataset in datasets:
        path = os.path.join(params['study_folder'], params[dataset])
        X, sep = open_file(path)
        if int(top_var_ratio * X.shape[0]) < X.shape[1]:
            # keep only the top N = top_var_ratio * X.shape[0] variance
            # features (features are columns, selected by position)
            top_n = int(top_var_ratio * X.shape[0])
            X = X.iloc[:, np.argsort(X.var().values)[-top_n:]]
        filename, ext = os.path.splitext(params[dataset])
        params[dataset] = filename + '_topvar' + ext
        path = os.path.join(params['output_folder'], params[dataset])
        X.to_csv(path, sep=sep)
    return params
def top_variance(params):
    """
    Selects the user defined number of top features with the highest variance
    from the datasets.
    """
    datasets = ['dataset1']
    feat_num = params['feat_num']
    if not params['autocorr']:
        datasets.append('dataset2')

    for dataset in datasets:
        path = os.path.join(params['study_folder'], params[dataset])
        X, sep = open_file(path)
        # keep only the top feat_num features by variance (features are
        # columns, selected by position)
        X = X.iloc[:, np.argsort(X.var().values)[-int(feat_num):]]
        filename, ext = os.path.splitext(params[dataset])
        params[dataset] = filename + '_topvar' + ext
        path = os.path.join(params['output_folder'], params[dataset])
        X.to_csv(path, sep=sep)
    params['fs_done'] = True
    return params
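# ------------------------------------------------------------------------------
# Illustration (not part of the pipeline): an equivalent, arguably more readable
# way to pick the k highest-variance feature columns with pandas. Toy data only.
# ------------------------------------------------------------------------------
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(20, 5),
                  columns=['f1', 'f2', 'f3', 'f4', 'f5'])
k = 3
top_cols = df.var().nlargest(k).index   # labels of the k most variable columns
df_top = df[top_cols]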
def generate_pair_plots(params, rs, p_vals, datasets, p):
    """
    Generates a scatter plot for each correlated pair of variables in a
    correlation matrix.
    """
    # setup plots
    sns.set_style("whitegrid", {"grid.color": ".95"})

    # make folders in output img folder (this will be zipped)
    output_img_folder = params["output_img_folder"]
    dataset1_folder = os.path.join(output_img_folder, 'dataset1')
    if not os.path.exists(dataset1_folder):
        os.makedirs(dataset1_folder)
    if not params['autocorr']:
        dataset2_folder = os.path.join(output_img_folder, 'dataset2')
        if not os.path.exists(dataset2_folder):
            os.makedirs(dataset2_folder)
        dataset1_2_folder = os.path.join(output_img_folder, 'dataset1_2')
        if not os.path.exists(dataset1_2_folder):
            os.makedirs(dataset1_2_folder)

    # if fs, open metadata file - we'll colour each point by it
    if params['fs']:
        metadata_name = params['fs_cols']
        path = os.path.join(params['study_folder'], params['metadata_file'])
        y, _ = open_file(path)
        y = y[metadata_name]
        y_ = y.iloc[1:]
        meta_type = y.iloc[0]
        # find intersecting samples, filter down y and add it to X's end
        ind = datasets.index.intersection(y_.index)
        datasets = datasets.loc[ind]
        y_ = y_.loc[ind].values
        datasets.insert(datasets.shape[1], metadata_name, y_)
    else:
        # no fs, we cannot colour the scatter plots by anything
        meta_type = None

    # heatmap col and row names are truncated in utils.recode_rowcol_names() so
    # we need to truncate them here as well so they match the feature names
    threshold = 12

    # loop through the lower triangular part of the R matrix and plot each pair
    lower_tri_ind = np.tril_indices(rs.shape[1], k=-1)
    for i in xrange(lower_tri_ind[0].shape[0]):
        y_loc = lower_tri_ind[0][i]
        x_loc = lower_tri_ind[1][i]
        r_val = rs[x_loc, y_loc]
        p_val = p_vals[x_loc, y_loc]
        if r_val != 0:
            x_var = datasets.columns[x_loc]
            y_var = datasets.columns[y_loc]
            suptitle = r"$R^2$:%s, p-value:%s" % ("{:0.3f}".format(r_val),
                                                  "{:0.6f}".format(p_val))
            # categorical metadata variable plots
            if meta_type == "cat":
                g = sns.lmplot(x=x_var, y=y_var, hue=metadata_name,
                               data=datasets, size=4, aspect=1, ci=68,
                               scatter_kws={'alpha': 0.6, 's': 40},
                               line_kws={'alpha': 0.5, 'linewidth': 1.5})
                g.fig.suptitle(suptitle)
            # continuous metadata variable plots
            elif meta_type == "con":
                cmap = sns.cubehelix_palette(as_cmap=True)
                g, ax = plt.subplots(figsize=(5, 5))
                points = ax.scatter(datasets[x_var], datasets[y_var], c=y_,
                                    cmap=cmap)
                clb = g.colorbar(points)
                clb.ax.set_title(metadata_name)
                plt.title(suptitle, loc='left')
                plt.xlabel(x_var)
                plt.ylabel(y_var)
            # no metadata
            else:
                g = sns.lmplot(x=x_var, y=y_var, data=datasets, size=4,
                               aspect=1)
                g.fig.suptitle(suptitle)

            # find out which correlation sub-matrix the plot belongs to
            if x_loc < p and y_loc < p:
                dataset_folder = dataset1_folder
            elif x_loc >= p and y_loc >= p:
                dataset_folder = dataset2_folder
            else:
                dataset_folder = dataset1_2_folder

            # save image into output and analysis folders too
            filename = x_var[:threshold] + "_" + y_var[:threshold] + ".png"
            plt_output_path = os.path.join(dataset_folder, filename)
            g.savefig(plt_output_path)

            # save it into img_folder with commutative name
            filename = get_png_name(x_var[:threshold], y_var[:threshold])
            plt_img_path = os.path.join(params["img_folder"], filename)
            copy(plt_output_path, plt_img_path)

            # close current plotting window so we don't use too much memory
            plt.close()
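# ------------------------------------------------------------------------------
# NOTE: get_png_name() is defined elsewhere in the repo. A minimal sketch of a
# "commutative" file name helper is shown below: it returns the same file name
# regardless of the order in which the two (already truncated) variable names
# are passed. Illustration only, not necessarily the repo's implementation.
# ------------------------------------------------------------------------------
def get_png_name_sketch(var1, var2):
    """Build an order-independent .png file name for a variable pair."""
    first, second = sorted([var1, var2])
    return first + "_" + second + ".png"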
def filter_genomic(r, p, params, name, sym=False):
    """
    Filters overlapping and distant correlations in genomic datasets.
    """
    # --------------------------------------------------------------------------
    # OPEN AND FILTER ANNOTATION FILES
    # --------------------------------------------------------------------------

    if name == 'dataset1':
        annot1 = annot2 = params['annotation1']
    elif name == 'dataset2':
        annot1 = annot2 = params['annotation2']
    else:
        annot1 = params['annotation1']
        annot2 = params['annotation2']
    path = os.path.join(params['study_folder'], annot1)
    annot1, _ = open_file(path)
    path = os.path.join(params['study_folder'], annot2)
    annot2, _ = open_file(path)
    annot1 = annot1.loc[r.index]
    annot2 = annot2.loc[r.columns]

    # --------------------------------------------------------------------------
    # OPEN BIN AND CHROMO FILE, BUILD DICTS FROM THEM
    # --------------------------------------------------------------------------

    bin_file = params['bin_file']
    chromo_file = ''.join(bin_file.split('__')[:-1]) + '__chromosomes.txt'
    bins = pd.read_table(bin_file)
    chromo = pd.read_table(chromo_file)
    chr_starts = {}
    chr2num = {c: ci for ci, c in enumerate(chromo.Chromosome)}
    for i in bins.index:
        row = bins.loc[i]
        if row.Chromosome not in chr_starts:
            chr_starts[row.Chromosome] = int(row.ChromoStart)

    # --------------------------------------------------------------------------
    # HELPER FUNCTIONS FOR DISCARD AND CONSTRAIN
    # --------------------------------------------------------------------------

    def check_edge(row, col):
        # get location of source and target GE
        c1, s1, e1 = get_location(annot1, row)
        c2, s2, e2 = get_location(annot2, col)

        # do we need to discard overlapping GEs? True means we're good to go.
        if params['discard_overlap']:
            overlap = overlap_test(c1, c2, s1, s2, e1, e2)
        else:
            overlap = True

        # do we need to restrict corrs to a certain distance? True is good.
        if params['constrain_corr']:
            dist = params['constrain_dist']
            constrain = constrain_test(c1, c2, s1, s2, e1, e2, dist)
        else:
            constrain = True

        if overlap and constrain:
            return True
        else:
            return False

    def get_location(annot, row):
        row = annot.loc[row]
        return row.Chromosome, row.Start, row.End

    def overlap_test(c1, c2, s1, s2, e1, e2):
        """
        Check if two genomic elements overlap. If not, return True.
        """
        # if they're on different chromosomes they cannot overlap
        if c1 != c2:
            return True
        else:
            # we will assume that s1 is smaller, otherwise swap them
            if s1 > s2:
                s1, s2 = s2, s1
                e1, e2 = e2, e1
            if e1 > s2:
                return False
            else:
                return True

    def constrain_test(c1, c2, s1, s2, e1, e2, dist):
        """
        Check if two genomic elements are within a distance specified by dist
        in Mbps. If yes, return True.
        """
        dist = int(dist)
        # if dist == 0 the GEs need to be on the same chromosome
        if dist == 0:
            if c1 == c2:
                return True
            else:
                return False
        # otherwise dist is provided in Mbps but bin files use raw bp nums
        elif dist > 0:
            dist = int(dist * 10000000)
            # they are on the same chromosome
            if c1 == c2:
                # we will assume s1 is closer to start of c1, if not swap them
                if s1 > s2:
                    s1, s2 = s2, s1
                    e1, e2 = e2, e1
                if e1 + dist > s2:
                    return True
                else:
                    return False
            # they are on different chromosomes (we need their ranked pos)
            else:
                if chr2num[c1] > chr2num[c2]:
                    # we will assume c1 is closer to chr1, if not swap them
                    c1, c2 = c2, c1
                    s1, s2 = s2, s1
                    e1, e2 = e2, e1
                # get absolute location of 1st GE's end and 2nd GE's start
                abs_e1 = chr_starts[c1] + e1
                abs_s2 = chr_starts[c2] + s2
                if abs_e1 + dist > abs_s2:
                    return True
                else:
                    return False

    # --------------------------------------------------------------------------
    # CHECK EDGES IN R FOR OVERLAPPING AND DISTANT CORRS
    # --------------------------------------------------------------------------

    for ri, row in enumerate(r.index):
        for ci, col in enumerate(r.columns):
            # if sym, only use the upper triangle of the r matrix
            if not sym or (sym and ci > ri):
                cell = r[col].loc[row]
                if cell != 0:
                    # if the edge didn't pass the check, set its cell to 0
                    if not check_edge(row, col):
                        r[col].loc[row] = 0
                        p[col].loc[row] = 1
                        if sym:
                            r[row].loc[col] = 0
                            p[row].loc[col] = 1
    return r, p
def do_fs(params):
    # setup basic vars for feature selection
    datasets = ['dataset1']
    if not params['autocorr']:
        datasets.append('dataset2')
    dataset_names = []
    dataset_num = []
    metadata_names = []
    all_selected = []
    le = LabelEncoder()

    # open metadata file, define scaler, fs method
    path = os.path.join(params['study_folder'], params['metadata_file'])
    y, _ = open_file(path)
    y = y[params['fs_cols']]
    ss = StandardScaler()
    method = params['fs_method']

    # perform fs on dataset(s)
    for dataset in datasets:
        path = os.path.join(params['output_folder'], params[dataset])
        X, sep = open_file(path)

        # drop NaNs in metadata column
        y_ = y.iloc[1:].dropna()
        meta_type = y.iloc[0]

        # find intersecting samples, filter down X and y and call fs_()
        ind = X.index.intersection(y_.index)
        X_ = X.loc[ind]
        y_ = y_.loc[ind].values.reshape(-1, 1)

        # scale X to 0 mean and unit variance
        X_ = ss.fit_transform(X_.values)
        if meta_type == 'cat':
            # encode categorical values as numbers
            y_ = le.fit_transform(y_)
            selected = fs_categorical(X_, y_, method)
        else:
            # scale y to 0 mean and unit variance
            y_ = ss.fit_transform(y_)
            # override Boruta and JMI with L1 if continuous
            if method in ['Boruta', 'JMI']:
                selected = fs_continuous(X_, y_, 'L1')
            else:
                selected = fs_continuous(X_, y_, method)

        # we need more than 5 selected features per dataset
        if selected is None:
            selected = []
        if len(selected) <= 5:
            params['fs_done'] = False
            return params

        # saving filtered X into output folder
        filename, ext = os.path.splitext(params[dataset])
        params[dataset] = filename.replace('topvar', 'fs') + ext
        X_sel = X.iloc[:, selected]
        X_sel.to_csv(os.path.join(params['output_folder'], params[dataset]),
                     sep=sep)

        # saving results for selected_features.csv
        dataset_names.append(params[dataset])
        dataset_num.append(dataset)
        metadata_names.append(params['fs_cols'])
        all_selected.append('|'.join(map(str, np.array(selected))))

    # writing selected_features.csv
    results = zip(dataset_names, dataset_num, metadata_names, all_selected)
    cols = ['dataset_name', 'dataset_num', 'metadata_name', 'selected']
    selected_file = os.path.join(params['output_folder'],
                                 'selected_features.csv')
    pd.DataFrame(results, columns=cols).to_csv(selected_file)
    params['fs_done'] = True
    return params
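# ------------------------------------------------------------------------------
# NOTE: fs_categorical() and fs_continuous() live elsewhere in the repo. As an
# illustration of the 'L1' path for a continuous metadata variable, a sparse
# linear model can be used to keep only features with non-zero coefficients.
# This is a sketch with assumed defaults (scikit-learn's LassoCV), not the
# repo's actual code.
# ------------------------------------------------------------------------------
import numpy as np
from sklearn.linear_model import LassoCV


def fs_continuous_l1_sketch(X, y):
    """
    Fit an L1-penalised linear model and return the column indices of the
    features whose coefficients survived the penalty.
    """
    model = LassoCV(cv=5).fit(X, y.ravel())
    return np.where(model.coef_ != 0)[0]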
def corr_main(params):
    """
    This is the main function which performs the following steps:
      - opens dataset(s), loads selected features, merges datasets
      - performs GLASSO with the huge R package
      - calculates permuted p-values with GPD approximation in parallel
      - corrects for multiple testing
      - saves r and p value matrices for users
      - saves networks built from the r values for users
      - writes variables and datasets for visualisation in JS
    """
    # --------------------------------------------------------------------------
    # CALCULATE GRAPHLASSO AND PERMUTED P-VALS
    # --------------------------------------------------------------------------

    # open first dataset
    path = os.path.join(params['output_folder'], params['dataset1'])
    dataset1, sep = open_file(path)
    n, p = dataset1.shape

    # if there's a 2nd dataset, merge them
    if not params['autocorr']:
        path2 = os.path.join(params['output_folder'], params['dataset2'])
        dataset2, sep2 = open_file(path2)
        # if two features have the same name we need prefixes
        merged_datasets_df = dataset1.join(dataset2, how='inner',
                                           lsuffix='_data1', rsuffix='_data2')
        X = merged_datasets_df.values
    else:
        merged_datasets_df = dataset1
        X = merged_datasets_df.values

    # standardise X
    ss = StandardScaler()
    X = ss.fit_transform(X)

    # perform GLASSO with huge in R
    lambda_threshold = params['lambda_val']
    cov, prec = hugeR.hugeR(X, lambda_threshold)

    # create column ranked X for corr_permutation
    rX = bn.rankdata(X, axis=0)

    # get GPD approximated p-values
    perm_num = 10000
    rs, p_vals, p_mask = cp.gpd_spearman(rX, perm_num=perm_num, prec=prec,
                                         mc_method=params['multi_corr_method'],
                                         mc_alpha=params['alpha_val'])

    # delete correlations that did not pass the multiple testing correction
    rs[~p_mask] = 0
    p_vals[~p_mask] = 1

    # --------------------------------------------------------------------------
    # CHECK IF GENOMIC FILTERING IS NEEDED
    # --------------------------------------------------------------------------

    # if fs, load metadata column for fold_change calculation later
    if params['fs']:
        path = os.path.join(params['study_folder'], params['metadata_file'])
        y, _ = open_file(path)
        y = y[params['fs_cols']].iloc[1:].dropna()
    else:
        y = None

    # if genomic, check whether filtering of overlapping and distant corrs is
    # needed
    discard_or_constrain = params['discard_overlap'] or params['constrain_corr']
    if params['annotation'] and discard_or_constrain:
        genomic = True
    else:
        genomic = False

    # --------------------------------------------------------------------------
    # GENERATE PAIRWISE PLOTS FOR DATA1, DATA2, DATA1-2
    # --------------------------------------------------------------------------

    generate_pair_plots(params, rs, p_vals, merged_datasets_df, p)

    # --------------------------------------------------------------------------
    # WRITE RESULTS FOR DATA1, DATA2, DATA1-2
    # --------------------------------------------------------------------------

    params = write_results(params, rs[:p, :p], p_vals[:p, :p], genomic,
                           (dataset1, dataset1), 'dataset1', y, True)
    if not params['autocorr']:
        params = write_results(params, rs[p:, p:], p_vals[p:, p:], genomic,
                               (dataset2, dataset2), 'dataset2', y, True)
        params = write_results(params, rs[:p, p:], p_vals[:p, p:], genomic,
                               (dataset1, dataset2), 'dataset1_2', y)

    # if 'corr_done' is already in params it was set to False by a failed
    # writing step
    if 'corr_done' not in params:
        params['corr_done'] = True
    return params
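# ------------------------------------------------------------------------------
# NOTE: hugeR.hugeR() wraps the 'huge' R package and is defined elsewhere in the
# repo. As a rough, purely illustrative stand-in for what the graphical lasso
# step estimates (a covariance matrix and a sparse precision matrix at a given
# regularisation strength), scikit-learn's GraphicalLasso could be used. The
# alpha parameter below is assumed to play a role analogous to lambda_threshold;
# this is a sketch, not the pipeline's implementation.
# ------------------------------------------------------------------------------
from sklearn.covariance import GraphicalLasso


def glasso_sketch(X, alpha=0.1):
    """Estimate covariance and sparse precision matrices from standardised X."""
    model = GraphicalLasso(alpha=alpha).fit(X)
    return model.covariance_, model.precision_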