def capture_ancestry(self, num_of_pcs=2, covars_to_regress=None, save_file=None):
    """
    Run EPISTRUCTURE and keep the first principal components.

    Steps, in order:
      1. restrict self.meth_data to self.informative_sites
      2. regress out the covariates selected by covars_to_regress (skipped
         when get_covariates_subset returns None)
      3. run PCA on the transposed data matrix
      4. store the first num_of_pcs columns of the PCA output's P in
         self.components
      5. optionally save them to a text file

    NOTE(review): include() and regress_out() appear to modify
    self.meth_data in place - confirm against methylation_data.

    :param num_of_pcs: number of leading PCs to keep (default 2)
    :param covars_to_regress: passed as-is to
        self.meth_data.get_covariates_subset
    :param save_file: if truthy, path of the file the samples X PCs table
        is written to; if falsy nothing is written
    """
    logging.info("Running EPISTRUCTURE...")
    logging.info("Removing non-informative sites...")
    self.meth_data.include(self.informative_sites)

    covars = self.meth_data.get_covariates_subset(covars_to_regress)
    if covars is not None:
        logging.info("Regressing out covariates...")
        self.meth_data.regress_out(covars)
    else:
        logging.info("Ignoring covariates...")

    logging.info("Running PCA...")
    # meth_data should be transposed before passing to pca
    pca_out = pca.PCA(self.meth_data.data.transpose())

    # Extract the PCs before the optional save: they are needed for
    # self.components even when no output file was requested (the default).
    pcs = pca_out.P[:, range(num_of_pcs)]
    self.components = pcs

    if save_file:
        output_filename = save_file
        data_to_save = column_stack((self.meth_data.samples_ids, pcs))
        # one column for the sample id followed by one column per PC
        fmt = '%-12s' + '\t%-12s' * num_of_pcs
        savetxt(output_filename, data_to_save, fmt=fmt)  # saves it as samples X PCs
        logging.info("The first %s PCs were saved to %s." % (num_of_pcs, output_filename))
def low_rank_approximation(O, k):
    """
    Compute a low rank approximation of O from its PCA output.

    O dimensions are n X m.

    The result is the product of the first k columns of the PCA output's P
    with the transpose of the first k columns of its U. The time spent in
    the PCA and in the matrix product is logged at debug level.

    :param O: matrix handed to pca.PCA
    :param k: number of leading columns of P and U to use
    :return: dot(P[:, 0:k], U[:, 0:k].transpose())
    """
    import time

    pca_start = time.time()
    decomposition = pca.PCA(O)
    pca_end = time.time()

    p_head = decomposition.P[:, 0:k]
    u_head = decomposition.U[:, 0:k]
    approximation = dot(p_head, u_head.transpose())
    dot_end = time.time()

    pca_seconds = pca_end - pca_start
    dot_seconds = dot_end - pca_end
    logging.debug("PCA TOOK %s SECONDS AND DOT(MULTI) TOOK %s SECONDS" % (pca_seconds, dot_seconds))
    return approximation
def __init__(self):
    """
    Run the PCA test as soon as the tester is constructed.

    Loads the stored result from self.PCA_P_RES, runs pca.PCA on the
    transposed data loaded from self.DATA_FILE and asserts that
    tools.correlation is truthy for each of the first 10 columns of the
    computed P versus the stored one.

    :raises AssertionError: on the first column for which
        tools.correlation returns a falsy value
    """
    logging.info("Testing Started on PCATester")

    expected_p = loadtxt(self.PCA_P_RES)
    loaded = methylation_data.MethylationDataLoader(datafile=self.DATA_FILE)
    result = pca.PCA(loaded.data.transpose())

    columns_to_check = 10
    for column in range(columns_to_check):
        computed_column = result.P[:, column]
        expected_column = expected_p[:, column]
        assert tools.correlation(computed_column, expected_column)

    logging.info("PASS")
    logging.info("Testing Finished on PCATester")
def exclude_maxpcstds(self, pcstds):
    """
    Remove samples that are outliers on selected principal components.

    pcstds is a list of (pc_index, std_num) pairs (lists or tuples).
    pc_index is counted from 1. For each pair, a sample is marked when its
    value on that PC is above std_num times the standard deviation of the
    PC, or below minus that amount. Samples marked by any pair are removed
    through self.remove_samples_indices.

    For example, pcstds = [(1, 3), (5, 4)] excludes samples that are above
    3 STDs or below -3 STDs on PC 1, and samples that are above 4 STDs or
    below -4 STDs on PC 5.
    """
    decomposition = pca.PCA(self.data.transpose())
    outlier_indices = set()

    for pc_index, std_num in pcstds:
        logging.info("Finding samples with standard deviation (STD) higher than %d STDs or lower than %d STDs in principal component number %d..." % (std_num, -1 * std_num, pc_index))
        # the user counts PCs from 1 while python counts from 0
        component = decomposition.P[:, pc_index - 1]
        threshold = std_num * std(component)
        too_high = component > threshold
        too_low = component < -1 * threshold
        outlier_indices.update(where(too_high | too_low)[0])

    if not outlier_indices:
        return

    logging.info("Excluding samples according to STDs...")
    self.remove_samples_indices(list(outlier_indices))
def run(self, args, meth_data):
    """
    Run PCA on meth_data and draw a scatter plot of the first PCs.

    :param args: parsed arguments; args.numpcs is the number of PCs to
        plot and args.out the output file (self.SCATTER_OUTPUT_FILE is
        used when args.out is falsy)
    :param meth_data: object exposing data and samples_size
    :raises AssertionError: if args.numpcs + 1 is not smaller than
        meth_data.samples_size
    :raises Exception: any error raised while running PCA or plotting is
        logged and re-raised unchanged
    """
    output_filename = args.out if args.out else self.SCATTER_OUTPUT_FILE
    try:
        # Explicit raise instead of an assert statement: asserts are
        # stripped under "python -O", which would silently skip this check.
        # AssertionError is kept so existing callers see the same type.
        if not args.numpcs + 1 < meth_data.samples_size:
            raise AssertionError(
                "numpcs + 1 must be smaller than samples_size (numpcs=%s, samples_size=%s)"
                % (args.numpcs, meth_data.samples_size))

        logging.info("Running PCA...")
        # meth_data should be transposed before passing to pca
        pca_out = pca.PCA(meth_data.data.transpose())

        logging.info("Plotting first %s PCs..." % args.numpcs)
        pca_scatter_plot = plot.PCAScatterPlot(pca_out,
                                               plots_number=args.numpcs,
                                               save_file=output_filename)
        pca_scatter_plot.draw()
    except Exception:
        logging.exception("In pca plot parser")
        raise
def _refactor(self):
    """
    Run ReFACTor on self.meth_data.

    Steps, in order:
      - exclude bad probes
      - remove the sites with the lowest std (threshold self.stdth)
      - run feature selection to get a ranked list of site indices
      - run PCA on the best self.t sites and keep the first
        self.num_components columns of the PCA output's P
      - save the ranked site names to self.ranked_output_filename
      - save the sample ids with their components to
        self.components_output_filename

    :return: tuple (components, ranked_list)
    """
    self._exclude_bad_probes()
    # TODO: call self.meth_data.remove_missing_values_sites() here once nan are supported
    self.meth_data.remove_lowest_std_sites(self.stdth)
    # TODO: call self.meth_data.replace_missing_values_by_mean() here once nan are supported

    # feature selection
    ranked_list = self._feature_selection()

    logging.info('Computing the ReFACTor components...')
    best_sites = ranked_list[:self.t]  # indices of the best t sites
    best_sites_data = self.meth_data.data[best_sites, :]
    score = pca.PCA(best_sites_data.transpose()).P

    logging.info('Saving a ranked list of the data features to %s...' % self.ranked_output_filename)
    site_names = self.meth_data.cpgnames
    ranked_names = [site_names[site_index] for site_index in ranked_list]
    savetxt(self.ranked_output_filename, ranked_names, fmt='%s')

    logging.info('Saving the ReFACTor components to %s...' % self.components_output_filename)
    components = score[:, :self.num_components]
    labelled_components = column_stack((self.meth_data.samples_ids, components))
    savetxt(self.components_output_filename, labelled_components, fmt='%s')

    return components, ranked_list