def readfile(self, filename):
    data = common.loadtxt(filename, dtype=str, delimiter=self.DELIMITER)
    if data.ndim != 2:
        common.terminate("Something is wrong with the data in file %s: it is not a 2D matrix." % filename)
    return data
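# A minimal standalone sketch of the same load-and-validate pattern, assuming common.loadtxt
# behaves like numpy.loadtxt (the wrapper's exact behavior may differ). The function name and
# the tab delimiter default below are illustrative, not part of the repo's API.
import numpy as np

def read_matrix_sketch(filename, delimiter="\t"):
    data = np.loadtxt(filename, dtype=str, delimiter=delimiter)
    # reject anything that is not a 2D matrix (e.g. a single row or a single value)
    if data.ndim != 2:
        raise ValueError("Something is wrong with the data in file %s: it is not a 2D matrix." % filename)
    return data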
def run(self, args, meth_data, output_perfix = None):
    output_filename = epistructure.EPISTRUCTURE_FILE_SUFFIX if output_perfix is None else output_perfix + "." + epistructure.EPISTRUCTURE_FILE_SUFFIX
    try:
        informative_sites = common.loadtxt(INFORMATIVE_ANCESTRY_CPG_LIST, dtype=str)
        self.module = epistructure.Epistructure(meth_data, informative_sites)
        self.module.capture_ancestry(args.savepcs, args.covar, output_filename)
        return self.module.components
    except Exception:
        logging.exception("in epistructure")
        raise
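# A rough standalone sketch of the idea behind capture_ancestry: restrict the methylation matrix
# to a predefined list of ancestry-informative CpGs and take the first few principal components
# as ancestry estimates. The actual Epistructure module (including any covariate adjustment it
# performs) may differ; the function name and arguments below are illustrative only.
import numpy as np

def ancestry_pcs_sketch(meth_matrix, cpgnames, informative_sites, num_pcs=1):
    # meth_matrix: m sites by n samples; keep only the ancestry-informative sites
    keep = np.isin(cpgnames, informative_sites)
    X = meth_matrix[keep].T                      # n samples by k informative sites
    X = X - X.mean(axis=0)                       # center each site across samples
    U, s, Vt = np.linalg.svd(X, full_matrices=False)
    return U[:, :num_pcs] * s[:num_pcs]          # principal component scores per sample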
def preprocess_sites_data(self):
    if self.args.include is not None:
        self.module.include(self.include_list)
    if self.args.exclude is not None:
        self.module.exclude(self.exclude_list)

    # exclude sites by mean and standard deviation thresholds
    if self.args.minmean is not None:
        self.module.exclude_sites_with_low_mean(self.args.minmean)
    if self.args.maxmean is not None:
        self.module.exclude_sites_with_high_mean(self.args.maxmean)
    if self.args.minstd is not None:
        self.module.remove_lowest_std_sites(self.args.minstd)

    if self.args.rmxy:
        logging.info("Searching for sites from X and Y chromosomes to exclude...")
        self.module.exclude(common.loadtxt(HUMAN_X_Y, dtype=str))
    if self.args.rmns:
        logging.info("Searching for non-specific sites to exclude...")
        self.module.exclude(common.loadtxt(NONSPECIFIC_PROBES, dtype=str))
    if self.args.rmpoly:
        logging.info("Searching for polymorphic sites to exclude...")
        self.module.exclude(common.loadtxt(POLYMORPHIC_CPGS, dtype=str))
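# A minimal standalone sketch of the site-exclusion pattern used above: drop every row whose CpG
# identifier appears in an exclusion list (e.g. X/Y-chromosome, non-specific or polymorphic probes).
# The real methylation-data exclude method likely does more bookkeeping; this is illustrative only.
import numpy as np

def exclude_sites_sketch(meth_matrix, cpgnames, exclusion_list):
    # meth_matrix: m sites by n samples; cpgnames: the m site identifiers, in row order
    keep = ~np.isin(cpgnames, np.asarray(exclusion_list, dtype=str))
    return meth_matrix[keep], np.asarray(cpgnames)[keep]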
def _load_and_validate_ids_in_file(self, fileobj, optional_ids_list, dim=1):
    """
    Loads a vector file containing a list of IDs and validates that the file
    holds a vector of dimension dim.
    Warns if there are duplicate IDs in the file or if there are IDs which are
    not found in optional_ids_list.
    Fails if the file does not contain a list (vector) of IDs.
    """
    if not isinstance(fileobj, file):
        fileobj = open(fileobj, 'r')

    logging.info("Loading file %s..." % fileobj.name)
    try:
        data = common.loadtxt(fileobj.name, dtype=str)
    except:
        common.terminate("There was an error reading the file '%s'; make sure you separate the values with a space, tab or comma" % fileobj.name)

    if data.ndim == 0:
        # the file contains only one item
        data = [data.item()]
    elif data.ndim == 2:
        # a single row or column read as a 2D array is flattened to a vector;
        # anything with more than one row and more than one column is rejected
        if 1 in data.shape:
            data = data.reshape(-1,)
        else:
            common.terminate("The file '%s' is not a %sd vector" % (fileobj.name, dim))

    data_set = set(data)
    if len(data) != len(data_set):
        logging.warning("The file %s contains some samples more than once" % fileobj.name)

    diff = data_set.difference(set(optional_ids_list))
    if diff != set([]):
        logging.warning("The file %s contains samples that were not found in the data file: %s" % (fileobj.name, diff))

    return data
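# A minimal standalone sketch of the duplicate / unknown-ID warnings issued above, using plain
# Python sets. The helper name and its arguments are illustrative, not part of the repo's API.
import logging

def warn_on_bad_ids_sketch(ids, known_ids, filename):
    if len(set(ids)) != len(ids):
        logging.warning("The file %s contains some samples more than once", filename)
    unknown = set(ids) - set(known_ids)
    if unknown:
        logging.warning("The file %s contains samples that were not found in the data file: %s",
                        filename, sorted(unknown))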
def run(self, args, meth_data, pheno, output_perfix, covars=None):
    try:
        kinship_data = None
        if isinstance(args.kinship, file):
            # the kinship is provided via a file
            logging.info("loading kinship from %s" % args.kinship.name)
            kinship = common.loadtxt(args.kinship, dtype=float)
        elif args.kinship == 'refactor':
            # the kinship and the data to test are the same
            # TODO: if --lmm is provided with --refactor there is no need to run ReFACTor twice in order to find the ranked sites
            logging.info("Running LMM with ReFACTor kinship...")
            refactor_meth_data = meth_data.copy()
            self.refactor.run(args, refactor_meth_data, output_perfix)
            logging.info("Using the best %s sites suggested by ReFACTor as the data for constructing the kinship..." % args.t)
            t_best_sites = self.refactor.module.ranked_sites[:args.t]
            data_for_kinship = meth_data.copy()
            data_for_kinship.include(t_best_sites)

            # all data is of dimensions n samples by m sites
            kinship_data = data_for_kinship.data.transpose()
            kinship = lmm.KinshipCreator(kinship_data, is_normalized=False).create_standard_kinship()

        # all data is of dimensions n samples by m sites
        data = meth_data.data.transpose()  # data to test

        # initialize LMM with the kinship
        module = lmm.LMM(kinship)

        logging.info('Running LMM...')
        t0 = time.time()
        if not args.oneld:
            # run LMM for each site separately so that logdelta is calculated per site
            # (TODO: move this option into the LMM class instead of the parser; having the parser call the LMM class with a different site each time is patchy)
            logging.info("LMM is calculating logdelta for each site; this may take a while...")
            cpgnames = []
            pvalues = []
            intercepts_betas = []
            covars_betas = []
            sites_betas = []
            sigmas_g = []
            sigmas_e = []
            stats = []
            for i in range(meth_data.sites_size):
                data_site_i = data[:, i].reshape((-1, 1))  # n samples by 1 site
                res = module.run(data_site_i, pheno, covars, [meth_data.cpgnames[i]], args.norm, not args.re)
                cpgname, pvalue, intercept_beta, covariates_beta, site_beta, sigma_e, sigma_g, statistic = res
                cpgnames.append(cpgname[0])
                pvalues.append(pvalue[0])
                intercepts_betas.append(intercept_beta[0])
                covars_betas.append(covariates_beta[0])
                sites_betas.append(site_beta[0])
                sigmas_e.append(sigma_e[0])
                sigmas_g.append(sigma_g[0])
                stats.append(statistic[0])
        else:
            # run LMM on all the data - logdelta is calculated once
            logging.info("Computing log delta...")
            lmm_results = module.run(data, pheno, covars, meth_data.cpgnames, args.norm, not args.re)
            cpgnames, pvalues, intercepts_betas, covars_betas, sites_betas, sigmas_e, sigmas_g, stats = lmm_results

        logging.debug("LMM is done in %0.2f seconds" % (time.time() - t0))

        # generate results in the EWAS output format
        ewas_res = ewas.EWASResultsCreator("LMM", array(cpgnames), array(pvalues), statistic = array(stats),\
                                           intercept_coefs = array(intercepts_betas), covars_coefs = array(covars_betas), \
                                           site_coefs = array(sites_betas), sigma_g = array(sigmas_g), sigma_e = array(sigmas_e))

        # save results
        output_file = "results" + LMM_OUT_SUFFIX if output_perfix is None else output_perfix + LMM_OUT_SUFFIX
        ewas_res.save(output_file)
        return ewas_res

    except Exception:
        logging.exception("in lmm")
        raise
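# A minimal numpy sketch of a "standard" kinship matrix of the kind KinshipCreator is used for
# above: standardize each site across samples and take K = X X^T / m. Whether this matches
# lmm.KinshipCreator.create_standard_kinship exactly is an assumption; treat it as illustrative.
import numpy as np

def standard_kinship_sketch(X):
    # X: n samples by m sites (e.g. the t best ReFACTor-ranked sites, transposed as above)
    X = np.asarray(X, dtype=float)
    X = (X - X.mean(axis=0)) / X.std(axis=0)   # standardize every site; assumes no zero-variance sites
    return X.dot(X.T) / X.shape[1]             # n-by-n kinship matrix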