Ejemplo n.º 1
0
    def readfile(self, filename):
        """Load a delimited text file into a 2D matrix of strings.

        Uses self.DELIMITER as the field separator and terminates the
        run (via common.terminate) if the parsed data is not a 2D matrix.
        """
        matrix = common.loadtxt(filename, dtype=str, delimiter=self.DELIMITER)

        # Anything other than a rows-by-columns matrix means the file is malformed.
        if matrix.ndim != 2:
            common.terminate("Something wrong with the data in file %s. It is not 2D matrix." % filename)

        return matrix
Ejemplo n.º 2
0
 def run(self, args, meth_data, output_perfix = None):
     """Run EPISTRUCTURE on the methylation data and return its components.

     args          -- parsed command-line arguments (reads args.savepcs, args.covar)
     meth_data     -- the methylation data object to analyze
     output_perfix -- optional prefix for the output filename; when None the
                      bare EPISTRUCTURE_FILE_SUFFIX is used

     Returns self.module.components; logs and re-raises on any failure.
     """
     output_filename = epistructure.EPISTRUCTURE_FILE_SUFFIX if output_perfix is None else output_perfix + "." + epistructure.EPISTRUCTURE_FILE_SUFFIX
     try:
         # Fix: the informative sites were previously loaded twice (a bare
         # loadtxt call immediately overwritten by common.loadtxt); only the
         # common.loadtxt result was ever used, so the redundant call is gone.
         informative_sites = common.loadtxt(INFORMATIVE_ANCESTRY_CPG_LIST, dtype = str)
         self.module = epistructure.Epistructure(meth_data, informative_sites)
         self.module.capture_ancestry(args.savepcs, args.covar, output_filename)
         return self.module.components
     except Exception :
         logging.exception("in epistructure")
         raise
Ejemplo n.º 3
0
    def preprocess_sites_data(self):
        """Apply the user-requested CpG-site filters to self.module.

        Filters are applied in a fixed order: explicit include/exclude lists,
        mean/std thresholds, then the predefined exclusion lists (X/Y
        chromosome sites, non-specific probes, polymorphic CpGs).
        """
        args = self.args

        if args.include is not None:
            self.module.include(self.include_list)
        if args.exclude is not None:
            self.module.exclude(self.exclude_list)

        # Threshold-based filtering on site mean and standard deviation.
        if args.minmean is not None:
            self.module.exclude_sites_with_low_mean(args.minmean)
        if args.maxmean is not None:
            self.module.exclude_sites_with_high_mean(args.maxmean)
        if args.minstd is not None:
            self.module.remove_lowest_std_sites(args.minstd)

        # Predefined exclusion lists: (enabled flag, log message, sites file).
        predefined_filters = (
            (args.rmxy,
             "Searching for sites from X and Y chromosomes to exclude...",
             HUMAN_X_Y),
            (args.rmns,
             "Searching for non-specific sites to exclude...",
             NONSPECIFIC_PROBES),
            (args.rmpoly,
             "Searching for polymorphic sites to exclude...",
             POLYMORPHIC_CPGS),
        )
        for enabled, message, sites_file in predefined_filters:
            if enabled:
                logging.info(message)
                self.module.exclude(common.loadtxt(sites_file, dtype=str))
Ejemplo n.º 4
0
    def _load_and_validate_ids_in_file(self,
                                       fileobj,
                                       optional_ids_list,
                                       dim=1):
        """
        Loads a vector file containing an ids list and validates it.

        Validates that the file contains a vector of dimension dim;
        warns if there are duplicate ids in the file or if there are ids
        which are not found in optional_ids_list; terminates the run if
        the file does not hold a vector at all.

        fileobj may be an already-open file object or a path string.
        Returns a flat sequence of id strings.
        """
        # NOTE(review): `file` is the Python 2 builtin type; this module
        # appears to target Python 2 — confirm before porting.
        if not isinstance(fileobj, file):
            fileobj = open(fileobj, 'r')

        logging.info("Loading file %s..." % fileobj.name)
        try:
            data = common.loadtxt(fileobj.name, dtype=str)
        except Exception:
            # Fix: the original formatted a one-placeholder string with the
            # two-tuple (fileobj.name, dim), raising a TypeError that masked
            # the intended error message. Only the filename belongs here.
            common.terminate(
                "There was an error reading the file '%s', make sure you seperate the values with space, tab or comma"
                % fileobj.name)

        if data.ndim == 0:  # file contains only one item
            data = [data.item()]
        elif data.ndim == 2:
            # A 2D array that is really a single row or column can be
            # flattened into the expected vector; anything else is rejected.
            if len(data[:, 0]) * len(data[0, :]) == data.size:
                data = data.reshape(-1, )
            else:
                common.terminate("The file '%s' is not a %sd vector" %
                                 (fileobj.name, dim))

        # Warn (do not fail) on duplicate ids.
        data_set = set(data)
        if len(data) != len(data_set):
            logging.warning(
                "The file %s contains some samples more than once" %
                fileobj.name)

        # Warn about ids that do not appear in the reference id list.
        diff = data_set.difference(set(optional_ids_list))
        if diff != set([]):
            logging.warning(
                "The file %s contains samples that were not found in the data file: %s"
                % (fileobj.name, diff))

        return data
Ejemplo n.º 5
0
    def run(self, args, meth_data, pheno, output_perfix, covars=None):
        """Run a linear mixed model (LMM) association test on the methylation data.

        The kinship matrix is either loaded from a user-supplied file
        (args.kinship is a file object) or built from the top ReFACTor-ranked
        sites (args.kinship == 'refactor'). The LMM is then run either
        per-site (logdelta computed for each site) or once over all sites
        (args.oneld). Results are saved in EWAS output format and returned.

        args          -- parsed arguments (reads kinship, t, norm, re, oneld, savepcs/covar via refactor)
        meth_data     -- methylation data object (sites x samples; transposed before testing)
        pheno         -- phenotype vector to test against
        output_perfix -- optional prefix for the results file
        covars        -- optional covariates matrix

        Returns the EWASResultsCreator holding the results; logs and
        re-raises any exception.
        """
        try:
            kinship_data = None

            # NOTE(review): if args.kinship is neither a file object nor the
            # string 'refactor', `kinship` is never assigned and the
            # lmm.LMM(kinship) call below raises NameError — confirm the
            # argument parser guarantees one of the two cases.
            if type(args.kinship) == file:  #kinship is provided via file
                logging.info("loading kinship from %s" % args.kinship.name)
                kinship = common.loadtxt(args.kinship, dtype=float)

            elif args.kinship == 'refactor':  # kinship and data to test are the same
                # todo if --lmm provided with --refactor there is no need to run refactor twice in order to find ranked sites.
                logging.info("Running lmm with ReFACTor kinship...")
                refactor_meth_data = meth_data.copy()
                self.refactor.run(args, refactor_meth_data, output_perfix)

                logging.info(
                    "Using best %s sites suggested by ReFACTor as the data for constructing the kinship..."
                    % args.t)
                t_best_sites = self.refactor.module.ranked_sites[:args.t]

                data_for_kinship = meth_data.copy()
                data_for_kinship.include(t_best_sites)

                # all data is of dimensions n samplesX m sites
                kinship_data = data_for_kinship.data.transpose()
                kinship = lmm.KinshipCreator(
                    kinship_data,
                    is_normalized=False).create_standard_kinship()

            # all data is of dimensions n samplesX m sites
            data = meth_data.data.transpose()  # data to test

            # initialize lmm with kinship
            module = lmm.LMM(kinship)
            logging.info('Running LMM...')

            t0 = time.time()
            if not args.oneld:  # run lmm for each site so logdelta will be calculated for each site (TODO sometime move this option as an argument of LMM class and not of the parser (now, parser calls LMM class with different site each time thats patchy)
                logging.info(
                    "LMM is calculating logdelta for each site; this may take a while..."
                )
                # Per-site accumulators; filled one entry per CpG site below.
                cpgnames = []
                pvalues = []
                intercepts_betas = []
                covars_betas = []
                sites_betas = []
                sigmas_g = []
                sigmas_e = []
                stats = []

                for i in range(meth_data.sites_size):
                    data_site_i = data[:, i].reshape(
                        (-1, 1))  # n samples by 1 site
                    # module.run returns per-site tuples; each element is a
                    # length-1 sequence, hence the [0] indexing below.
                    res = module.run(data_site_i, pheno, covars,
                                     [meth_data.cpgnames[i]], args.norm,
                                     not args.re)
                    cpgname, pvalue, intercept_beta, covariates_beta, site_beta, sigma_e, sigma_g, statistic = res

                    cpgnames.append(cpgname[0])
                    pvalues.append(pvalue[0])
                    intercepts_betas.append(intercept_beta[0])
                    covars_betas.append(covariates_beta[0])
                    sites_betas.append(site_beta[0])
                    sigmas_e.append(sigma_e[0])
                    sigmas_g.append(sigma_g[0])
                    stats.append(statistic[0])

            else:  # run lmm on all data - logdelta is calculated once.
                #run lmm
                logging.info("Computing log delta...")
                lmm_results = module.run(data, pheno, covars,
                                         meth_data.cpgnames, args.norm,
                                         not args.re)
                cpgnames, pvalues, intercepts_betas, covars_betas, sites_betas, sigmas_e, sigmas_g, stats = lmm_results

            logging.debug("LMM is done in %0.2f seconds" % (time.time() - t0))

            # generate result - by EWAS output format
            ewas_res = ewas.EWASResultsCreator("LMM", array(cpgnames), array(pvalues), statistic = array(stats),\
                                              intercept_coefs = array(intercepts_betas), covars_coefs = array(covars_betas), \
                                              site_coefs = array(sites_betas), sigma_g = array(sigmas_g), sigma_e = array(sigmas_e))

            # save results
            output_file = "results" + LMM_OUT_SUFFIX if output_perfix is None else output_perfix + LMM_OUT_SUFFIX
            ewas_res.save(output_file)

            return ewas_res

        except Exception:
            logging.exception("in lmm")
            raise