Esempio n. 1
0
    def capture_ancestry(self,
                         num_of_pcs=2,
                         covars_to_regress=None,
                         save_file=None):
        """
        Run EPISTRUCTURE: keep only the informative sites, optionally regress
        out covariates, run PCA and keep the first num_of_pcs PCs.

        num_of_pcs - number of leading principal components to keep
        covars_to_regress - passed to meth_data.get_covariates_subset; when
                            that returns None no covariates are regressed out
        save_file - when truthy, the samples ids and the selected PCs are
                    written to this file (tab separated, samples X PCs)

        The selected PCs are stored in self.components whether or not they
        were saved to a file.
        Note that self.meth_data.include (and regress_out, when covariates
        are found) are called on self.meth_data itself.
        """
        logging.info("Running EPISTRUCTURE...")
        logging.info("Removing non-informative sites...")
        self.meth_data.include(self.informative_sites)

        covars = self.meth_data.get_covariates_subset(covars_to_regress)
        if covars is not None:
            logging.info("Regressing out covariates...")
            self.meth_data.regress_out(covars)
        else:
            logging.info("Ignoring covariates...")

        logging.info("Running PCA...")
        # meth_data should be transposed before passing to pca
        pca_out = pca.PCA(self.meth_data.data.transpose())

        # Select the PCs before the save branch: they are needed for
        # self.components even when no output file was requested (previously
        # `pcs` was only defined when save_file was given, so the default
        # call failed with UnboundLocalError).
        pcs = pca_out.P[:, range(num_of_pcs)]

        if save_file:
            output_filename = save_file
            data_to_save = column_stack((self.meth_data.samples_ids, pcs))
            # one column for the sample id followed by one column per PC
            fmt = '%-12s' + '\t%-12s' * num_of_pcs
            savetxt(output_filename, data_to_save,
                    fmt=fmt)  # saves it as samples X PCs
            logging.info("The first %s PCs were saved to %s.",
                         num_of_pcs, output_filename)

        self.components = pcs
Esempio n. 2
0
def low_rank_approximation(O, k):
    """
    Run PCA on O and return the product of the first k columns of the PCA
    output's P with the transpose of the first k columns of its U.
    The time taken by the PCA and by the product is logged at debug level.

    O dimensions are n X m
    """
    import time

    pca_started = time.time()
    decomposition = pca.PCA(O)
    pca_finished = time.time()

    leading_p = decomposition.P[:, :k]
    leading_u = decomposition.U[:, :k]
    approximation = dot(leading_p, leading_u.transpose())
    dot_finished = time.time()

    pca_seconds = pca_finished - pca_started
    dot_seconds = dot_finished - pca_finished
    logging.debug("PCA TOOK %s SECONDS AND DOT(MULTI) TOOK %s SECONDS" %
                  (pca_seconds, dot_seconds))
    return approximation
Esempio n. 3
0
    def __init__(self):
        """
        Run PCA on the data loaded from self.DATA_FILE and assert, for each of
        the first 10 PCs, that tools.correlation of the computed PC with the
        matching column of the matrix loaded from self.PCA_P_RES is truthy.
        """
        logging.info("Testing Started on PCATester")
        expected_p = loadtxt(self.PCA_P_RES)

        loader = methylation_data.MethylationDataLoader(
            datafile=self.DATA_FILE)
        samples_by_sites = loader.data.transpose()
        pca_result = pca.PCA(samples_by_sites)

        pcs_to_check = 10
        for pc in range(pcs_to_check):
            computed_pc = pca_result.P[:, pc]
            reference_pc = expected_p[:, pc]
            assert tools.correlation(computed_pc, reference_pc)

        logging.info("PASS")
        logging.info("Testing Finished on PCATester")
Esempio n. 4
0
    def exclude_maxpcstds(self, pcstds):
        """
        Remove samples whose value on given principal components is too extreme.

        pcstds is a list of lists (or tuples) of the form (pc_index, std_num), where pc_index is counted from 1.
        For every pair, samples whose value on that PC is above std_num times the PC's std, or below minus
        std_num times the PC's std, are collected. All collected samples are then removed in one call.

        for example, let pcstds be [(1,3),(5,4)] - that will exclude samples that are above 3 STDs or below -3 STDs
        on pc 1 and samples that are above 4 STDs or below -4 STDs on pc 5
        """
        pca_result = pca.PCA(self.data.transpose())

        outlier_indices = set()
        for pc_number, stds_allowed in pcstds:
            logging.info("Finding samples with standard deviation (STD) higher than %d STDs or lower than %d STDs in principal component number %d..." % (stds_allowed, -1 * stds_allowed, pc_number))
            # the user counts PCs from 1 while python counts from 0
            scores = pca_result.P[:, pc_number - 1]
            threshold = stds_allowed * std(scores)
            too_high = scores > threshold
            too_low = scores < -1 * threshold
            outlier_indices.update(where(too_high | too_low)[0])

        if not outlier_indices:
            # nothing exceeded the thresholds - leave the data untouched
            return

        logging.info("Excluding samples according to STDs...")
        self.remove_samples_indices(list(outlier_indices))
Esempio n. 5
0
    def run(self, args, meth_data):
        """
        Run PCA on the methylation data and draw a scatter plot of the first PCs.

        args - reads args.out (output file; self.SCATTER_OUTPUT_FILE is used
               when it is falsy) and args.numpcs (number of PCs to plot)
        meth_data - reads meth_data.samples_size and meth_data.data

        Raises AssertionError when args.numpcs + 1 is not smaller than
        meth_data.samples_size.
        Any exception raised while validating, running PCA or plotting is
        logged and re-raised.
        """
        output_filename = args.out if args.out else self.SCATTER_OUTPUT_FILE

        try:
            # Explicit check instead of an `assert` statement so the
            # validation is not stripped when python runs with -O.
            # AssertionError is kept so callers see the same exception type.
            if not args.numpcs + 1 < meth_data.samples_size:
                raise AssertionError(
                    "number of PCs to plot (%s) plus one must be smaller "
                    "than the number of samples (%s)" %
                    (args.numpcs, meth_data.samples_size))

            logging.info("Running PCA...")
            # meth_data should be transposed before passing to pca
            pca_out = pca.PCA(meth_data.data.transpose())

            logging.info("Plotting first %s PCs...", args.numpcs)
            pca_scatter_plot = plot.PCAScatterPlot(pca_out,
                                                   plots_number=args.numpcs,
                                                   save_file=output_filename)
            pca_scatter_plot.draw()

        except Exception:
            logging.exception("In pca plot parser")
            raise
Esempio n. 6
0
    def _refactor(self):
        """
        run refactor:
            exclude bad probes
            remove sites with low std
            run feature selection
            compute the ReFACTor components (PCA on the best t sites)
            save the ranked list of the data features
            save the ReFACTor components together with the samples ids

        returns the components and the ranked list
        """
        self._exclude_bad_probes()
        # TODO nan are not supported - call self.meth_data.remove_missing_values_sites() here when supported
        self.meth_data.remove_lowest_std_sites(self.stdth)
        # TODO nan are not supported - call self.meth_data.replace_missing_values_by_mean() here when supported

        # feature selection
        ranking = self._feature_selection()

        logging.info('Computing the ReFACTor components...')
        top_sites = ranking[:self.t]  # indices of the best t sites
        selected_data = self.meth_data.data[top_sites, :]
        # the data is transposed before it is passed to pca
        scores = pca.PCA(selected_data.transpose()).P

        logging.info('Saving a ranked list of the data features to %s...' %
                     self.ranked_output_filename)
        cpgnames = self.meth_data.cpgnames
        ranked_names = [cpgnames[site] for site in ranking]
        savetxt(self.ranked_output_filename, ranked_names, fmt='%s')

        logging.info('Saving the ReFACTor components to %s...' %
                     self.components_output_filename)
        refactor_components = scores[:, :self.num_components]
        # first column is the samples ids, followed by the components
        labelled_components = column_stack(
            (self.meth_data.samples_ids, refactor_components))
        savetxt(self.components_output_filename, labelled_components,
                fmt='%s')

        return refactor_components, ranking