Esempio n. 1
0
    def testContentsWithPatternsFromFolders(self):
        """Check Utilities.contentsWithPatternsFromFolder against the dosage_set_1 fixture.

        Two pattern lists are tried on the same folder: one that is asserted
        to yield nothing, and one that is asserted to yield exactly
        "set.sample".
        """
        folder = "tests/_td/dosage_set_1"

        # Pattern list that is expected to select no entry at all.
        unmatched = set(Utilities.contentsWithPatternsFromFolder(folder, ["sample", "Fail"]))
        self.assertEqual(unmatched, set())

        # Pattern list that is expected to select the single fixture entry.
        matched = set(Utilities.contentsWithPatternsFromFolder(folder, ["set", "sample"]))
        self.assertEqual(matched, {"set.sample"})
def run(args):
    """Load the weight db, covariance and beta files, then optionally inspect one gene.

    Reads from ``args``: ``weight_db_path``, ``covariance``, ``beta_folder``,
    ``gene_name`` and ``interactive``.

    When ``args.gene_name`` is set, the gene's data is fetched and
    pre-processed; an interactive shell is opened via ``embed()`` either on
    request (``args.interactive``) or when any exception is raised while
    doing so. Nothing is returned.

    NOTE(review): ``embed`` is presumably IPython's ``embed`` -- confirm
    against the file's imports. If so, the local variable names in this
    function are what the interactive session sees, so renaming them changes
    the debugging experience.
    """
    logging.info("Loading weight db")
    weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(args.weight_db_path)

    logging.info("Loading covariance file")
    covariance_contents = MatrixUtilities.loadMatrixFromFile(args.covariance)

    logging.info("Choosing method")
    # An empty pattern list is passed here; every name returned is later joined
    # to args.beta_folder and loaded as a beta file.
    beta_contents = Utilities.contentsWithPatternsFromFolder(args.beta_folder, [])
    # Neither returned value is referenced again in this function body.
    zscore_calculation, normalization = MethodGuessing.chooseZscoreSchemeFromFiles(args.beta_folder, beta_contents, covariance_contents, weight_db_logic)

    logging.info("Processing")
    # betas maps: content name -> {data set name -> data set}
    betas = {}
    for content in beta_contents:
        logging.info("Loading betas")
        beta_path = os.path.join(args.beta_folder, content)
        beta_sets = KeyedDataSet.KeyedDataSetFileUtilities.loadDataSetsFromCompressedFile(beta_path, header="")
        # Index the loaded data sets by their ``name`` attribute.
        beta_sets = {set.name: set for set in beta_sets}
        betas[content] = beta_sets

    if args.gene_name:
        try:
            # Note: this rebinds ``beta_sets`` to the value returned for the gene.
            gene_data, weights, covariance_matrix, valid_rsids, beta_sets = get_gene_data(args.gene_name, weight_db_logic, covariance_contents, betas)
            weight_values, variances = ZScoreCalculation.preProcess(covariance_matrix, valid_rsids, weights, beta_sets)
            if args.interactive:
                embed()
            logging.info("Processed gene data")
        # ``e`` is not referenced below but remains bound while embed() runs --
        # presumably so the failure can be inspected interactively; confirm.
        except Exception as e:
            logging.info("Couldn't get gene data")
            embed()
Esempio n. 3
0
    def processPrediXcanFiles(self):
        """Build filtered output files from every dosage file in the dosage folder.

        Loads all people from ``self.samples_input`` and the selected people
        from ``self.samples_output``, loads the snp list from ``self.snp_list``,
        and then, for each entry of ``self.dosage_folder`` returned for the
        "dosage.txt.gz" pattern, runs the builder that corresponds to
        ``self.output_format``.

        Raises:
            Exceptions.InvalidOutputFormat: when a dosage file is about to be
                processed and ``self.output_format`` is neither
                ``Formats.IMPUTE`` nor ``Formats.PrediXcan``.
        """
        logging.info("Loading people")
        all_people = Person.Person.loadPeople(self.samples_input, '\t', False)
        selected_people = Person.Person.loadPeople(self.samples_output)
        selected_people_by_id = {p.id: p for p in selected_people}
        logging.info("%d total people, %d selected", len(all_people),
                     len(selected_people_by_id))

        logging.info("Loading snps")
        snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(
            self.snp_list)
        # Dictionary used purely as a membership lookup of snp keys.
        snp_dict = {k: True for k in snp_data_set.data}
        # Single-argument call form prints identically under Python 2 and 3.
        print(len(snp_dict))

        contents = Utilities.contentsWithPatternsFromFolder(
            self.dosage_folder, ["dosage.txt.gz"])
        for content_name in contents:
            input_path = os.path.join(self.dosage_folder, content_name)
            fileBuilder = PrediXcanFormatUtilities.PrediXcanFormatFilteredFilesProcess(
                input_path, self.output_folder, content_name, all_people,
                selected_people_by_id, snp_dict)
            # The branches must be mutually exclusive: with two independent
            # ``if`` statements a successful IMPUTE build fell through to the
            # ``else`` of the second test and raised InvalidOutputFormat.
            if self.output_format == Formats.IMPUTE:
                fileBuilder.buildIMPUTE()
            elif self.output_format == Formats.PrediXcan:
                fileBuilder.buildPrediXcan()
            else:
                raise Exceptions.InvalidOutputFormat(self.output_format)
Esempio n. 4
0
    def resultsFromCovarianceFile(self, weight_db_logic):
        """Compute one result entry per gene from the covariance file and the beta files.

        Args:
            weight_db_logic: object exposing ``weights_by_gene`` and
                ``genes_for_an_rsid``; it is also forwarded to
                ``selectMethod``, ``buildEntry`` and ``fillBlanks``.

        Returns:
            A ``(results, normalization_constant)`` tuple, where ``results``
            maps gene to the value built by ``self.buildEntry``.
        """
        logging.info("Loading covariance file from %s", self.covariance)
        loaded_covariances = MatrixUtilities.loadMatrixFromFile(self.covariance)
        # Drop every covariance whose gene has no model in the weight db.
        covariance_contents = {
            gene_key: gene_entry
            for gene_key, gene_entry in loaded_covariances.iteritems()
            if gene_key in weight_db_logic.weights_by_gene
        }

        beta_contents = Utilities.contentsWithPatternsFromFolder(self.folder_beta, [])
        zscore_calculation, normalization = self.selectMethod(self.folder_beta, beta_contents, covariance_contents, weight_db_logic)

        # Progress is measured as the number of distinct model snps seen so far.
        snp_total = len(weight_db_logic.genes_for_an_rsid)
        snps_found = set()
        reporter = Utilities.PercentReporter(logging.INFO, snp_total)

        results = {}
        for beta_name in beta_contents:
            logging.info("Processing %s", beta_name)

            beta_path = os.path.join(self.folder_beta, beta_name)
            loaded_sets = KeyedDataSet.KeyedDataSetFileUtilities.loadDataSetsFromCompressedFile(beta_path, header="")
            beta_sets = {data_set.name: data_set for data_set in loaded_sets}
            # Any one of the data sets serves as the reference for "is this rsid in the file".
            _, check = next(beta_sets.iteritems())
            normalization.update(beta_sets)

            for gene, entry in covariance_contents.iteritems():
                # Covariance files may list more genes than the database holds.
                if gene not in weight_db_logic.weights_by_gene:
                    logging.log(8, "Gene %s not in weights", gene)
                    continue

                weights = weight_db_logic.weights_by_gene[gene]
                present = [rsid for rsid, _ in weights.iteritems() if rsid in check.values_by_key]
                if not present:
                    logging.log(5, "No rsid in beta file for %s", gene)
                    continue

                if gene in results:
                    logging.info("Gene %s already processed", gene)
                    continue

                covariance_matrix, valid_rsids = entry[0], entry[1]

                logging.log(7, "Calculating z score for %s", gene)

                pre_zscore, n, VAR_g, effect_size = zscore_calculation(gene, weights, beta_sets, covariance_matrix, valid_rsids)
                results[gene] = self.buildEntry(gene, weight_db_logic, weights, pre_zscore, n, VAR_g, effect_size)

                snps_found.update(present)
                reporter.update(len(snps_found), "%d %% of model's snps found so far in the gwas study")

        # Second pass: genes that did not show up in any beta file.
        self.fillBlanks(results, covariance_contents, weight_db_logic, zscore_calculation)
        return results, normalization.calculateNormalization()
Esempio n. 5
0
    def resultsFromCovarianceFile(self, weight_db_logic):
        """Compute one result entry per gene from the covariance file and the beta files.

        Genes listed in the covariance file but absent from
        ``weight_db_logic.weights_by_gene`` are skipped with a low-level log
        message instead of raising ``KeyError``. Progress is reported after
        each gene is processed, so the reported count includes that gene.

        Args:
            weight_db_logic: object exposing ``weights_by_gene``; it is also
                forwarded to ``selectMethod``, ``buildEntry`` and
                ``fillBlanks``.

        Returns:
            A ``(results, normalization_constant)`` tuple, where ``results``
            maps gene to the value built by ``self.buildEntry``.
        """
        results = {}

        logging.info("Loading covariance file")
        covariance_contents = MatrixUtilities.loadMatrixFromFile(self.covariance)

        beta_contents = Utilities.contentsWithPatternsFromFolder(self.folder_beta, [])
        zscore_calculation, normalization = self.selectMethod(self.folder_beta, beta_contents, covariance_contents, weight_db_logic)

        total_entries = len(covariance_contents)
        reporter = Utilities.PercentReporter(logging.INFO, total_entries)
        processed_genes = 0
        for beta_name in beta_contents:
            logging.info("Processing %s", beta_name)

            beta_path = os.path.join(self.folder_beta, beta_name)

            beta_sets = KeyedDataSet.KeyedDataSetFileUtilities.loadDataSetsFromCompressedFile(beta_path, header="")
            beta_sets = {data_set.name: data_set for data_set in beta_sets}
            # Any one of the data sets serves as the reference for rsid membership.
            _, check = next(beta_sets.iteritems())
            normalization.update(beta_sets)

            for gene, entry in covariance_contents.iteritems():
                # A covariance file may list genes the weight db does not
                # know; skip them rather than fail on the lookup below.
                if gene not in weight_db_logic.weights_by_gene:
                    logging.log(8, "Gene %s not in weights", gene)
                    continue

                weights = weight_db_logic.weights_by_gene[gene]
                # Only genes with at least one model rsid in this beta file are processed.
                if not any(rsid in check.values_by_key for rsid, _ in weights.iteritems()):
                    logging.log(5, "No rsid in beta file for %s", gene)
                    continue

                if gene in results:
                    logging.info("Gene %s already processed", gene)
                    continue

                covariance_matrix = entry[0]
                valid_rsids = entry[1]

                logging.log(7, "Calculating z score for %s", gene)

                pre_zscore, n, VAR_g = zscore_calculation(gene, weights, beta_sets, covariance_matrix, valid_rsids)
                results[gene] = self.buildEntry(gene, weight_db_logic, weights, pre_zscore, n, VAR_g)

                # Report once the gene is accounted for, so the figure does not
                # lag one gene behind. Snp coverage is proxied by gene count.
                processed_genes += 1
                reporter.update(processed_genes, "%d %% of model's snp information found so far in the gwas study")

        # Second pass: genes that did not show up in any beta file.
        self.fillBlanks(results, covariance_contents, weight_db_logic, zscore_calculation)
        normalization_constant = normalization.calculateNormalization()
        return results, normalization_constant
Esempio n. 6
0
    def run(self):
        """Build the output from every "gz" entry of ``self.data_folder_phase``.

        Does nothing except log a message when ``self.output_file`` already
        exists. Otherwise the output file's parent directory is created if it
        is missing, and ``self.buildVarianceDB`` is called once per entry.
        The weight db is only loaded when ``self.weight_db`` is truthy;
        otherwise ``None`` is passed to ``buildVarianceDB``.
        """
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db) if self.weight_db else None
        contents = Utilities.contentsWithPatternsFromFolder(self.data_folder_phase, ["gz"])

        if os.path.exists(self.output_file):
            logging.info("Correlations output already exists, delete it if you want stuff to be figured out again")
            return

        # os.path.dirname() returns "" for a bare file name; os.makedirs("")
        # would raise, and the current directory already exists anyway.
        output_dir = os.path.dirname(self.output_file)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for content in contents:
            self.buildVarianceDB(weight_db_logic, content)
Esempio n. 7
0
    def run(self):
        """Build the output from every "gz" entry of ``self.data_folder_phase``.

        Does nothing except log a message when ``self.output_file`` already
        exists. Otherwise the output file's parent directory is created if it
        is missing, and ``self.buildVarianceDB`` is called once per entry.
        The weight db is only loaded when ``self.weight_db`` is truthy;
        otherwise ``None`` is passed to ``buildVarianceDB``.
        """
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
            self.weight_db) if self.weight_db else None
        contents = Utilities.contentsWithPatternsFromFolder(
            self.data_folder_phase, ["gz"])

        if os.path.exists(self.output_file):
            logging.info(
                "Variance output already exists, delete it if you want stuff to be figured out again"
            )
            return

        # os.path.dirname() returns "" for a bare file name; os.makedirs("")
        # would raise, and the current directory already exists anyway.
        output_dir = os.path.dirname(self.output_file)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for content in contents:
            self.buildVarianceDB(weight_db_logic, content)
Esempio n. 8
0
    def run(self):
        """Write per-gene covariance statistics to ``self.output_file`` as CSV.

        The first ".gz" entry found in ``self.folder_covariance`` is loaded as
        the covariance file. For every gene in it, one row is collected with
        the columns named in the header written below. All rows are written
        after the whole covariance file has been processed.
        """
        logging.info("Loading weight db")
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.weight_db_path)

        logging.info("Loading covariance file")
        covariance_name = Utilities.contentsWithPatternsFromFolder(self.folder_covariance, [".gz"])[0]
        covariance_path = os.path.join(self.folder_covariance, covariance_name)
        covariance_contents = MatrixUtilities.loadMatrixFromFile(covariance_path)

        logging.info("Getting stats")
        rows = []
        for gene, entry in covariance_contents.iteritems():
            covariance_matrix, valid_rsids = entry[0], entry[1]

            weights = weight_db_logic.weights_by_gene_name[gene]
            weight_values, _ = ZScoreCalculation.preProcess(covariance_matrix, valid_rsids, weights)

            weights_t = numpy.transpose(weight_values)
            w_w = numpy.dot(weights_t, weight_values)
            w_gamma_w = numpy.dot(numpy.dot(numpy.transpose(weight_values), covariance_matrix), weight_values)
            det = numpy.linalg.det(covariance_matrix)

            eigenvalues, _ = numpy.linalg.eigh(covariance_matrix)
            eigenmax = numpy.amax(eigenvalues)
            eigenmin = numpy.amin(eigenvalues)
            # Number of eigenvalues below the 1e-7 threshold.
            n_small = sum(1 for eigen in eigenvalues if eigen < 1e-7)
            mean_var = numpy.mean(covariance_matrix.diagonal())

            # Field order must match the header written below.
            rows.append((
                gene,
                str(len(weight_values)),
                str(float(w_gamma_w)),
                str(float(det)),
                str(float(w_w)),
                str(float(mean_var)),
                str(float(eigenmin)),
                str(float(eigenmax)),
                str(n_small),
            ))

        logging.info("saving results")
        columns = ["gene", "m_snp_count", "w_gamma_w", "det", "w_w", "mean_var", "eigenmin", "eigenmax", "n_eigen_e-7"]
        with open(self.output_file, "w") as output:
            output.write(",".join(columns) + "\n")
            for row in rows:
                output.write(",".join(row) + "\n")
Esempio n. 9
0
    def processPrediXcanFiles(self):
        """Build filtered output files from every dosage file in the dosage folder.

        Loads all people from ``self.samples_input`` and the selected people
        from ``self.samples_output``, loads the snp list from ``self.snp_list``,
        and then, for each entry of ``self.dosage_folder`` returned for the
        "dosage.txt.gz" pattern, runs the builder that corresponds to
        ``self.output_format``.

        Raises:
            Exceptions.InvalidOutputFormat: when a dosage file is about to be
                processed and ``self.output_format`` is neither
                ``Formats.IMPUTE`` nor ``Formats.PrediXcan``.
        """
        logging.info("Loading people")
        all_people = Person.Person.loadPeople(self.samples_input, '\t', False)
        selected_people = Person.Person.loadPeople(self.samples_output)
        selected_people_by_id = {p.id:p for p in selected_people}
        logging.info("%d total people, %d selected", len(all_people), len(selected_people_by_id))

        logging.info("Loading snps")
        snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(self.snp_list)
        # Dictionary used purely as a membership lookup of snp keys.
        snp_dict = {k:True for k in snp_data_set.data}
        # Single-argument call form prints identically under Python 2 and 3.
        print(len(snp_dict))

        contents = Utilities.contentsWithPatternsFromFolder(self.dosage_folder, ["dosage.txt.gz"])
        for content_name in contents:
            input_path = os.path.join(self.dosage_folder, content_name)
            fileBuilder = PrediXcanFormatUtilities.PrediXcanFormatFilteredFilesProcess(input_path, self.output_folder, content_name, all_people, selected_people_by_id, snp_dict)
            # The branches must be mutually exclusive: with two independent
            # ``if`` statements a successful IMPUTE build fell through to the
            # ``else`` of the second test and raised InvalidOutputFormat.
            if self.output_format == Formats.IMPUTE:
                fileBuilder.buildIMPUTE()
            elif self.output_format == Formats.PrediXcan:
                fileBuilder.buildPrediXcan()
            else:
                raise Exceptions.InvalidOutputFormat(self.output_format)
Esempio n. 10
0
    def resultsFromCovarianceFile(self, weight_db_logic):
        """Compute one result entry per gene from the covariance file and the beta files.

        Args:
            weight_db_logic: object exposing ``weights_by_gene``; it is also
                forwarded to ``selectMethod``, ``buildEntry`` and
                ``fillBlanks``.

        Returns:
            A ``(results, normalization_constant)`` tuple, where ``results``
            maps gene to the value built by ``self.buildEntry``.
        """
        logging.info("Loading covariance file from %s", self.covariance)
        covariance_contents = MatrixUtilities.loadMatrixFromFile(
            self.covariance)

        beta_contents = Utilities.contentsWithPatternsFromFolder(
            self.folder_beta, [])
        zscore_calculation, normalization = self.selectMethod(
            self.folder_beta, beta_contents, covariance_contents,
            weight_db_logic)

        reporter = Utilities.PercentReporter(logging.INFO,
                                             len(covariance_contents))
        results = {}
        processed_genes = 0
        for beta_name in beta_contents:
            logging.info("Processing %s", beta_name)

            beta_path = os.path.join(self.folder_beta, beta_name)
            loaded_sets = KeyedDataSet.KeyedDataSetFileUtilities.loadDataSetsFromCompressedFile(
                beta_path, header="")
            beta_sets = {data_set.name: data_set for data_set in loaded_sets}
            # Any one of the data sets serves as the reference for rsid membership.
            _, check = next(beta_sets.iteritems())
            normalization.update(beta_sets)

            for gene, entry in covariance_contents.iteritems():
                # Covariance files may list more genes than the database holds.
                if gene not in weight_db_logic.weights_by_gene:
                    logging.log(8, "Gene %s not in weights", gene)
                    continue

                weights = weight_db_logic.weights_by_gene[gene]
                # Only genes with at least one model rsid in this beta file are processed.
                has_rsid = any(rsid in check.values_by_key
                               for rsid, _ in weights.iteritems())
                if not has_rsid:
                    logging.log(5, "No rsid in beta file for %s", gene)
                    continue

                if gene in results:
                    logging.info("Gene %s already processed", gene)
                    continue

                covariance_matrix, valid_rsids = entry[0], entry[1]

                logging.log(7, "Calculating z score for %s", gene)

                pre_zscore, n, VAR_g, effect_size = zscore_calculation(
                    gene, weights, beta_sets, covariance_matrix, valid_rsids)
                results[gene] = self.buildEntry(gene, weight_db_logic, weights,
                                                pre_zscore, n, VAR_g,
                                                effect_size)

                # Snp coverage is proxied by the count of processed genes.
                processed_genes += 1
                reporter.update(
                    processed_genes,
                    "%d %% of model's snp information found so far in the gwas study"
                )

        # Second pass: genes that did not show up in any beta file.
        self.fillBlanks(results, covariance_contents, weight_db_logic,
                        zscore_calculation)
        return results, normalization.calculateNormalization()
WEIGHT_DB_PATH = "data/DGN-WB_0.5.db"

def loadDosageFile(path):
    """Load the dosage file at *path* and return the result of the loader's ``load()``.

    A fresh ``GWASSNPInfoLineCollector`` is handed to the loader as its
    callback; the second constructor argument is passed as ``True``.
    """
    collector = GWASUtilities.GWASSNPInfoLineCollector()
    return GWASUtilities.GWASDosageFileLoader(path, True, collector).load()

logging.info("Loading weight db")
weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(WEIGHT_DB_PATH)

logging.info("Loading covariance file")
covariance_contents = MatrixUtilities.loadMatrixFromFile(COV)

logging.info("Loading betas")
beta_contents = Utilities.contentsWithPatternsFromFolder(BETA, [".gz"])
results = []
for beta_name in beta_contents:
    logging.info("Processing %s", beta_name)
    beta_path = os.path.join(BETA, beta_name)

    beta_data = loadDosageFile(beta_path)[0]
    for snp, value in beta_data.values_by_key.iteritems():
        if not snp in weight_db_logic.genes_for_an_rsid:
            logging.log(7, "rsid %s not found in DB", snp)
            continue
        genes = weight_db_logic.genes_for_an_rsid[snp]
        if not genes:
            logging.info("no gene for %s", snp)
            continue
        gene = genes[0]