def test_gtex_geno_lines_generator(self):
        """Compare the rows yielded by ``gtex_geno_lines`` with a pandas merge.

        The rows produced by ``GTExGenotype.gtex_geno_lines`` are loaded into a
        dataframe and handed, together with a frame built by merging the same
        two fixture files directly with pandas, to ``compare_data_frames``.
        """
        genotype_path = "tests/_td/genotype/gtex_like.txt.gz"
        snp_path = "tests/_td/genotype/gtex_snp.txt.gz"

        # Drain the generator into a plain list of rows.
        rows = list(GTExGenotype.gtex_geno_lines(genotype_path, snp_path))

        # Every header field after the first is used as a per-sample column.
        gtex_ids = GTExGenotype.gtex_geno_header(genotype_path)[1:]
        columns = [
            "rsid",
            "chromosome",
            "position",
            "ref_allele",
            "alt_allele",
            "frequency",
        ] + gtex_ids
        observed = Utilities.to_dataframe(rows, columns, to_numeric="ignore")

        # Reference frame: genotype table joined to the snp table on the
        # variant identifier columns.
        snp_table = pandas.read_table(snp_path)
        expected = pandas.read_table(genotype_path)
        expected = pandas.merge(
            expected, snp_table, left_on="Id", right_on="VariantID")

        compare_data_frames(observed, expected, gtex_ids)
 def torture_dosage(metadata, dosage, gtex_ids):
     """Build a dataframe joining ``metadata`` with its per-rsid dosage rows.

     ``dosage`` is indexed by each value of ``metadata.rsid``; the looked-up
     rows become a dataframe with ``gtex_ids`` as columns, which is merged
     back onto ``metadata`` by ``rsid``. The result is indexed by a
     0-based counter column named ``number``.
     """
     per_snp_rows = [dosage[rsid] for rsid in metadata.rsid]
     dosage_frame = Utilities.to_dataframe(
         per_snp_rows, gtex_ids, to_numeric="ignore")
     # Tag each dosage row with its rsid so it can be joined to the metadata.
     dosage_frame["rsid"] = list(metadata.rsid)

     merged = pandas.merge(metadata, dosage_frame, on="rsid")
     merged["number"] = list(range(len(merged)))
     return merged.set_index("number")
Example #3
0
 def torture_dosage(metadata, dosage, gtex_ids):
     """Join ``metadata`` with the dosage rows looked up for each rsid.

     Args:
         metadata: table exposing an ``rsid`` attribute/column; it is the left
             side of the merge.
         dosage: mapping-like object indexed by rsid.
         gtex_ids: column names given to the dosage values.

     Returns:
         The merged dataframe, indexed by a 0-based counter column named
         ``number``.
     """
     d = [dosage[x] for x in metadata.rsid]
     d = Utilities.to_dataframe(d, gtex_ids, to_numeric="ignore")
     d["rsid"] = list(metadata.rsid)
     d = pandas.merge(metadata, d, on="rsid")
     # Materialise the counter as a list: a bare range() is a lazy object on
     # Python 3, and an explicit list does not depend on how the installed
     # pandas treats range objects in column assignment.
     d["number"] = list(range(len(d)))
     d = d.set_index("number")
     return d
Example #4
0
    def test_gtex_geno_lines_generator(self):
        """Rows from ``gtex_geno_lines`` are checked against a direct pandas merge."""
        geno_file = "tests/_td/genotype/gtex_like.txt.gz"
        snp_file = "tests/_td/genotype/gtex_snp.txt.gz"

        # Collect everything the generator yields.
        generated_rows = [
            row for row in GTExGenotype.gtex_geno_lines(geno_file, snp_file)
        ]

        # Sample ids are all header fields except the first one.
        file_header = GTExGenotype.gtex_geno_header(geno_file)
        gtex_ids = file_header[1:]
        variant_columns = ["rsid", "chromosome", "position", "ref_allele", "alt_allele", "frequency"]
        from_generator = Utilities.to_dataframe(
            generated_rows, variant_columns + gtex_ids, to_numeric="ignore")

        # Same data, assembled by joining the two files on the variant id.
        snp_frame = pandas.read_table(snp_file)
        geno_frame = pandas.read_table(geno_file)
        from_pandas = pandas.merge(
            geno_frame, snp_frame, left_on="Id", right_on="VariantID")

        compare_data_frames(from_generator, from_pandas, gtex_ids)
Example #5
0
def run(args):
    """Compute per-gene prediction covariances and save them chromosome by chromosome.

    Does nothing (beyond logging) when ``args.snp_covariance_output`` already
    exists. Otherwise the models are loaded from ``args.models_folder``
    (filtered by ``args.models_pattern``), genotype data is iterated by
    chromosome, and for every gene in each chromosome the covariance data is
    flattened into a dataframe. Each chromosome's results are passed to
    ``Utilities.save_dataframe`` targeting ``args.snp_covariance_output``.

    Args:
        args: object exposing ``snp_covariance_output``, ``models_folder`` and
            ``models_pattern``; it is also forwarded as-is to
            ``GenotypeUtilities.genotype_by_chromosome_from_args``.

    Returns:
        None.
    """
    if os.path.exists(args.snp_covariance_output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.snp_covariance_output)
        return

    start = timer()

    logging.info("Loading models...")
    model_manager = PredictionModel.load_model_manager(args.models_folder, name_pattern=args.models_pattern)
    all_snps = model_manager.get_rsids()

    logging.info("processing genotype")
    # Track the first write explicitly instead of testing `chromosome == 1`:
    # the output is known not to exist at this point, so the first chromosome
    # processed -- whatever its label or type -- must create the file and
    # carry the header, and every later one must append.
    first_write = True
    for chromosome, metadata, dosage in GenotypeUtilities.genotype_by_chromosome_from_args(args, all_snps):
        logging.log(9, "Processing chromosome %s", str(chromosome))
        progress_message = "%d %% of genes processed so far in chromosome " + str(chromosome)

        context = GenotypeAnalysis.GenotypeAnalysisContext(metadata, dosage, model_manager)
        genes = context.get_genes()
        reporter = Utilities.PercentReporter(9, len(genes))
        reporter.update(0, progress_message)

        # Gather the per-gene frames and concatenate once; concatenating
        # inside the loop re-copies all accumulated rows on every gene.
        gene_frames = []
        for i, gene in enumerate(genes):
            logging.log(6, "%d/%d:%s", i+1, len(genes), gene)
            cov_data = GenotypeAnalysis.get_prediction_covariance(context, gene)
            cov_data = MatrixManager._flatten_matrix_data([cov_data])
            cov_data = Utilities.to_dataframe(cov_data, GenotypeAnalysis.COVARIANCE_COLUMNS, to_numeric="ignore", fill_na="NA")
            gene_frames.append(cov_data)

            reporter.update(i, progress_message)

        reporter.update(len(genes), progress_message)

        # pandas.concat raises on an empty list, so keep the empty-frame
        # result for a chromosome that has no genes.
        covariance_results = pandas.concat(gene_frames) if gene_frames else pandas.DataFrame()

        logging.log(9, "writing chromosome results")
        Utilities.save_dataframe(covariance_results, args.snp_covariance_output,
                                 mode="w" if first_write else "a",
                                 header=first_write)
        first_write = False

    end = timer()
    logging.info("Ran covariance builder in %s seconds", str(end - start))