def test_gtex_geno_lines_generator(self):
    """Check the rows yielded by ``GTExGenotype.gtex_geno_lines``.

    The yielded rows are loaded into a data frame and handed to
    ``compare_data_frames`` together with a pandas merge of the genotype
    table and the snp annotation table (``Id`` against ``VariantID``).
    """
    genotype_path = "tests/_td/genotype/gtex_like.txt.gz"
    snp_path = "tests/_td/genotype/gtex_snp.txt.gz"

    # Drain the generator in one go; the row index is not needed.
    rows = list(GTExGenotype.gtex_geno_lines(genotype_path, snp_path))

    # Everything after the first header entry is used as the id columns.
    sample_ids = GTExGenotype.gtex_geno_header(genotype_path)[1:]
    columns = [
        "rsid",
        "chromosome",
        "position",
        "ref_allele",
        "alt_allele",
        "frequency",
    ] + sample_ids
    observed = Utilities.to_dataframe(rows, columns, to_numeric="ignore")

    # Reference frame built directly from the fixture files with pandas.
    snp_table = pandas.read_table(snp_path)
    expected = pandas.read_table(genotype_path)
    expected = pandas.merge(
        expected, snp_table, left_on="Id", right_on="VariantID"
    )

    compare_data_frames(observed, expected, sample_ids)
def torture_dosage(metadata, dosage, gtex_ids):
    """Join ``metadata`` with the dosage entries of its rsids.

    Args:
        metadata: table exposing an ``rsid`` column; it is the left side of
            the merge.
        dosage: object indexed by rsid (``dosage[rsid]``) giving one row of
            values per variant.
        gtex_ids: column names passed to ``Utilities.to_dataframe`` for the
            dosage values.

    Returns:
        The merged frame, indexed by a 0-based counter named ``number``.
    """
    rsids = list(metadata.rsid)
    dosage_rows = [dosage[rsid] for rsid in rsids]

    dosage_frame = Utilities.to_dataframe(
        dosage_rows, gtex_ids, to_numeric="ignore"
    )
    dosage_frame["rsid"] = rsids

    merged = pandas.merge(metadata, dosage_frame, on="rsid")
    # Replace whatever index the merge produced with a plain running counter.
    merged["number"] = list(range(len(merged)))
    return merged.set_index("number")
def torture_dosage(metadata, dosage, gtex_ids):
    """Join ``metadata`` with the dosage entries of its rsids.

    Args:
        metadata: table exposing an ``rsid`` column; it is the left side of
            the merge.
        dosage: object indexed by rsid (``dosage[rsid]``) giving one row of
            values per variant.
        gtex_ids: column names passed to ``Utilities.to_dataframe`` for the
            dosage values.

    Returns:
        The merged frame, indexed by a 0-based counter named ``number``.
    """
    rsids = list(metadata.rsid)
    dosage_rows = [dosage[rsid] for rsid in rsids]

    dosage_frame = Utilities.to_dataframe(
        dosage_rows, gtex_ids, to_numeric="ignore"
    )
    dosage_frame["rsid"] = rsids

    merged = pandas.merge(metadata, dosage_frame, on="rsid")
    # The counter is assigned as a ``range`` object and then becomes the index.
    merged["number"] = range(len(merged))
    return merged.set_index("number")
def test_gtex_geno_lines_generator(self):
    """Compare ``GTExGenotype.gtex_geno_lines`` output with a pandas merge.

    Rows produced by the generator are turned into a data frame, and
    ``compare_data_frames`` is called with it, the merge of the genotype and
    snp fixture tables, and the ids taken from the genotype header.
    """
    geno_file = "tests/_td/genotype/gtex_like.txt.gz"
    snp_file = "tests/_td/genotype/gtex_snp.txt.gz"

    generated = []
    generated.extend(GTExGenotype.gtex_geno_lines(geno_file, snp_file))

    geno_header = GTExGenotype.gtex_geno_header(geno_file)
    # The leading header entry is skipped; the remainder are the id columns.
    gtex_ids = geno_header[1:]
    column_names = ["rsid", "chromosome", "position", "ref_allele", "alt_allele", "frequency"]
    column_names = column_names + gtex_ids
    from_generator = Utilities.to_dataframe(generated, column_names, to_numeric="ignore")

    snp_frame = pandas.read_table(snp_file)
    geno_frame = pandas.read_table(geno_file)
    from_pandas = pandas.merge(geno_frame, snp_frame, left_on="Id", right_on="VariantID")

    compare_data_frames(from_generator, from_pandas, gtex_ids)
def run(args):
    """Compute per-gene covariance data and write it to a single file.

    Reads from ``args``:
        snp_covariance_output: destination path. If it already exists the
            function logs a message and returns without doing any work.
        models_folder, models_pattern: passed to
            ``PredictionModel.load_model_manager``.
    ``args`` is also forwarded as-is to
    ``GenotypeUtilities.genotype_by_chromosome_from_args``.

    Results are written once per chromosome. The first write of the run
    creates the file and emits the header; later writes append without one.

    Returns:
        None.
    """
    if os.path.exists(args.snp_covariance_output):
        logging.info("%s already exists, you have to move it or delete it if you want it done again", args.snp_covariance_output)
        return

    start = timer()

    logging.info("Loading models...")
    model_manager = PredictionModel.load_model_manager(args.models_folder, name_pattern=args.models_pattern)
    all_snps = model_manager.get_rsids()

    logging.info("processing genotype")
    # Keyed on "first write" rather than on the chromosome label: if the first
    # chromosome yielded is not the integer 1, comparing against 1 would skip
    # the header and could later truncate results that were already written.
    first_write = True
    for chromosome, metadata, dosage in GenotypeUtilities.genotype_by_chromosome_from_args(args, all_snps):
        logging.log(9, "Processing chromosome %s", str(chromosome))
        progress_message = "%d %% of genes processed so far in chromosome " + str(chromosome)

        context = GenotypeAnalysis.GenotypeAnalysisContext(metadata, dosage, model_manager)
        genes = context.get_genes()
        reporter = Utilities.PercentReporter(9, len(genes))
        reporter.update(0, progress_message)

        # Collect the per-gene frames and concatenate once per chromosome;
        # concatenating inside the loop re-copies all accumulated rows on
        # every gene.
        gene_frames = []
        for i, gene in enumerate(genes):
            logging.log(6, "%d/%d:%s", i + 1, len(genes), gene)
            cov_data = GenotypeAnalysis.get_prediction_covariance(context, gene)
            cov_data = MatrixManager._flatten_matrix_data([cov_data])
            cov_data = Utilities.to_dataframe(cov_data, GenotypeAnalysis.COVARIANCE_COLUMNS, to_numeric="ignore", fill_na="NA")
            gene_frames.append(cov_data)
            reporter.update(i, progress_message)
        reporter.update(len(genes), progress_message)

        # pandas.concat raises on an empty list, so fall back to an empty
        # frame when the chromosome has no genes.
        covariance_results = pandas.concat(gene_frames) if gene_frames else pandas.DataFrame()

        logging.log(9, "writing chromosome results")
        Utilities.save_dataframe(
            covariance_results,
            args.snp_covariance_output,
            mode="w" if first_write else "a",
            header=first_write,
        )
        first_write = False

    end = timer()
    logging.info("Ran covariance builder in %s seconds", str(end - start))