def parse_input_file(db_output_path, pheno_input_path, gencode_input_path, pb8k_callback):
    """Build a weight DB from a PB8K pheno file and a gencode file.

    The pheno file is streamed through ``pb8k_callback``.  The rows it
    collected in ``pb8k_callback.genes`` are then matched, by gencode name,
    against the entries of the gencode file, rebuilt, passed through
    ``fix_row`` and inserted into the database opened at ``db_output_path``.

    Args:
        db_output_path: Location handed to ``Utilities.connect``.
        pheno_input_path: Compressed, tab-delimited PB8K pheno file.
        gencode_input_path: Gencode file given to ``GencodeFile.parse_gencode_file``.
        pb8k_callback: Callable fed to the pheno file iterator; it must expose
            a ``genes`` mapping once the iteration is over.
    """
    logging.info("Opening PB8K pheno file")
    pheno_reader = Utilities.CSVFileIterator(
        pheno_input_path, delimiter="\t", header=TF1.HEADER, compressed=True)
    pheno_reader.iterate(pb8k_callback)

    logging.info("Opening gencode file")

    def _rebuild_row(gencode, row):
        # Same fields as the incoming row, except that the second slot is
        # filled with the gencode entry's ``ensemble_version``.
        fields = Utilities.WDBIF
        return (
            row[fields.SNP],
            gencode.ensemble_version,
            row[fields.GENE_NAME],
            row[fields.REFERENCE_ALLELE],
            row[fields.EFFECT_ALLELE],
            row[fields.WEIGHT],
            row[fields.N_SNP],
            row[fields.GENE_R2],
            row[fields.GENE_PVALUE],
            row[fields.GENE_QVALUE],
        )

    class _GencodeMatcher(object):
        """Keeps the rows of every gene whose name shows up in the gencode file."""

        def __init__(self, genes):
            self.genes = genes
            self.selected = {}

        def __call__(self, gencode):
            name = gencode.name
            if name not in self.genes:
                return
            rebuilt = []
            for _, row in self.genes[name].iteritems():
                rebuilt.append(_rebuild_row(gencode, row))
            self.selected[name] = rebuilt

    matcher = _GencodeMatcher(pb8k_callback.genes)
    GencodeFile.parse_gencode_file(gencode_input_path, matcher)
    matched = matcher.selected

    logging.info("Fixing rows")
    entries = fix_row(matched)

    logging.info("Saving entries")
    db = Utilities.connect(db_output_path)
    Utilities.setup_db(db)
    Utilities.insert_entries(db, entries)
def run(self):
    """Create the DB at ``self.args.output_file`` from the PB8K input.

    Nothing is done (beyond logging a notice) when the output path already
    exists.  Otherwise the DB is connected and set up, and the parsing is
    delegated to ``parse_input_file`` with the options held in ``self.args``.
    """
    options = self.args
    target = options.output_file

    # Never overwrite an existing DB; the user has to remove it explicitly.
    if os.path.exists(target):
        logging.info("DB already there, delete it if you want it done again")
        return

    db = Utilities.connect(target)
    Utilities.setup_db(db)
    parse_input_file(
        PB8KFileInfo.TF1,
        db,
        options.input_file,
        options.gencode_file,
        options.fdr_filter,
        options.use_variance,
        options.sample_size,
        options.only_best_snp,
    )
def run(self):
    """Create the DB at ``self.args.output`` from the files in the input folder.

    When the output path already exists, a notice is logged and the method
    returns without touching anything.  Otherwise the DB is connected and set
    up, and ``parse_folder`` is called with the input folder, the DB
    connection and the gencode file.
    """
    options = self.args
    target = options.output

    # Never overwrite an existing DB; the user has to remove it explicitly.
    if os.path.exists(target):
        logging.info("%s already exists, delete it if you want ity done again", target)
        return

    connection = Utilities.connect(target)
    Utilities.setup_db(connection)
    parse_folder(options.input_folder, connection, options.gencode_file)
def process(args):
    """Turn an LCTF file into a weight DB.

    Steps, in order: load the LCTF entries from ``args.lctf_path``, fill in
    their SNP ids from ``args.gtex_snp_path``, convert them to weight-DB
    entries using the methods selected by ``args.weight_type`` and
    ``args.functional_unit_type``, then create the DB at ``args.output_file``
    and insert the entries.

    Raises:
        KeyError: if ``args.weight_type`` or ``args.functional_unit_type`` is
            not a key of ``WEIGHT_METHODS`` / ``NAME_METHODS`` respectively
            (assuming those are plain dicts -- TODO confirm).
    """
    logging.info("Loading LCTF %s", args.lctf_path)
    entries = LCFileInfo.load_lctf(args.lctf_path)

    logging.info("Filling snp rsid from %s", args.gtex_snp_path)
    entries = LCFileInfo.fill_snp_id_from_gtex(
        entries, args.gtex_snp_path, expect_throw=args.expect_throw)

    logging.info("Building db entries")
    # Original author's note: this will drop the previous stuff -- the
    # converted entries replace the ones loaded so far.
    to_weight = WEIGHT_METHODS[args.weight_type]
    to_name = NAME_METHODS[args.functional_unit_type]
    entries = LCFileInfo.entries_to_weight_db(entries, to_name, to_weight)

    logging.info("Building db")
    db = Utilities.connect(args.output_file)
    Utilities.setup_db(db)

    logging.info("Inserting entries")
    Utilities.insert_entries(db, entries)

    logging.info("Ran successfully")
def parse_input_file(db_output_path, gtex_pheno_input_path, gencode_input_path, gtex_snp_path, gtex_callback):
    """Build a weight DB from GTEx pheno, gencode and GTEx SNP files.

    Three passes are made, each one narrowing the set of surviving genes:

    1. the pheno file is streamed through ``gtex_callback``, which gathers
       rows in ``gtex_callback.genes`` and p-values in ``gtex_callback.pvalues``;
    2. the gencode file keeps only the genes whose id it contains and puts the
       gencode gene name into each of their rows;
    3. the SNP file keeps only the variants it contains and re-keys each row
       by the SNP id found there, also replacing both alleles.

    The result is passed to ``fix_rows`` together with the p-values and then
    inserted into the DB opened at ``db_output_path``.

    Args:
        db_output_path: Location handed to ``Utilities.connect``.
        gtex_pheno_input_path: Compressed, tab-delimited GTEx pheno file.
        gencode_input_path: Gencode file given to ``GencodeFile.parse_gencode_file``.
        gtex_snp_path: Compressed, tab-delimited GTEx SNP file.
        gtex_callback: Callable fed to the pheno file iterator; must expose
            ``genes`` and ``pvalues`` after the iteration.
    """
    pheno_name = os.path.basename(os.path.normpath(gtex_pheno_input_path))
    logging.info("Opening GTEx pheno file %s", pheno_name)
    pheno_reader = Utilities.CSVFileIterator(
        gtex_pheno_input_path, delimiter="\t", header=GTEXEQTLF.HEADER, compressed=True)
    pheno_reader.iterate(gtex_callback)
    logging.info("%d found at GTEx file", len(gtex_callback.genes))

    logging.info("Opening gencode file")

    def _with_gene_name(gencode, row):
        # Copy of ``row`` whose third slot is the gencode gene name.
        fields = Utilities.WDBIF
        return (
            row[fields.SNP],
            row[fields.GENE],
            gencode.gene_name,
            row[fields.REFERENCE_ALLELE],
            row[fields.EFFECT_ALLELE],
            row[fields.WEIGHT],
            row[fields.N_SNP],
            row[fields.GENE_R2],
            row[fields.GENE_PVALUE],
            row[fields.GENE_QVALUE],
        )

    class _GencodeMatcher(object):
        """Keeps, per gene id present in the gencode file, its renamed rows."""

        def __init__(self, genes):
            self.genes = genes
            self.selected = {}

        def __call__(self, gencode):
            gene_id = gencode.gene_id
            if gene_id not in self.genes:
                return
            renamed = {}
            for key, row in self.genes[gene_id].iteritems():
                renamed[key] = _with_gene_name(gencode, row)
            self.selected[gene_id] = renamed

    gencode_matcher = _GencodeMatcher(gtex_callback.genes)
    GencodeFile.parse_gencode_file(gencode_input_path, gencode_matcher)
    genes = gencode_matcher.selected
    logging.info("%d survived after gencode file", len(genes))

    # Hold on to the p-values, then drop the references to both callbacks
    # before the SNP pass starts.
    pvalues = gtex_callback.pvalues
    del gencode_matcher
    del gtex_callback

    logging.info("Opening GTEX Snp file")

    def _with_snp_data(row, rsid, ref_allele, eff_allele):
        # Copy of ``row`` with the SNP id and both alleles replaced.
        fields = Utilities.WDBIF
        return (
            rsid,
            row[fields.GENE],
            row[fields.GENE_NAME],
            ref_allele,
            eff_allele,
            row[fields.WEIGHT],
            row[fields.N_SNP],
            row[fields.GENE_R2],
            row[fields.GENE_PVALUE],
            row[fields.GENE_QVALUE],
        )

    class _SnpMatcher(object):
        """Re-keys the rows of known variants by the SNP id from the SNP file."""

        def __init__(self, genes):
            self.genes = genes
            self.selected = {}
            self.variant_to_gene = {}
            # NOTE(review): a variant listed under more than one gene ends up
            # mapped to whichever gene is iterated last.
            for gene, rows in self.genes.iteritems():
                self.variant_to_gene.update((variant, gene) for variant in rows)

        def __call__(self, i, comps):
            # Row 0 is skipped -- presumably the header line, since the
            # iterator is created without a ``header`` argument.
            if i == 0:
                return

            columns = GTExSNPFile.SNPTF
            variant = comps[columns.VariantID]
            ref = comps[columns.Ref_b37]
            eff = comps[columns.Alt]
            snp = comps[columns.RS_ID_dbSNP142_CHG37p13]

            if variant not in self.variant_to_gene:
                return

            gene = self.variant_to_gene[variant]
            row = self.genes[gene][variant]
            bucket = self.selected.setdefault(gene, {})
            bucket[snp] = _with_snp_data(row, snp, ref, eff)

    snp_matcher = _SnpMatcher(genes)
    # No ``header=`` check here: per the original author, the header in the
    # file is just wrong.
    snp_reader = Utilities.CSVFileIterator(gtex_snp_path, delimiter="\t", compressed=True)
    snp_reader.iterate(snp_matcher)
    genes = snp_matcher.selected
    logging.info("%d survived after snp file", len(genes))
    del snp_matcher

    logging.info("Fixing rows")
    fix_rows(genes, pvalues)

    logging.info("Saving entries")
    db = Utilities.connect(db_output_path)
    Utilities.setup_db(db)
    Utilities.insert_entries(db, genes)