예제 #1
0
def parse_input_file(db_output_path, pheno_input_path, gencode_input_path, pb8k_callback):
    """Build a weight database from a PB8K pheno file and a gencode file.

    Steps, in order:
      1. Stream the tab-delimited, compressed pheno file through
         ``pb8k_callback``; its ``genes`` attribute is read afterwards.
      2. Parse the gencode file, keeping only records whose ``name`` is a key
         of ``pb8k_callback.genes``.
      3. Pass the kept rows through ``fix_row``.
      4. Connect to ``db_output_path``, set the database up and insert the
         resulting entries.

    :param db_output_path: path handed to ``Utilities.connect``.
    :param pheno_input_path: path of the PB8K pheno file.
    :param gencode_input_path: path of the gencode file.
    :param pb8k_callback: callable fed to the pheno file iterator; must expose
        a ``genes`` mapping (gene name -> mapping of rows) once iteration ends.
    """

    def annotate(gencode, row):
        # Rebuild the row tuple, with gencode.ensemble_version in second position.
        col = Utilities.WDBIF
        return (
            row[col.SNP],
            gencode.ensemble_version,
            row[col.GENE_NAME],
            row[col.REFERENCE_ALLELE],
            row[col.EFFECT_ALLELE],
            row[col.WEIGHT],
            row[col.N_SNP],
            row[col.GENE_R2],
            row[col.GENE_PVALUE],
            row[col.GENE_QVALUE],
        )

    class FixGeneCallback(object):
        """Gencode visitor collecting annotated rows for the known genes."""

        def __init__(self, genes):
            self.genes = genes
            self.selected = {}

        def __call__(self, gencode):
            name = gencode.name
            if name not in self.genes:
                return
            # Only the row values are used; the per-row keys are discarded.
            self.selected[name] = [
                annotate(gencode, row)
                for _, row in self.genes[name].iteritems()
            ]

    logging.info("Opening PB8K pheno file")
    pheno_reader = Utilities.CSVFileIterator(
        pheno_input_path, delimiter="\t", header=TF1.HEADER, compressed=True)
    pheno_reader.iterate(pb8k_callback)

    logging.info("Opening gencode file")
    gene_fixer = FixGeneCallback(pb8k_callback.genes)
    GencodeFile.parse_gencode_file(gencode_input_path, gene_fixer)

    logging.info("Fixing rows")
    entries = fix_row(gene_fixer.selected)

    logging.info("Saving entries")
    db = Utilities.connect(db_output_path)
    Utilities.setup_db(db)
    Utilities.insert_entries(db, entries)
예제 #2
0
    def run(self):
        """Create the database at ``self.args.output_file`` from the input files.

        Does nothing (beyond logging) when the output path already exists, so
        an existing database is never overwritten. Otherwise connects, sets
        the database up and delegates to ``parse_input_file``.
        """
        target = self.args.output_file
        if os.path.exists(target):
            logging.info("DB already there, delete it if you want it done again")
            return

        db = Utilities.connect(target)
        Utilities.setup_db(db)

        options = self.args
        parse_input_file(
            PB8KFileInfo.TF1,
            db,
            options.input_file,
            options.gencode_file,
            options.fdr_filter,
            options.use_variance,
            options.sample_size,
            options.only_best_snp,
        )
예제 #3
0
    def run(self):
        """Create the database at ``self.args.output`` from an input folder.

        Does nothing (beyond logging) when the output path already exists, so
        an existing database is never overwritten. Otherwise connects, sets
        the database up and delegates to ``parse_folder``.
        """
        target = self.args.output
        if os.path.exists(target):
            logging.info("%s already exists, delete it if you want ity done again", target)
            return

        connection = Utilities.connect(target)
        Utilities.setup_db(connection)

        parse_folder(self.args.input_folder, connection, self.args.gencode_file)
예제 #4
0
def process(args):
    """Convert an LCTF file into a weight database.

    Reads from ``args``: ``lctf_path``, ``gtex_snp_path``, ``expect_throw``,
    ``weight_type``, ``functional_unit_type`` and ``output_file``.

    ``weight_type`` and ``functional_unit_type`` are looked up in
    ``WEIGHT_METHODS`` and ``NAME_METHODS`` respectively (presumably
    mappings, so an unknown key would raise -- confirm against their
    definitions).
    """
    logging.info("Loading LCTF %s", args.lctf_path)
    entries = LCFileInfo.load_lctf(args.lctf_path)

    logging.info("Filling snp rsid from %s", args.gtex_snp_path)
    entries = LCFileInfo.fill_snp_id_from_gtex(
        entries, args.gtex_snp_path, expect_throw=args.expect_throw)

    logging.info("Building db entries")
    # Rebinding the same name on purpose: the previous representation of the
    # entries is no longer referenced once the db rows are built.
    weight_method = WEIGHT_METHODS[args.weight_type]
    name_method = NAME_METHODS[args.functional_unit_type]
    entries = LCFileInfo.entries_to_weight_db(entries, name_method, weight_method)

    logging.info("Building db")
    db = Utilities.connect(args.output_file)
    Utilities.setup_db(db)

    logging.info("Inserting entries")
    Utilities.insert_entries(db, entries)

    logging.info("Ran successfully")
예제 #5
0
def parse_input_file(db_output_path, gtex_pheno_input_path, gencode_input_path, gtex_snp_path, gtex_callback):
    """Build a weight database from GTEx eQTL, gencode and GTEx SNP files.

    Steps, in order:
      1. Stream the tab-delimited, compressed GTEx pheno file through
         ``gtex_callback``; its ``genes`` and ``pvalues`` attributes are read
         afterwards.
      2. Parse the gencode file, keeping only records whose ``gene_id`` is a
         key of ``gtex_callback.genes`` and writing ``gencode.gene_name``
         into every kept row.
      3. Stream the GTEx SNP file, keeping only rows whose variant id was
         kept in step 2; rows are re-keyed by rsid and take their alleles
         from the SNP file.
      4. Run ``fix_rows`` on the result, then connect to ``db_output_path``,
         set the database up and insert the entries.

    :param db_output_path: path handed to ``Utilities.connect``.
    :param gtex_pheno_input_path: path of the GTEx pheno (eQTL) file.
    :param gencode_input_path: path of the gencode file.
    :param gtex_snp_path: path of the GTEx SNP file.
    :param gtex_callback: callable fed to the pheno file iterator; must expose
        ``genes`` (gene id -> mapping of variant -> row) and ``pvalues``.
    """

    def with_gene_name(gencode, row):
        # Same row, with the gene name taken from the gencode record.
        col = Utilities.WDBIF
        return (
            row[col.SNP],
            row[col.GENE],
            gencode.gene_name,
            row[col.REFERENCE_ALLELE],
            row[col.EFFECT_ALLELE],
            row[col.WEIGHT],
            row[col.N_SNP],
            row[col.GENE_R2],
            row[col.GENE_PVALUE],
            row[col.GENE_QVALUE],
        )

    def with_snp_info(row, rsid, ref_allele, eff_allele):
        # Same row, with the SNP id and both alleles taken from the SNP file.
        col = Utilities.WDBIF
        return (
            rsid,
            row[col.GENE],
            row[col.GENE_NAME],
            ref_allele,
            eff_allele,
            row[col.WEIGHT],
            row[col.N_SNP],
            row[col.GENE_R2],
            row[col.GENE_PVALUE],
            row[col.GENE_QVALUE],
        )

    class FixGeneCallback(object):
        """Gencode visitor collecting renamed rows for the known gene ids."""

        def __init__(self, genes):
            self.genes = genes
            self.selected = {}

        def __call__(self, gencode):
            gene_id = gencode.gene_id
            if gene_id not in self.genes:
                return
            self.selected[gene_id] = {
                key: with_gene_name(gencode, row)
                for key, row in self.genes[gene_id].iteritems()
            }

    class FixSNPCallback(object):
        """SNP file visitor re-keying the kept rows by rsid."""

        def __init__(self, genes):
            self.genes = genes
            self.selected = {}
            # Reverse index: variant id -> gene.
            # NOTE(review): a variant present under several genes ends up
            # mapped to a single gene (the one iterated last), so its rows
            # for the other genes are not selected -- confirm this is intended.
            self.variant_to_gene = {
                variant: gene
                for gene, rows in self.genes.iteritems()
                for variant, _ in rows.iteritems()
            }

        def __call__(self, i, comps):
            if i == 0:
                # Line 0 is skipped (presumably the header line, since the
                # iterator is created without a `header=` check).
                return
            col = GTExSNPFile.SNPTF
            variant = comps[col.VariantID]
            ref = comps[col.Ref_b37]
            eff = comps[col.Alt]
            rsid = comps[col.RS_ID_dbSNP142_CHG37p13]
            if variant not in self.variant_to_gene:
                return
            gene = self.variant_to_gene[variant]
            row = self.genes[gene][variant]
            per_gene = self.selected.setdefault(gene, {})
            per_gene[rsid] = with_snp_info(row, rsid, ref, eff)

    logging.info("Opening GTEx pheno file %s", os.path.basename(os.path.normpath(gtex_pheno_input_path)))
    pheno_reader = Utilities.CSVFileIterator(
        gtex_pheno_input_path, delimiter="\t", header=GTEXEQTLF.HEADER, compressed=True)
    pheno_reader.iterate(gtex_callback)
    logging.info("%d found at GTEx file", len(gtex_callback.genes))

    logging.info("Opening gencode file")
    gene_fixer = FixGeneCallback(gtex_callback.genes)
    GencodeFile.parse_gencode_file(gencode_input_path, gene_fixer)
    genes = gene_fixer.selected
    logging.info("%d survived after gencode file", len(genes))
    pvalues = gtex_callback.pvalues
    # Drop the callbacks explicitly so the data only they reference can be
    # released before the SNP file is processed.
    del gene_fixer
    del gtex_callback

    logging.info("Opening GTEX Snp file")
    snp_fixer = FixSNPCallback(genes)
    # `header=GTExSNPFile.SNPTF.HEADER` is deliberately not passed: the
    # header in the file is just wrong.
    snp_reader = Utilities.CSVFileIterator(gtex_snp_path, delimiter="\t", compressed=True)
    snp_reader.iterate(snp_fixer)
    genes = snp_fixer.selected
    logging.info("%d survived after snp file", len(genes))
    del snp_fixer

    logging.info("Fixing rows")
    # Return value is not used; presumably fix_rows updates `genes` in place.
    fix_rows(genes, pvalues)

    logging.info("Saving entries")
    db = Utilities.connect(db_output_path)
    Utilities.setup_db(db)
    Utilities.insert_entries(db, genes)