Esempio n. 1
0
def run(args):
    """Count gene types and transcript types in a gencode file and save them.

    Reads three attributes from ``args``: ``results_file``, ``input_table``
    and ``gencode_file``.

    If ``args.results_file`` already exists, nothing is recomputed and the
    process is terminated with exit status 0 (``SystemExit`` is raised).
    Otherwise the parent folder of the results file is created when missing
    and the tallies are handed to ``save_results`` together with the parsed
    input table.
    """
    # Local import: ``exit`` is injected by the ``site`` module for interactive
    # use and is missing under ``python -S`` / frozen interpreters, so the
    # portable ``sys.exit`` is used instead.
    import sys

    class Callback(object):
        """Callable passed to the gencode parser; tallies types per entry."""

        def __init__(self):
            # Both tables map a type name to ``{"count": <int>}``.
            self.gene_type = {}
            self.transcript_type = {}

        def __call__(self, gencode):
            self.gene_type.setdefault(gencode.gene_type, {"count": 0})["count"] += 1
            self.transcript_type.setdefault(gencode.transcript_type, {"count": 0})["count"] += 1

    if os.path.exists(args.results_file):
        logging.info("Output file already exists.")
        sys.exit(0)

    input_lines = read_input_table(args.input_table)

    callback = Callback()
    GencodeFile.parse_gencode_file(args.gencode_file, callback, only_genes=False)

    # An empty folder component means the results file lives in the cwd.
    folder = os.path.split(args.results_file)[0]
    if folder and not os.path.exists(folder):
        os.makedirs(folder)

    save_results(args.results_file, callback, input_lines)
Esempio n. 2
0
def parse_input_file(db_output_path, pheno_input_path, gencode_input_path, pb8k_callback):
    """Build a weight database from a PB8K pheno file and a gencode file.

    Steps, in order:
      1. Iterate the tab-delimited, compressed pheno file through
         ``pb8k_callback``; afterwards ``pb8k_callback.genes`` is read as a
         mapping ``gene name -> {key: row}``.
      2. Parse the gencode file and keep only genes whose ``name`` appears in
         that mapping, rewriting each row so its second field is the gencode
         ``ensemble_version``.
      3. Pass the result through ``fix_row`` and insert it into the database
         opened at ``db_output_path``.
    """
    logging.info("Opening PB8K pheno file")
    file_iterator = Utilities.CSVFileIterator(pheno_input_path, delimiter="\t", header=TF1.HEADER, compressed=True)
    file_iterator.iterate(pb8k_callback)

    logging.info("Opening gencode file")
    def fixed_row(gencode, row):
        # Same row, with the gene identifier replaced by the gencode one.
        F = Utilities.WDBIF
        return (row[F.SNP], gencode.ensemble_version, row[F.GENE_NAME], row[F.REFERENCE_ALLELE], row[F.EFFECT_ALLELE], row[F.WEIGHT], row[F.N_SNP], row[F.GENE_R2], row[F.GENE_PVALUE], row[F.GENE_QVALUE])

    class FixGeneCallback(object):
        """Collects rewritten rows for every gencode entry present in ``genes``."""

        def __init__(self, genes):
            self.genes = genes
            self.selected = {}

        def __call__(self, gencode):
            if gencode.name in self.genes:
                rows = self.genes[gencode.name]
                # Only the row values are needed; ``values()`` works on both
                # Python 2 and Python 3 (``iteritems`` exists only on Python 2).
                self.selected[gencode.name] = [fixed_row(gencode, row) for row in rows.values()]

    gencode_callback = FixGeneCallback(pb8k_callback.genes)
    GencodeFile.parse_gencode_file(gencode_input_path, gencode_callback)
    genes = gencode_callback.selected

    logging.info("Fixing rows")
    genes = fix_row(genes)

    logging.info("Saving entries")
    connection = Utilities.connect(db_output_path)
    Utilities.setup_db(connection)
    Utilities.insert_entries(connection, genes)
Esempio n. 3
0
def parse_input_file(TF, connection, input_file, gencode_file, fdr_filter=None, use_variance=None, sample_size=None, only_best_snp=None):
    """Load a gzipped pheno file, annotate it with gencode ids and store it.

    Parameters:
        TF: object whose ``FDR``, ``SNPName`` and ``HUGO`` attributes index
            the whitespace-split columns of each line.
        connection: handed to ``Utilities.insert_entries``.
        input_file: path of the gzip-compressed pheno file; its first line is
            skipped.
        gencode_file: path given to ``GencodeFile.parse_gencode_file``.
        fdr_filter: when truthy, lines whose FDR column exceeds it are dropped.
        use_variance: when truthy, passed to ``VarianceFile.load_variance``;
            rows whose snp has no variance are dropped and the fourth field of
            the remaining rows is multiplied by ``sqrt(variance / sample_size)``.
        sample_size: divisor used in the variance rescaling above.
        only_best_snp: forwarded untouched to ``process_row``.
    """
    genes = {}
    logging.info("Opening pheno phile")
    with gzip.open(input_file) as handle:
        for index, raw_line in enumerate(handle):
            if index == 0:
                continue

            comps = raw_line.strip().split()

            if fdr_filter:
                fdr = comps[TF.FDR]
                if float(fdr) > fdr_filter:
                    snp = comps[TF.SNPName]
                    logging.log(9,"Snp %s doesn't pass fdr filter: %s", snp, fdr)
                    continue

            # A gene column may list several comma-separated names; splitting
            # a name without commas yields just that name, so a single loop
            # covers both cases.
            for gene_name in comps[TF.HUGO].split(","):
                row = row_from_comps(gene_name, comps, TF)
                process_row(gene_name, row, genes, only_best_snp)

    logging.info("Opening gencode file")

    class GenCodeCallback(object):
        """Keeps rows of genes known to gencode, inserting the gencode id."""

        def __init__(self, genes):
            self.genes = genes
            self.selected = {}

        def __call__(self, gencode):
            if gencode.name not in self.genes:
                return
            annotated = []
            for row in self.genes[gencode.name]:
                annotated.append((row[0], gencode.ensemble_version, row[1], row[2], row[3], row[4]))
            self.selected[gencode.name] = annotated

    callback = GenCodeCallback(genes)
    GencodeFile.parse_gencode_file(gencode_file, callback)
    genes = callback.selected

    if use_variance:
        logging.info("Opening variance file")
        variances = VarianceFile.load_variance(use_variance)

        def rescaled(entry):
            # Scale the fourth field by the standard deviation of the snp.
            std = math.sqrt(variances[entry[0]] / sample_size)
            return [entry[0], entry[1], entry[2], str(float(entry[3]) * std), entry[4], entry[5]]

        gene_keys = genes.keys()
        for gene_key in gene_keys:
            genes[gene_key] = [rescaled(entry) for entry in genes[gene_key] if entry[0] in variances]

    Utilities.insert_entries(connection, genes)
Esempio n. 4
0
    def run(self):
        """Build predicted gene expression from GTEx data and save it.

        Does nothing (returns ``None`` after logging) when ``self.args.output``
        already exists. Otherwise loads the weight database, builds the snp
        dictionary and the gene expression, optionally re-keys the expression
        by ensemble id using ``self.args.gencode_file``, and saves the result
        to ``self.args.output``.
        """
        if os.path.exists(self.args.output):
            logging.info("%s already exists. Delete it if you want it done again", self.args.output)
            return

        logging.info("Loading %s", self.args.weight_db)
        weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(self.args.weight_db)
        logging.info("Loaded %s genes", len(weight_db_logic.gene_data_for_gene))

        logging.info("Building snp dict from %s", self.args.gtex_snp)
        snp_dict = GTExSNPFile.build_snp_dict(self.args.gtex_snp, weight_db_logic)
        logging.info("Got %d snps in dictionary", len(snp_dict))

        logging.info("Building gene expression")
        gene_expression, individuals = GTExGenoFile.build_gene_expression(self.args.gtex_geno, weight_db_logic, snp_dict)
        logging.info("Loaded %d gene expression", len(gene_expression))

        if self.args.gencode_file:
            logging.info("Translating gene names to ensemble id")
            ensemble_to_name, name_to_ensemble = GencodeFile.ensemble_to_name_relationships(self.args.gencode_file)
            logging.info("Loaded %d (%d) names", len(ensemble_to_name), len(name_to_ensemble))
            # Iterate over a snapshot of the keys: entries are removed and
            # added inside the loop, and on Python 3 ``keys()`` is a live view
            # that raises RuntimeError if the dict changes size mid-iteration.
            for k in list(gene_expression.keys()):
                if k in ensemble_to_name:
                    # Already keyed by ensemble id; keep as is.
                    continue
                expression = gene_expression[k]
                del gene_expression[k]
                if k in name_to_ensemble:
                    # Keyed by gene name: re-insert under its ensemble id.
                    gene_expression[name_to_ensemble[k]] = expression
                # Keys unknown to gencode are simply dropped.

        logging.info("Saving gene expression for %d genes", len(gene_expression))
        save_expression(self.args.output, gene_expression, individuals)
Esempio n. 5
0
def parse_folder(folder, db, gencode_file):
    """Turn a folder of per-gene TWAS weight files into database entries.

    Every name returned by ``os.listdir(folder)`` is treated as a gene name.
    Names that the gencode file does not know are logged (level 9) and
    skipped. For each remaining gene, one row per snp of its ``.wgt.map``
    file is built, and all rows are inserted through
    ``Utilities.insert_entries(db, ...)``.
    """
    contents = os.listdir(folder)

    logging.info("Processing gencode file")

    class GencodeCallback(object):
        """Records the ensemble version of each gencode gene found in the folder."""

        def __init__(self, contents):
            self.contents = dict.fromkeys(contents, True)
            self.genes = {}

        def __call__(self, gencode):
            if gencode.name not in self.contents:
                return
            self.genes[gencode.name] = gencode.ensemble_version

    callback = GencodeCallback(contents)
    GencodeFile.parse_gencode_file(gencode_file, callback)
    gene_names = callback.genes

    logging.info("processing folder")
    genes = {}
    for entry in contents:
        if entry not in gene_names:
            logging.log(9, "Gene %s not in gencode", entry)
            continue

        sub_path = TWASFormat.build_subpaths(folder, entry)
        snps = TWASFormat.load_map(sub_path + ".wgt.map")
        weights = TWASFormat.build_weights(sub_path)
        gene_id = gene_names[entry]

        def build_row(position, snp):
            # The weight at the same position as the snp in the map file.
            weight = weights[position]
            return (snp[TWASFormat.MTF.snp], gene_id, entry, weight, snp[TWASFormat.MTF.a1], snp[TWASFormat.MTF.a2])

        genes[entry] = [build_row(position, snp) for position, snp in enumerate(snps)]

    Utilities.insert_entries(db, genes)
Esempio n. 6
0
def parse_input_file(db_output_path, gtex_pheno_input_path, gencode_input_path, gtex_snp_path, gtex_callback):
    """Build a weight database from GTEx eQTL, gencode and GTEx snp files.

    Steps, in order:
      1. Iterate the GTEx pheno file through ``gtex_callback``; afterwards
         ``gtex_callback.genes`` (``gene id -> {variant: row}``) and
         ``gtex_callback.pvalues`` are read.
      2. Parse the gencode file, keeping only genes whose ``gene_id`` is
         present and filling in the gencode ``gene_name`` on every row.
      3. Iterate the GTEx snp file, keeping only variants present in the
         surviving rows; rows are re-keyed by the rsid column and get the
         reference/effect alleles from the snp file.
      4. Run ``fix_rows`` on the result and insert it into the database
         opened at ``db_output_path``.
    """
    logging.info("Opening GTEx pheno file %s", os.path.basename(os.path.normpath(gtex_pheno_input_path)))
    file_iterator = Utilities.CSVFileIterator(gtex_pheno_input_path, delimiter="\t", header=GTEXEQTLF.HEADER, compressed=True)
    file_iterator.iterate(gtex_callback)
    logging.info("%d found at GTEx file", len(gtex_callback.genes))

    logging.info("Opening gencode file")
    def gencode_fixed_row(gencode, row):
        # Same row, with the gene name taken from gencode.
        F = Utilities.WDBIF
        return (row[F.SNP], row[F.GENE], gencode.gene_name, row[F.REFERENCE_ALLELE], row[F.EFFECT_ALLELE], row[F.WEIGHT], row[F.N_SNP], row[F.GENE_R2], row[F.GENE_PVALUE], row[F.GENE_QVALUE])

    class FixGeneCallback(object):
        """Keeps the rows of every gene id that gencode knows about."""

        def __init__(self, genes):
            self.genes = genes
            self.selected = {}

        def __call__(self, gencode):
            if gencode.gene_id in self.genes:
                rows = self.genes[gencode.gene_id]
                # ``items()`` works on Python 2 and 3; ``iteritems`` is Python 2 only.
                self.selected[gencode.gene_id] = {k:gencode_fixed_row(gencode, row) for k,row in rows.items()}

    gencode_callback = FixGeneCallback(gtex_callback.genes)
    GencodeFile.parse_gencode_file(gencode_input_path, gencode_callback)
    genes = gencode_callback.selected
    logging.info("%d survived after gencode file", len(genes))
    pvalues = gtex_callback.pvalues
    # Drop the references to the callbacks now that their data was extracted.
    del gencode_callback
    del gtex_callback

    logging.info("Opening GTEX Snp file")
    def snp_fixed_row(row, rsid, ref_allele, eff_allele):
        # Same row, with snp id and alleles taken from the snp file.
        F = Utilities.WDBIF
        return (rsid, row[F.GENE], row[F.GENE_NAME], ref_allele, eff_allele, row[F.WEIGHT], row[F.N_SNP], row[F.GENE_R2], row[F.GENE_PVALUE], row[F.GENE_QVALUE])

    class FixSNPCallback(object):
        """Re-keys the selected rows by rsid while the snp file is iterated."""

        def __init__(self, genes):
            self.genes = genes
            self.selected = {}
            # Inverse index ``variant -> [gene, ...]``. A list is required
            # because the same variant may be listed under several genes; a
            # single-valued map would keep only the last gene and silently
            # drop the variant from the others.
            self.variant_to_genes = {}
            for gene, rows in self.genes.items():
                for variant in rows:
                    self.variant_to_genes.setdefault(variant, []).append(gene)

        def __call__(self, i, comps):
            # Line 0 is the header of the snp file.
            if i == 0:
                return
            F = GTExSNPFile.SNPTF
            variant = comps[F.VariantID]
            ref = comps[F.Ref_b37]
            eff = comps[F.Alt]
            snp = comps[F.RS_ID_dbSNP142_CHG37p13]
            for gene in self.variant_to_genes.get(variant, ()):
                row = self.genes[gene][variant]
                self.selected.setdefault(gene, {})[snp] = snp_fixed_row(row, snp, ref, eff)

    snp_callback = FixSNPCallback(genes)
    # No ``header=`` check here: the header in the snp file is just wrong.
    snp_iterator = Utilities.CSVFileIterator(gtex_snp_path, delimiter="\t", compressed=True)
    snp_iterator.iterate(snp_callback)
    genes = snp_callback.selected
    logging.info("%d survived after snp file", len(genes))
    del snp_callback

    logging.info("Fixing rows")
    fix_rows(genes, pvalues)

    logging.info("Saving entries")
    connection = Utilities.connect(db_output_path)
    Utilities.setup_db(connection)
    Utilities.insert_entries(connection, genes)