Beispiel #1
0
def get_anno_files():
    """Map each annotation name to the full path of its data file.

    The directory comes from the ``annotation_dir`` entry of the gemini
    configuration; every file name listed below is joined onto it.

    Returns:
        dict: annotation name -> path inside the annotation directory.
    """
    anno_dirname = read_gemini_config()["annotation_dir"]
    # annotation name -> file name expected inside the annotation directory
    file_names = {
        'pfam_domain': 'hg19.pfam.ucscgenes.bed.gz',
        'cytoband': 'hg19.cytoband.bed.gz',
        'dbsnp': 'dbsnp.137.vcf.gz',
        'clinvar': 'clinvar_20130118.vcf.gz',
        'gwas': 'hg19.gwas.bed.gz',
        'rmsk': 'hg19.rmsk.bed.gz',
        'segdup': 'hg19.segdup.bed.gz',
        'conserved': '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz',
        'cpg_island': 'hg19.CpG.bed.gz',
        'dgv': 'hg19.dgv.bed.gz',
        'esp': 'ESP6500SI.all.snps_indels.vcf.gz',
        '1000g': 'ALL.wgs.integrated_phase1_v3.20101123.snps_indels_sv.sites.2012Oct12.vcf.gz',
        'recomb': 'genetic_map_HapMapII_GRCh37.gz',
        'gms': 'GRCh37-gms-mappability.vcf.gz',
        'grc': 'GRC_patch_regions.bed.gz',
        'cse': "cse-hiseq-8_4-2013-02-20.bed.gz",
        'encode_tfbs': 'wgEncodeRegTfbsClusteredV2.cell_count.20130213.bed.gz',
        'encode_dnase1': 'stam.125cells.dnaseI.hg19.bed.gz',
        'encode_consensus_segs': 'encode.6celltypes.consensus.bedg.gz',
        'gerp_bp': 'hg19.gerp.bw',
        'gerp_elements': 'hg19.gerp.elements.bed.gz',
    }
    return dict((name, os.path.join(anno_dirname, file_name))
                for name, file_name in file_names.items())
Beispiel #2
0
def install_annotation_files(anno_root_dir):
    """Download the annotation files and record the install in the config.

    The data directory is ``<anno_root_dir>/gemini/data``; when the supplied
    root already ends in ``gemini`` only ``data`` is appended.  The directory
    is created if missing, every entry of ``anno_files`` is downloaded into
    it, and finally the directory and ``anno_versions`` are written to the
    gemini configuration.

    Args:
        anno_root_dir: root directory supplied by the user.
    """
    # avoid doubling the "gemini" component when the user already gave it
    if anno_root_dir.endswith(("gemini", "gemini/")):
        sub_dirs = ("data",)
    else:
        sub_dirs = ("gemini", "data")
    anno_dir = os.path.join(anno_root_dir, *sub_dirs)
    if not os.path.exists(anno_dir):
        os.makedirs(anno_dir)

    cur_config = read_gemini_config(allow_missing=True)

    url_template = "http://people.virginia.edu/~arq5x/files/gemini/annotations/{fname}"
    for orig in anno_files:
        # every .gz annotation is fetched together with its companion .tbi
        dls = [orig]
        if orig.endswith(".gz"):
            dls.append("%s.tbi" % orig)
        for dl in dls:
            # both version lookups are keyed on the annotation itself (not
            # the .tbi name) and default to 1 when no version is recorded
            _download_to_dir(url_template.format(fname=dl), anno_dir,
                             anno_versions.get(orig, 1),
                             cur_config.get("versions", {}).get(orig, 1))

    cur_config["annotation_dir"] = anno_dir
    cur_config["versions"] = anno_versions
    write_gemini_config(cur_config)
Beispiel #3
0
    def _get_gene_detailed(self):
        """
        Load the detailed gene table and insert it into the database.

        Reads ``detailed_gene_table_v75`` from the configured annotation
        directory, skips lines whose first column starts with "Chromosome"
        (the header), turns every remaining tab-separated line into a row
        prefixed with a running 1-based identifier, and hands all rows to
        ``database.insert_gene_detailed`` using the cursor ``self.c``.
        """
        config = read_gemini_config(args=self.args)
        table_path = os.path.join(config["annotation_dir"],
                                  'detailed_gene_table_v75')

        table_contents = []
        # unique identifier for each entry
        i = 0
        # the context manager guarantees the file is closed, even when a
        # line fails to parse
        with open(table_path, 'r') as table_file:
            for line in table_file:
                field = line.strip().split("\t")
                if field[0].startswith("Chromosome"):
                    continue
                i += 1
                table = gene_table.gene_detailed(field)
                table_contents.append(
                    [str(i), table.chrom, table.gene, table.is_hgnc,
                     table.ensembl_gene_id, table.ensembl_trans_id,
                     table.biotype, table.trans_status, table.ccds_id,
                     table.hgnc_id, table.entrez, table.cds_length,
                     table.protein_length,
                     table.transcript_start, table.transcript_end,
                     table.strand, table.synonym, table.rvis,
                     table.mam_phenotype])
        database.insert_gene_detailed(self.c, table_contents)
Beispiel #4
0
    def _get_gene_summary(self):
        """
        Load the gene summary table and insert it into the database.

        Reads ``summary_gene_table_v75`` from the configured annotation
        directory, skips lines whose first column starts with "Chromosome"
        (the header), turns every remaining tab-separated line into a row
        prefixed with a running 1-based identifier, and hands all rows to
        ``database.insert_gene_summary`` using the cursor ``self.c``.
        """
        config = read_gemini_config(args=self.args)
        summary_path = os.path.join(config["annotation_dir"],
                                    'summary_gene_table_v75')

        contents = []
        # unique identifier for each entry
        i = 0
        # the context manager guarantees the file is closed, even when a
        # line fails to parse
        with open(summary_path, 'r') as summary_file:
            for line in summary_file:
                col = line.strip().split("\t")
                if col[0].startswith("Chromosome"):
                    continue
                i += 1
                table = gene_table.gene_summary(col)
                # default cosmic census to False (stored as 0)
                cosmic_census = 0
                contents.append(
                    [str(i), table.chrom, table.gene, table.is_hgnc,
                     table.ensembl_gene_id, table.hgnc_id,
                     table.transcript_min_start,
                     table.transcript_max_end, table.strand,
                     table.synonym, table.rvis, table.mam_phenotype,
                     cosmic_census])
        database.insert_gene_summary(self.c, contents)
Beispiel #5
0
def install_annotation_files(anno_root_dir, dl_files=False, extra=None):
    """Record the annotation directory in the config and optionally download.

    The data directory is derived from ``anno_root_dir``: it is used as is
    when it already ends in ``gemini/data`` or ``gemini_data``, gets ``data``
    appended when it ends in ``gemini``, and ``gemini/data`` otherwise.  Its
    absolute path and ``anno_versions`` are always written to the gemini
    configuration.

    Args:
        anno_root_dir: root directory supplied by the user.
        dl_files: when true, create the directory and download the files.
        extra: optional iterable of keys into ``extra_anno_files`` whose
            entries are downloaded in addition to ``anno_files``.
    """
    if anno_root_dir.endswith(("gemini/data", "gemini/data/", "gemini_data")):
        anno_dir = anno_root_dir
    else:
        if anno_root_dir.endswith(("gemini", "gemini/")):
            sub_dirs = ("data",)
        else:
            sub_dirs = ("gemini", "data")
        anno_dir = os.path.join(anno_root_dir, *sub_dirs)

    # the configuration is updated even when nothing is downloaded
    cur_config = read_gemini_config(allow_missing=True)
    cur_config["annotation_dir"] = os.path.abspath(anno_dir)
    cur_config["versions"] = anno_versions
    write_gemini_config(cur_config)

    if not dl_files:
        return

    if not os.path.exists(anno_dir):
        os.makedirs(anno_dir)
    # the path may exist but be a regular file
    if not os.path.isdir(anno_dir):
        sys.exit(anno_dir + " is not a valid directory.")
    _check_dependencies()
    # work on a copy so the module-level list is not extended
    to_dl = anno_files[:]
    if extra:
        to_dl += [extra_anno_files[x] for x in extra]
    _download_anno_files("https://s3.amazonaws.com/gemini-annotations",
                         to_dl, anno_dir, cur_config)
Beispiel #6
0
def get_gemini_files(data):
    """Enumerate available gemini data files in a standard installation.

    Args:
        data: not used by this function.

    Returns:
        dict: empty when the ``gemini`` package cannot be imported;
        otherwise ``"base"`` holds the configured annotation directory and
        ``"files"`` the values of ``annotations.get_anno_files()``.
    """
    try:
        from gemini import annotations, config
    except ImportError:
        # gemini is optional: report nothing rather than fail
        return {}
    base_dir = config.read_gemini_config()["annotation_dir"]
    anno_paths = annotations.get_anno_files().values()
    return {"base": base_dir, "files": anno_paths}
Beispiel #7
0
def get_gemini_files(data):
    """List the gemini annotation files of a standard installation.

    Args:
        data: not used by this function.

    Returns:
        dict: ``{}`` if ``gemini`` is not importable, else a dict with the
        annotation directory under ``"base"`` and the values of
        ``annotations.get_anno_files()`` under ``"files"``.
    """
    try:
        from gemini import annotations, config
    except ImportError:
        # gemini is an optional dependency
        gemini_files = {}
    else:
        gemini_files = dict(
            base=config.read_gemini_config()["annotation_dir"],
            files=annotations.get_anno_files().values())
    return gemini_files
Beispiel #8
0
def get_anno_files():
    """Return the full path of every annotation file, keyed by name.

    Each file name is joined onto the ``annotation_dir`` entry of the
    gemini configuration.

    Returns:
        dict: annotation name -> path inside the annotation directory.
    """
    anno_dirname = read_gemini_config()["annotation_dir"]

    def in_anno_dir(file_name):
        # small local helper so the table below stays one line per entry
        return os.path.join(anno_dirname, file_name)

    return {
        'pfam_domain': in_anno_dir('hg19.pfam.ucscgenes.bed.gz'),
        'cytoband': in_anno_dir('hg19.cytoband.bed.gz'),
        'dbsnp': in_anno_dir('dbsnp.137.vcf.gz'),
        'clinvar': in_anno_dir('clinvar_20130118.vcf.gz'),
        'gwas': in_anno_dir('hg19.gwas.bed.gz'),
        'rmsk': in_anno_dir('hg19.rmsk.bed.gz'),
        'segdup': in_anno_dir('hg19.segdup.bed.gz'),
        'conserved': in_anno_dir(
            '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz'),
        'cpg_island': in_anno_dir('hg19.CpG.bed.gz'),
        'dgv': in_anno_dir('hg19.dgv.bed.gz'),
        'esp': in_anno_dir('ESP6500SI.all.snps_indels.vcf.gz'),
        '1000g': in_anno_dir(
            'ALL.wgs.integrated_phase1_v3.20101123.snps_indels_sv.sites.2012Oct12.vcf.gz'),
        'recomb': in_anno_dir('genetic_map_HapMapII_GRCh37.gz'),
        'gms': in_anno_dir('GRCh37-gms-mappability.vcf.gz'),
        'grc': in_anno_dir('GRC_patch_regions.bed.gz'),
        'cse': in_anno_dir("cse-hiseq-8_4-2013-02-20.bed.gz"),
        'encode_tfbs': in_anno_dir(
            'wgEncodeRegTfbsClusteredV2.cell_count.20130213.bed.gz'),
        'encode_dnase1': in_anno_dir('stam.125cells.dnaseI.hg19.bed.gz'),
        'encode_consensus_segs': in_anno_dir(
            'encode.6celltypes.consensus.bedg.gz'),
        'gerp_bp': in_anno_dir('hg19.gerp.bw'),
        'gerp_elements': in_anno_dir('hg19.gerp.elements.bed.gz'),
    }
Beispiel #9
0
def get_pathways(args):
    """Build gene/transcript -> pathway lookups for one Ensembl version.

    The pathway file ``kegg_pathways_ensembl<version>`` is read from the
    configured annotation directory.  Each tab-separated line supplies, in
    columns 1-4, three gene identifiers (named ``agn``, ``hgnc`` and
    ``ensid`` here) and a transcript identifier; column 6 holds the pathway
    or the literal string ``None``.

    Args:
        args: parsed arguments; ``args.version`` selects the Ensembl
            version ('66' through '71') and ``args`` is also passed to
            ``read_gemini_config``.

    Returns:
        tuple: ``(agn_paths, hgnc_paths, ensembl_paths)``, each a
        ``defaultdict(list)`` mapping ``(gene_id, transcript)`` to the list
        of pathways seen for that pair (entries are ``None`` where the
        file says ``None``).

    Exits the process via ``sys.exit`` when ``args.version`` is unsupported.
    """
    version_dic = {
        '66': 'kegg_pathways_ensembl66',
        '67': 'kegg_pathways_ensembl67',
        '68': 'kegg_pathways_ensembl68',
        '69': 'kegg_pathways_ensembl69',
        '70': 'kegg_pathways_ensembl70',
        '71': 'kegg_pathways_ensembl71'
    }

    config = read_gemini_config(args=args)
    path_dirname = config["annotation_dir"]
    if args.version not in version_dic:
        sys.exit("Unsupported Ensembl gene version.\n")
    path_file = os.path.join(path_dirname, version_dic[args.version])

    agn_paths = defaultdict(list)
    hgnc_paths = defaultdict(list)
    ensembl_paths = defaultdict(list)

    # the context manager closes the pathway file once it has been read
    with open(path_file, 'r') as pathway_file:
        for line in pathway_file:
            fields = line.strip().split("\t")
            agn = fields[1]
            hgnc = fields[2]
            ensid = fields[3]
            ens_transcript = fields[4]
            path = fields[6] if fields[6] != 'None' else None

            # clean up the pathways such that this:
            # path:hsa00260;Glycine_serine_and_threonine_metabolism
            # becomes this:
            # hsa00260:Glycine_serine_and_threonine_metabolism
            if path is not None and path.startswith("path:"):
                path = path[5:]
                path = path.replace(";", ":")

            # build gene/transcript -> pathway mappings using
            # all three gene naming conventions
            agn_paths[(agn, ens_transcript)].append(path)
            hgnc_paths[(hgnc, ens_transcript)].append(path)
            ensembl_paths[(ensid, ens_transcript)].append(path)

    return agn_paths, hgnc_paths, ensembl_paths
Beispiel #10
0
    def _get_gene_detailed(self):
        """
        Export the detailed gene table to ``../gene_detailed.csv``.

        Reads ``detailed_gene_table_v75`` from the configured annotation
        directory, skips lines whose first column starts with "Chromosome"
        (the header), and writes every remaining tab-separated line as a
        CSV row prefixed with a running 1-based identifier.  A header row
        with the column names is written first.
        """
        import csv

        config = read_gemini_config()
        table_path = os.path.join(config["annotation_dir"],
                                  'detailed_gene_table_v75')

        header = ['uid', 'chrom', 'gene', 'is_hgnc', 'ensembl_gene_id',
                  'transcript', 'biotype', 'transcript_status', 'ccds_id',
                  'hgnc_id', 'entrez_id', 'cds_length', 'protein_length',
                  'transcript_start', 'transcript_end', 'strand', 'synonym',
                  'rvis_pct', 'mam_phenotype_id']

        # unique identifier for each entry
        i = 0
        # NOTE(review): 'wb' is the csv-module convention on Python 2; it
        # would need to become text mode with newline='' on Python 3.
        with open('../gene_detailed.csv', 'wb') as csvfile:
            rowwriter = csv.writer(csvfile, delimiter=',',
                                   quotechar='|', quoting=csv.QUOTE_MINIMAL)
            rowwriter.writerow(header)

            # the context manager closes the input table once it is read
            with open(table_path, 'r') as table_file:
                for line in table_file:
                    field = line.strip().split("\t")
                    if field[0].startswith("Chromosome"):
                        continue
                    i += 1
                    table = gene_table.gene_detailed(field)
                    rowwriter.writerow(
                        [str(i), table.chrom, table.gene, table.is_hgnc,
                         table.ensembl_gene_id, table.ensembl_trans_id,
                         table.biotype, table.trans_status, table.ccds_id,
                         table.hgnc_id, table.entrez, table.cds_length,
                         table.protein_length,
                         table.transcript_start, table.transcript_end,
                         table.strand, table.synonym, table.rvis,
                         table.mam_phenotype])
        # NOTE: the rows are only exported to CSV here; the call to
        # database.insert_gene_detailed(self.c, ...) is disabled.
        


        """
Beispiel #11
0
def install_annotation_files(anno_root_dir):
    """Download the required annotation files into the gemini data directory.

    The data directory is ``anno_root_dir`` itself when it already ends in
    ``gemini/data``, ``<root>/data`` when the root ends in ``gemini``, and
    ``<root>/gemini/data`` otherwise.  It is created when missing.

    Args:
        anno_root_dir: root directory supplied by the user.
    """
    if anno_root_dir.endswith(("gemini/data", "gemini/data/")):
        anno_dir = anno_root_dir
    else:
        # append only the path components the user did not already give
        if anno_root_dir.endswith(("gemini", "gemini/")):
            sub_dirs = ("data",)
        else:
            sub_dirs = ("gemini", "data")
        anno_dir = os.path.join(anno_root_dir, *sub_dirs)
    if not os.path.exists(anno_dir):
        os.makedirs(anno_dir)

    # a missing configuration is tolerated here
    cur_config = read_gemini_config(allow_missing=True)

    base_url = "https://s3.amazonaws.com/gemini-annotations"
    _download_anno_files(base_url, anno_files, anno_dir, cur_config)
Beispiel #12
0
def update_cosmic_census_genes(cursor, args):
    """
    Update the gene summary table with
    whether or not a given gene is in the
    COSMIC cancer gene census

    Reads ``cancer_gene_census.20140120.tsv`` from the configured
    annotation directory.  For every tab-separated line it builds the tuple
    ``(1, gene, chrom)``, where ``gene`` is column 0 and ``chrom`` is
    column 3 prefixed with "chr", then passes the list of tuples to
    ``database.update_gene_summary_w_cancer_census``.

    Args:
        cursor: database cursor handed through to the update call.
        args: parsed arguments, passed to ``read_gemini_config``.
    """
    config = read_gemini_config(args=args)
    census_path = os.path.join(config["annotation_dir"],
                               'cancer_gene_census.20140120.tsv')

    cosmic_census_genes = []
    # the context manager closes the census file once it has been read
    with open(census_path, 'r') as census_file:
        for line in census_file:
            fields = line.strip().split("\t")
            gene = fields[0]
            chrom = "chr" + fields[3]
            # the leading 1 is presumably the in-census flag; verify
            # against the update statement
            cosmic_census_genes.append((1, gene, chrom))

    database.update_gene_summary_w_cancer_census(cursor, cosmic_census_genes)
Beispiel #13
0
def install_annotation_files(anno_root_dir):
    """Fetch the required annotation files into the gemini data directory.

    Args:
        anno_root_dir: root directory supplied by the user.  A path ending
            in ``gemini/data`` is used unchanged, a path ending in
            ``gemini`` gets ``data`` appended, and anything else gets
            ``gemini/data`` appended.  The directory is created if missing.
    """
    already_data_dir = anno_root_dir.endswith(("gemini/data", "gemini/data/"))
    if already_data_dir:
        anno_dir = anno_root_dir
    elif anno_root_dir.endswith(("gemini", "gemini/")):
        anno_dir = os.path.join(anno_root_dir, "data")
    else:
        anno_dir = os.path.join(anno_root_dir, "gemini", "data")

    dir_missing = not os.path.exists(anno_dir)
    if dir_missing:
        os.makedirs(anno_dir)

    # a missing configuration is tolerated here
    cur_config = read_gemini_config(allow_missing=True)

    _download_anno_files(
        "https://s3.amazonaws.com/gemini-annotations",
        anno_files,
        anno_dir,
        cur_config,
    )
Beispiel #14
0
def update_cosmic_census_genes(session, metadata, args):
    """
    Update the gene summary table with
    whether or not a given gene is in the
    COSMIC cancer gene census

    Reads ``cancer_gene_census.20140120.tsv`` from the configured
    annotation directory.  For every tab-separated line it builds the tuple
    ``(1, gene, chrom)``, where ``gene`` is column 0 and ``chrom`` is
    column 3 prefixed with "chr", then passes the list of tuples to
    ``database.update_gene_summary_w_cancer_census``.

    Args:
        session: database session handed through to the update call.
        metadata: database metadata handed through to the update call.
        args: parsed arguments, passed to ``read_gemini_config``.
    """
    config = read_gemini_config(args=args)
    census_path = os.path.join(config["annotation_dir"],
                               'cancer_gene_census.20140120.tsv')

    cosmic_census_genes = []
    # the context manager closes the census file once it has been read
    with open(census_path, 'r') as census_file:
        for line in census_file:
            fields = line.strip().split("\t")
            gene = fields[0]
            chrom = "chr" + fields[3]
            # the leading 1 is presumably the in-census flag; verify
            # against the update statement
            cosmic_census_genes.append((1, gene, chrom))

    database.update_gene_summary_w_cancer_census(session, metadata,
                                                 cosmic_census_genes)
Beispiel #15
0
def get_anno_files(args):
    """Map each annotation name to the full path of its data file.

    Args:
        args: parsed arguments, passed to ``read_gemini_config`` to locate
            the annotation directory.

    Returns:
        dict: annotation name -> path.  The default annotations are always
        present; ``gerp_bp`` and ``cadd_score`` are added only when their
        files exist on disk.
    """
    anno_dirname = read_gemini_config(args=args)["annotation_dir"]

    # Default annotations -- always found
    default_names = {
        'pfam_domain': 'hg19.pfam.ucscgenes.bed.gz',
        'cytoband': 'hg19.cytoband.bed.gz',
        'dbsnp': 'dbsnp.b141.20140813.hg19.tidy.vcf.gz',
        'clinvar': 'clinvar_20150305.tidy.vcf.gz',
        'gwas': 'hg19.gwas.bed.gz',
        'rmsk': 'hg19.rmsk.bed.gz',
        'segdup': 'hg19.segdup.bed.gz',
        'conserved': '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz',
        'cpg_island': 'hg19.CpG.bed.gz',
        'dgv': 'hg19.dgv.bed.gz',
        'esp': 'ESP6500SI.all.snps_indels.tidy.v2.vcf.gz',
        '1000g': 'ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.tidy.vcf.gz',
        'recomb': 'genetic_map_HapMapII_GRCh37.gz',
        'gms': 'GRCh37-gms-mappability.vcf.gz',
        'grc': 'GRC_patch_regions.bed.gz',
        'cse': "cse-hiseq-8_4-2013-02-20.bed.gz",
        'encode_tfbs': 'wgEncodeRegTfbsClusteredV2.cell_count.20130213.bed.gz',
        'encode_dnase1': 'stam.125cells.dnaseI.hg19.bed.gz',
        'encode_consensus_segs': 'encode.6celltypes.consensus.bedg.gz',
        'gerp_elements': 'hg19.gerp.elements.bed.gz',
        'vista_enhancers': 'hg19.vista.enhancers.20131108.bed.gz',
        'fitcons': "hg19_fitcons_fc-i6-0_V1-01.bed.gz",
        'cosmic': 'cosmic-v68-GRCh37.tidy.vcf.gz',
        'exac': 'ExAC.r0.3.sites.vep.tidy.vcf.gz',
        'geno2mp': 'geno2mp.variants.tidy.vcf.gz',
    }
    annos = dict((name, os.path.join(anno_dirname, file_name))
                 for name, file_name in default_names.items())

    # optional annotations -- registered only when the file is installed
    optional_names = (
        ('gerp_bp', 'hg19.gerp.bw'),
        ('cadd_score', 'whole_genome_SNVs.tsv.compressed.gz'),
    )
    for name, file_name in optional_names:
        candidate = os.path.join(anno_dirname, file_name)
        if os.path.exists(candidate):
            annos[name] = candidate
    return annos
Beispiel #16
0
def get_pathways(args):
    """Build gene/transcript -> pathway lookups for one Ensembl version.

    The pathway file ``kegg_pathways_ensembl<version>`` is read from the
    configured annotation directory.  Each tab-separated line supplies, in
    columns 1-4, three gene identifiers (named ``agn``, ``hgnc`` and
    ``ensid`` here) and a transcript identifier; column 6 holds the pathway
    or the literal string ``None``.

    Args:
        args: parsed arguments; ``args.version`` selects the Ensembl
            version ('66' through '71') and ``args`` is also passed to
            ``read_gemini_config``.

    Returns:
        tuple: ``(agn_paths, hgnc_paths, ensembl_paths)``, each a
        ``defaultdict(list)`` mapping ``(gene_id, transcript)`` to the list
        of pathways seen for that pair (entries are ``None`` where the
        file says ``None``).

    Raises:
        NotImplementedError: when ``args.version`` is not supported.
    """
    version_dic = {'66': 'kegg_pathways_ensembl66', '67': 'kegg_pathways_ensembl67',
                   '68': 'kegg_pathways_ensembl68', '69': 'kegg_pathways_ensembl69',
                   '70': 'kegg_pathways_ensembl70', '71': 'kegg_pathways_ensembl71'}

    config = read_gemini_config(args=args)
    path_dirname = config["annotation_dir"]
    if args.version not in version_dic:
        raise NotImplementedError("Unsupported Ensembl gene version.\n")
    path_file = os.path.join(path_dirname, version_dic[args.version])

    agn_paths = defaultdict(list)
    hgnc_paths = defaultdict(list)
    ensembl_paths = defaultdict(list)

    # the context manager closes the pathway file once it has been read
    with open(path_file, 'r') as pathway_file:
        for line in pathway_file:
            fields = line.strip().split("\t")
            agn = fields[1]
            hgnc = fields[2]
            ensid = fields[3]
            ens_transcript = fields[4]
            path = fields[6] if fields[6] != 'None' else None

            # clean up the pathways such that this:
            # path:hsa00260;Glycine_serine_and_threonine_metabolism
            # becomes this:
            # hsa00260:Glycine_serine_and_threonine_metabolism
            if path is not None and path.startswith("path:"):
                path = path[5:]
                path = path.replace(";", ":")

            # build gene/transcript -> pathway mappings using
            # all three gene naming conventions
            agn_paths[(agn, ens_transcript)].append(path)
            hgnc_paths[(hgnc, ens_transcript)].append(path)
            ensembl_paths[(ensid, ens_transcript)].append(path)

    return agn_paths, hgnc_paths, ensembl_paths
Beispiel #17
0
def get_anno_files(args):
    """Return the full path of every annotation file, keyed by name.

    Args:
        args: parsed arguments, passed to ``read_gemini_config`` to locate
            the annotation directory.

    Returns:
        dict: annotation name -> path.  ``gerp_bp`` and ``cadd_score`` are
        included only when their files exist on disk; all other entries
        are always present.
    """
    anno_dirname = read_gemini_config(args=args)["annotation_dir"]

    def in_anno_dir(file_name):
        # small local helper so the table below stays one line per entry
        return os.path.join(anno_dirname, file_name)

    # Default annotations -- always found
    annos = {
        'pfam_domain': in_anno_dir('hg19.pfam.ucscgenes.bed.gz'),
        'cytoband': in_anno_dir('hg19.cytoband.bed.gz'),
        'dbsnp': in_anno_dir('dbsnp.b141.20140813.hg19.tidy.vcf.gz'),
        'clinvar': in_anno_dir('clinvar_20160203.tidy.vcf.gz'),
        'gwas': in_anno_dir('hg19.gwas.bed.gz'),
        'rmsk': in_anno_dir('hg19.rmsk.bed.gz'),
        'segdup': in_anno_dir('hg19.segdup.bed.gz'),
        'conserved': in_anno_dir(
            '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz'),
        'cpg_island': in_anno_dir('hg19.CpG.bed.gz'),
        'dgv': in_anno_dir('hg19.dgv.bed.gz'),
        'esp': in_anno_dir('ESP6500SI.all.snps_indels.tidy.v2.vcf.gz'),
        '1000g': in_anno_dir(
            'ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.tidy.vcf.gz'),
        'recomb': in_anno_dir('genetic_map_HapMapII_GRCh37.gz'),
        'gms': in_anno_dir('GRCh37-gms-mappability.vcf.gz'),
        'grc': in_anno_dir('GRC_patch_regions.bed.gz'),
        'cse': in_anno_dir("cse-hiseq-8_4-2013-02-20.bed.gz"),
        'encode_tfbs': in_anno_dir(
            'wgEncodeRegTfbsClusteredV2.cell_count.20130213.bed.gz'),
        'encode_dnase1': in_anno_dir('stam.125cells.dnaseI.hg19.bed.gz'),
        'encode_consensus_segs': in_anno_dir(
            'encode.6celltypes.consensus.bedg.gz'),
        'gerp_elements': in_anno_dir('hg19.gerp.elements.bed.gz'),
        'vista_enhancers': in_anno_dir('hg19.vista.enhancers.20131108.bed.gz'),
        'fitcons': in_anno_dir("hg19_fitcons_fc-i6-0_V1-01.bed.gz"),
        'cosmic': in_anno_dir('cosmic-v68-GRCh37.tidy.vcf.gz'),
        'exac': in_anno_dir('ExAC.r0.3.sites.vep.tidy.vcf.gz'),
        'geno2mp': in_anno_dir('geno2mp.variants.tidy.vcf.gz'),
    }

    # optional annotations -- registered only when the file is installed
    gerp_bp_path = in_anno_dir('hg19.gerp.bw')
    if os.path.exists(gerp_bp_path):
        annos['gerp_bp'] = gerp_bp_path
    cadd_path = in_anno_dir('whole_genome_SNVs.tsv.compressed.gz')
    if os.path.exists(cadd_path):
        annos['cadd_score'] = cadd_path
    return annos
Beispiel #18
0
def load_annos():
    """
    Open a pysam.Tabixfile for every annotation file and store it in the
    module-level ``annos`` dictionary under the annotation's name.

    Other modules can then look up a handle and fetch data from it:

    dbsnp_handle = annotations.annos['dbsnp']
    hits = dbsnp_handle.fetch(chrom, start, end)
    """
    anno_dirname = read_gemini_config()["annotation_dir"]
    # annotation name -> file name inside the annotation directory
    file_names = {
        'cytoband': 'hg19.cytoband.bed.gz',
        'dbsnp': 'dbsnp.137.vcf.gz',
        'gwas': 'hg19.gwas.bed.gz',
        'rmsk': 'hg19.rmsk.bed.gz',
        'segdup': 'hg19.segdup.bed.gz',
        'conserved': '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz',
        'cpg_island': 'hg19.CpG.bed.gz',
        'dgv': 'hg19.dgv.bed.gz',
        'esp': 'ESP6500SI.all.snps_indels.vcf.gz',
        '1000g': 'ALL.wgs.integrated_phase1_v3.20101123.snps_indels_sv.sites.2012Oct12.vcf.gz',
        'recomb': 'genetic_map_HapMapII_GRCh37.gz',
        'gms': 'GRCh37-gms-mappability.vcf.gz',
        'grc': 'GRC_patch_regions.bed.gz',
        'encode_tfbs': 'wgEncodeRegTfbsClusteredV2.cell_count.bed.gz',
        'encode_consensus_segs': 'encode.6celltypes.consensus.bedg.gz',
        'encode_segway_segs': 'encode.6celltypes.segway.bedg.gz',
        'encode_chromhmm_segs': 'encode.6celltypes.chromhmm.bedg.gz'
    }

    for anno, file_name in file_names.items():
        annos[anno] = pysam.Tabixfile(os.path.join(anno_dirname, file_name))
Beispiel #19
0
def get_anno_files():
    """Map each annotation name to the full path of its data file.

    The directory comes from the ``annotation_dir`` entry of the gemini
    configuration.

    Returns:
        dict: annotation name -> path.  The default annotations are always
        present; ``gerp_bp`` and ``cadd_score`` are added only when their
        files exist on disk.
    """
    anno_dirname = read_gemini_config()["annotation_dir"]

    # Default annotations -- always found
    default_names = {
        'pfam_domain': 'hg19.pfam.ucscgenes.bed.gz',
        'cytoband': 'hg19.cytoband.bed.gz',
        'dbsnp': 'dbsnp.138.vcf.gz',
        'clinvar': 'clinvar_20140303.vcf.gz',
        'gwas': 'hg19.gwas.bed.gz',
        'rmsk': 'hg19.rmsk.bed.gz',
        'segdup': 'hg19.segdup.bed.gz',
        'conserved': '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz',
        'cpg_island': 'hg19.CpG.bed.gz',
        'dgv': 'hg19.dgv.bed.gz',
        'esp': 'ESP6500SI.all.snps_indels.vcf.gz',
        '1000g': 'ALL.wgs.integrated_phase1_v3.20101123.snps_indels_sv.sites.2012Oct12.vcf.gz',
        'recomb': 'genetic_map_HapMapII_GRCh37.gz',
        'gms': 'GRCh37-gms-mappability.vcf.gz',
        'grc': 'GRC_patch_regions.bed.gz',
        'cse': "cse-hiseq-8_4-2013-02-20.bed.gz",
        'encode_tfbs': 'wgEncodeRegTfbsClusteredV2.cell_count.20130213.bed.gz',
        'encode_dnase1': 'stam.125cells.dnaseI.hg19.bed.gz',
        'encode_consensus_segs': 'encode.6celltypes.consensus.bedg.gz',
        'gerp_elements': 'hg19.gerp.elements.bed.gz',
        'vista_enhancers': 'hg19.vista.enhancers.20131108.bed.gz',
        'cosmic': 'hg19.cosmic.v67.20131024.gz'
    }
    annos = dict((name, os.path.join(anno_dirname, file_name))
                 for name, file_name in default_names.items())

    # optional annotations -- registered only when the file is installed
    optional_names = (
        ('gerp_bp', 'hg19.gerp.bw'),
        ('cadd_score', 'whole_genome_SNVs.tsv.compressed.gz'),
    )
    for name, file_name in optional_names:
        candidate = os.path.join(anno_dirname, file_name)
        if os.path.exists(candidate):
            annos[name] = candidate
    return annos
Beispiel #20
0
def install_annotation_files(anno_root_dir):
    """Record the annotation directory in the config and download the files.

    The data directory is ``<anno_root_dir>/gemini/data``; when the supplied
    root already ends in ``gemini`` only ``data`` is appended.  The directory
    is created if missing and stored as ``annotation_dir`` in the gemini
    configuration before any download starts.

    Args:
        anno_root_dir: root directory supplied by the user.
    """
    # avoid doubling the "gemini" component when the user already gave it
    if anno_root_dir.endswith(("gemini", "gemini/")):
        sub_dirs = ("data",)
    else:
        sub_dirs = ("gemini", "data")
    anno_dir = os.path.join(anno_root_dir, *sub_dirs)
    if not os.path.exists(anno_dir):
        os.makedirs(anno_dir)

    cur_config = read_gemini_config(allow_missing=True)
    cur_config["annotation_dir"] = anno_dir
    write_gemini_config(cur_config)

    # download and install each of the annotation files
    url_template = "http://people.virginia.edu/~arq5x/files/gemini/annotations/{fname}"
    for dl in anno_files:
        _download_to_dir(url_template.format(fname=dl), anno_dir)
Beispiel #21
0
def _load_hprd_graph():
    """Unpickle the ``hprd_interaction_graph`` file from the annotation dir."""
    config = read_gemini_config()
    file_graph = os.path.join(config["annotation_dir"],
                              'hprd_interaction_graph')
    # NOTE(review): unpickling executes code embedded in the file, so the
    # annotation directory must only contain trusted data.
    with open(file_graph, 'rb') as f:
        return cPickle.load(f)


def _print_variant_interactions(samples, args, sd):
    """Print one line per sample variant whose gene lies within the radius.

    For each level, interacting genes are reported if they are variants in
    the sample; level 0 is reported when the user-chosen gene itself is a
    variant in the sample.
    """
    for sample in samples:
        var = samples[str(sample)]
        for x in range(0, (args.radius + 1)):
            for each in var:
                # each[0] is the gene; sd maps gene -> distance from the root
                if sd.get(each[0]) != x:
                    continue
                # single-argument print() behaves the same on Python 2 and 3
                print("\t".join([str(sample), str(args.gene), str(x),
                                 str(each[0])] +
                                [str(each[idx]) for idx in range(1, 12)]))


def _print_interacting_genes(samples, args, sd):
    """Print, per sample and level, the variant genes found at that level."""
    for sample in samples:
        # build the membership set once per sample, not once per lookup
        variants = set(each[0] for each in samples[str(sample)])
        for x in range(0, (args.radius + 1)):
            k = [key for key, value in sd.items()
                 if value == x and key in variants]
            print("\t".join([str(sample), str(args.gene),
                             str(x) + "_order:",
                             ",".join(k) if k else "none"]))


def sample_gene_interactions(c, args, idx_to_sample):
    """Report, per sample, variant genes that interact with ``args.gene``.

    Loads the pickled interaction graph from the annotation directory,
    builds the breadth-first spanning tree rooted at ``args.gene`` limited
    to ``args.radius``, writes that tree to ``file.dot`` and prints the
    sample genes found at each distance from the root.  With
    ``args.var_mode`` set, one line per matching variant is printed;
    otherwise one line per sample and distance.

    When ``args.gene`` is ``None`` or not a node of the graph, a message is
    written to stderr and nothing is printed.  ``file.dot`` is created (or
    truncated) in either case.

    Args:
        c: passed through to ``get_variant_genes``.
        args: parsed arguments; ``gene``, ``radius`` and ``var_mode`` are read.
        idx_to_sample: passed through to ``get_variant_genes``.
    """
    # opened first so file.dot is (re)created even when the gene is unknown;
    # the context manager makes sure it is flushed and closed
    with open("file.dot", 'w') as out:
        # fetch variant gene dict for all samples
        samples = get_variant_genes(c, args, idx_to_sample)
        gr = _load_hprd_graph()
        # calculate nodes from the graph
        hprd_genes = gr.nodes()
        if args.gene is None or args.gene not in hprd_genes:
            sys.stderr.write("Gene name not found or")
            sys.stderr.write(" gene not in p-p interaction file\n")
            return

        spanning_tree, _ = breadth_first_search(
            gr, root=args.gene, filter=radius(args.radius))
        gst = digraph()
        gst.add_spanning_tree(spanning_tree)
        out.write(write(gst))

    # sd maps each reachable gene to its distance from args.gene
    _, sd = shortest_path(gst, args.gene)

    if args.var_mode:
        _print_variant_interactions(samples, args, sd)
    else:
        _print_interacting_genes(samples, args, sd)
Beispiel #22
0
#!/usr/bin/env python

import os
import sys
import sqlite3
import numpy as np
import cPickle
import zlib
from collections import defaultdict
from gemini.config import read_gemini_config
import gemini_utils as util
from gemini_constants import *

# Resolved once, when this module is imported: the configuration returned by
# read_gemini_config() is indexed by "annotation_dir" and the result is kept
# in the module-level name path_dirname for the functions below.
config = read_gemini_config()
path_dirname = config["annotation_dir"]


def get_pathways(args):
    """
    Select the KEGG pathway file for the requested Ensembl gene version.

    args.version may be None (handled like '66'), '66', '67' or '68'.  Any
    other value terminates the program through sys.exit() with the message
    "Unsupported Ensembl gene version.".  The chosen file name is joined
    onto the module-level path_dirname annotation directory.

    Three empty defaultdict(list) accumulators (agn_paths, hgnc_paths and
    ensembl_paths) are then created.  Nothing is returned.
    """
    # None is a singleton, so an identity test is the right way to detect
    # "no version supplied"; "== None" can be fooled by a custom __eq__.
    if args.version is None or args.version == '66':
        path_file = os.path.join(path_dirname, 'kegg_pathways_ensembl66')
    elif args.version == '67':
        path_file = os.path.join(path_dirname, 'kegg_pathways_ensembl67')
    elif args.version == '68':
        path_file = os.path.join(path_dirname, 'kegg_pathways_ensembl68')
    else:
        sys.exit("Unsupported Ensembl gene version.\n")

    # One list-valued accumulator per gene naming scheme.
    agn_paths = defaultdict(list)
    hgnc_paths = defaultdict(list)
    ensembl_paths = defaultdict(list)
Beispiel #23
0
def sample_lof_interactions(res, args, idx_to_sample, samples):
    lof = get_lof_genes(res, args, idx_to_sample)
    if args.edges is None:
        config = read_gemini_config(args=args)
        path_dirname = config["annotation_dir"]
        file_graph = os.path.join(path_dirname, 'hprd_interaction_edges.gz')
    else:
        file_graph = args.edges

    gr = nx.DiGraph()
    for e in xopen(file_graph):
        pair = e.strip().split("|")
        gr.add_edge(*pair)

    #initialize keys
    k = []
    variants = []

    if (not args.var_mode):
        for sample in lof.iterkeys():
            lofvariants = list(set(lof[str(sample)]))
            for each in samples[str(sample)]:
                variants.append(each[0])
            for gene in lofvariants:
                if gene not in gr: continue
                genes = nx.single_source_shortest_path_length(
                    gr, gene, cutoff=args.radius)
                for rad in range(1, (args.radius + 1)):
                    for key, value in genes.iteritems():
                        if (value == rad) and key in set(variants):
                            k.append(key)
                    if k:
                        print "\t".join([
                            str(sample),
                            str(gene),
                            str(rad) + "_order:", ",".join(k)
                        ])
                    else:
                        print "\t".join([
                            str(sample),
                            str(gene),
                            str(rad) + "_order:", "none"
                        ])
                    #initialize k
                    k = []
            #initialize variants list for next iteration
            variants = []
    elif args.var_mode:
        for sample in lof.iterkeys():
            lofvariants = list(set(lof[str(sample)]))
            var = samples[str(sample)]
            for gene in lofvariants:
                if not gene in gr: continue
                genes = nx.single_source_shortest_path_length(
                    gr, gene, cutoff=args.radius)

                for rad in range(1, (args.radius + 1)):
                    for each in var:
                        for key, value in genes.iteritems():
                            if value == rad and key == each[0]:
                                print "\t".join([
                                    str(sample),
                                    str(gene),
                                    str(rad),
                                    str(key),
                                    str(each[1]),
                                    str(each[2]),
                                    str(each[3]),
                                    str(each[4]),
                                    str(each[5]),
                                    str(each[6]),
                                    str(each[7]),
                                    str(each[8]),
                                    str(each[9]),
                                    str(each[10]),
                                    str(each[11])
                                ])
Beispiel #24
0
def sample_gene_interactions(res, args, idx_to_sample):
    # fetch variant gene dict for all samples
    samples = get_variant_genes(res, args, idx_to_sample)
    # file handle for fetching the hprd graph
    if args.edges is None:
        config = read_gemini_config(args=args)
        path_dirname = config["annotation_dir"]
        file_graph = os.path.join(path_dirname, 'hprd_interaction_edges.gz')
    else:
        file_graph = args.edges

    gr = nx.DiGraph()
    for e in xopen(file_graph):
        pair = e.strip().split("|")
        gr.add_edge(*pair)

    k = []
    variants = []
    #calculate nodes from the graph
    if args.gene is None or args.gene not in gr:
        sys.stderr.write("Gene name not found or")
        sys.stderr.write(" gene not in interaction file\n")

    elif args.gene in gr:
        genes = nx.single_source_shortest_path_length(gr,
                                                      args.gene,
                                                      cutoff=args.radius)

        if args.var_mode:
            for sample in samples.iterkeys():
                var = samples[str(sample)]
                #for each level return interacting genes if they are
                # variants in the sample.
                # 0th order would be returned if the user chosen
                # gene is a variant in the sample
                for radius in range(0, (args.radius + 1)):
                    for each in var:
                        for key, dist in genes.iteritems():
                            if dist == radius and key == each[0]:
                                print "\t".join([str(sample), str(args.gene), \
                                          str(radius), \
                                          str(key), \
                                          str(each[1]), \
                                          str(each[2]), \
                                          str(each[3]), \
                                          str(each[4]), \
                                          str(each[5]), \
                                          str(each[6]), \
                                          str(each[7]), \
                                          str(each[8]), \
                                          str(each[9]), \
                                          str(each[10]), \
                                          str(each[11])])
        elif (not args.var_mode):
            for sample in samples.iterkeys():
                for each in samples[str(sample)]:
                    variants.append(each[0])
                for x in range(0, (args.radius + 1)):
                    for key, value in genes.iteritems():
                        if value == x and key in set(variants):
                            k.append(key)
                    if k:
                        print "\t".join([str(sample), str(args.gene), \
                                 str(x)+"_order:",
                                 ",".join(k)])
                    else:
                        print "\t".join([str(sample), str(args.gene), \
                                         str(x)+"_order:", "none"])
                    #initialize keys for next iteration
                    k = []
                #initialize variants list for next iteration
                variants = []
Beispiel #25
0
def load_annos():
    """
    Open one Tabixfile handle per annotation file and register it in the
    module-level ``annos`` dictionary under the annotation's short name.

    Other modules can then look up a handle and query it, e.g.:

    dbsnp_handle = annotations.annos['dbsnp']
    hits = dbsnp_handle.fetch(chrom, start, end)
    """
    anno_dirname = read_gemini_config()["annotation_dir"]

    # short annotation name -> file name inside the annotation directory
    anno_files = {
        'cytoband': 'hg19.cytoband.bed.gz',
        'dbsnp': 'dbsnp.135.vcf.gz',
        'gwas': 'hg19.gwas.bed.gz',
        'rmsk': 'hg19.rmsk.bed.gz',
        'segdup': 'hg19.segdup.bed.gz',
        'conserved': '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz',
        'cpg_island': 'hg19.CpG.bed.gz',
        'dgv': 'hg19.dgv.bed.gz',
        'esp': 'ESP5400.all.snps.vcf.gz',
        '1000g': 'ALL.wgs.phase1_release_v3.20101123.snps_indels_sv.sites.vcf.gz',
        'recomb': 'genetic_map_HapMapII_GRCh37.gz',
        'gms': 'GRCh37-gms-mappability.vcf.gz',
        'grc': 'GRC_patch_regions.bed.gz',
        'encode_tfbs': 'wgEncodeRegTfbsClusteredV2.cell_count.bed.gz',
        'encode_consensus_segs': 'encode.6celltypes.consensus.bedg.gz',
        'encode_segway_segs': 'encode.6celltypes.segway.bedg.gz',
        'encode_chromhmm_segs': 'encode.6celltypes.chromhmm.bedg.gz',
    }

    for anno, filename in anno_files.items():
        annos[anno] = pysam.Tabixfile(os.path.join(anno_dirname, filename))
Beispiel #26
0
#!/usr/bin/env python

import os
import sys
import re
import sqlite3
import numpy as np
import cPickle
import zlib
from collections import defaultdict
from gemini.config import read_gemini_config
import gemini_utils as util
from gemini_constants import *


# Resolved once, when this module is imported: the configuration returned by
# read_gemini_config() is indexed by "annotation_dir" and the result is kept
# in the module-level name path_dirname.
config = read_gemini_config()
path_dirname = config["annotation_dir"]


def get_ind_lof(c, args):
    """
    Prepare the lookup table and SQL text for per-individual LoF variants.

    c is passed to util.map_indicies_to_samples() (presumably a database
    cursor -- TODO confirm).  args is not read in the portion of the
    function shown here, and the query is only assembled, not executed.
    """

    # Lookup built by the util helper; judging by its name it maps genotype
    # column indices to sample names -- verify against gemini_utils.
    idx_to_sample = util.map_indicies_to_samples(c)

    # Join variants with variant_impacts on variant_id, keeping only rows
    # where is_lof = '1' and the variant type is 'snp'.  The backslash
    # continuations sit inside the string literal, so the leading whitespace
    # of each continued line is part of the SQL text.
    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             v.impact, v.aa_change, v.aa_length, \
                             v.gt_types, v.gts, i.gene, \
                             i.transcript,  i.biotype\
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"
Beispiel #27
0
def sample_gene_interactions(c, args, idx_to_sample):
    out = open("file.dot", 'w')
    #fetch variant gene dict for all samples
    samples = get_variant_genes(c, args, idx_to_sample)
    #file handle for fetching the hprd graph
    config = read_gemini_config(args=args)
    path_dirname = config["annotation_dir"]
    file_graph = os.path.join(path_dirname, 'hprd_interaction_graph')
    #load the graph using cPickle and close file handle
    gr = graph()
    f = open(file_graph, 'rb')
    gr = cPickle.load(f)
    f.close()
    k = []
    variants = []
    #calculate nodes from the graph
    hprd_genes = gr.nodes()
    if args.gene == None or args.gene not in hprd_genes:
        sys.stderr.write("Gene name not found or")
        sys.stderr.write(" gene not in p-p interaction file\n")

    elif args.gene in hprd_genes:
        x, y = \
            breadth_first_search(gr,root=args.gene,filter=radius(args.radius))
        gst = digraph()
        gst.add_spanning_tree(x)
        dot = write(gst)
        out.write(dot)
        st, sd = shortest_path(gst, args.gene)

        if args.var_mode:
            for sample in samples.iterkeys():
                var = samples[str(sample)]
                #for each level return interacting genes if they are
                # variants in the sample.
                # 0th order would be returned if the user chosen
                # gene is a variant in the sample
                for x in range(0, (args.radius + 1)):
                    for each in var:
                        for key, value in sd.iteritems():
                            if value == x and key == each[0]:
                                print "\t".join([str(sample),str(args.gene), \
                                          str(x), \
                                          str(key), \
                                          str(each[1]), \
                                          str(each[2]), \
                                          str(each[3]), \
                                          str(each[4]), \
                                          str(each[5]), \
                                          str(each[6]), \
                                          str(each[7]), \
                                          str(each[8]), \
                                          str(each[9]), \
                                          str(each[10]), \
                                          str(each[11])])
        elif (not args.var_mode):
            for sample in samples.iterkeys():
                for each in samples[str(sample)]:
                    variants.append(each[0])
                for x in range(0, (args.radius + 1)):
                    for key, value in sd.iteritems():
                        if value == x and key in set(variants):
                            k.append(key)
                    if k:
                        print "\t".join([str(sample), str(args.gene), \
                                 str(x)+"_order:",
                                 ",".join(k)])
                    else:
                        print "\t".join([str(sample), str(args.gene), \
                                         str(x)+"_order:", "none"])
                    #initialize keys for next iteration
                    k = []
                #initialize variants list for next iteration
                variants = []
Beispiel #28
0
def sample_lof_interactions(c, args, idx_to_sample, samples):
    lof = get_lof_genes(c, args, idx_to_sample)
    #file handle for fetching the hprd graph
    config = read_gemini_config(args=args)
    path_dirname = config["annotation_dir"]
    file_graph = os.path.join(path_dirname, 'hprd_interaction_graph')
    #load the graph using cPickle and close file handle
    gr = graph()
    f = open(file_graph, 'rb')
    gr = cPickle.load(f)
    f.close()
    #calculate nodes from the graph
    hprd_genes = gr.nodes()
    #initialize keys
    k = []
    variants = []

    if (not args.var_mode):
        for sample in lof.iterkeys():
            lofvariants = list(set(lof[str(sample)]))
            for each in samples[str(sample)]:
                variants.append(each[0])
            for gene in lofvariants:
                if gene in hprd_genes:
                    x, y = \
                        breadth_first_search(gr,root=gene,\
                        filter=radius(args.radius))

                    gst = digraph()
                    gst.add_spanning_tree(x)
                    st, sd = shortest_path(gst, gene)
                    # for each level return interacting genes
                    # if they are variants in the sample.
                    for rad in range(1, (args.radius + 1)):
                        for key, value in sd.iteritems():
                            if (value == rad) and key in set(variants):
                                k.append(key)
                        if k:
                            print "\t".join([str(sample), \
                                       str(gene), \
                                       str(rad)+"_order:",
                                       ",".join(k)])
                        else:
                            print "\t".join([str(sample), \
                                       str(gene), \
                                       str(rad)+"_order:", \
                                       "none"])
                        #initialize k
                        k = []
            #initialize variants list for next iteration
            variants = []
    elif args.var_mode:
        for sample in lof.iterkeys():
            lofvariants = list(set(lof[str(sample)]))
            var = samples[str(sample)]
            for gene in lofvariants:
                if gene in hprd_genes:
                    x, y = \
                         breadth_first_search(gr,root=gene, \
                         filter=radius(args.radius))
                    gst = digraph()
                    gst.add_spanning_tree(x)
                    st, sd = shortest_path(gst, gene)
                    for rad in range(1, (args.radius + 1)):
                        for each in var:
                            for key, value in sd.iteritems():
                                if value == rad and key == each[0]:
                                    print "\t".join([str(sample), \
                                               str(gene), \
                                               str(rad), \
                                               str(key), \
                                               str(each[1]), \
                                               str(each[2]), \
                                               str(each[3]), \
                                               str(each[4]), \
                                               str(each[5]), \
                                               str(each[6]), \
                                               str(each[7]), \
                                               str(each[8]), \
                                               str(each[9]), \
                                               str(each[10]), \
                                               str(each[11])])
Beispiel #29
0
def sample_gene_interactions(res, args, idx_to_sample):
    # fetch variant gene dict for all samples
    samples = get_variant_genes(res, args, idx_to_sample)
    # file handle for fetching the hprd graph
    if args.edges is None:
        config = read_gemini_config(args=args)
        path_dirname = config["annotation_dir"]
        file_graph = os.path.join(path_dirname, 'hprd_interaction_edges.gz')
    else:
        file_graph = args.edges

    gr = nx.DiGraph()
    for e in xopen(file_graph):
        pair = e.strip().split("|")
        gr.add_edge(*pair)

    k = []
    variants = []
    #calculate nodes from the graph
    if args.gene is None or args.gene not in gr:
        sys.stderr.write("Gene name not found or")
        sys.stderr.write(" gene not in interaction file\n")

    elif args.gene in gr:
        genes = nx.single_source_shortest_path_length(gr, args.gene,
                                                      cutoff=args.radius)

        if args.var_mode:
            for sample in samples.iterkeys():
                var = samples[str(sample)]
                #for each level return interacting genes if they are
                # variants in the sample.
                # 0th order would be returned if the user chosen
                # gene is a variant in the sample
                for radius in range(0, (args.radius+1)):
                    for each in var:
                        for key, dist in genes.iteritems():
                            if dist == radius and key == each[0]:
                                print "\t".join([str(sample), str(args.gene), \
                                          str(radius), \
                                          str(key), \
                                          str(each[1]), \
                                          str(each[2]), \
                                          str(each[3]), \
                                          str(each[4]), \
                                          str(each[5]), \
                                          str(each[6]), \
                                          str(each[7]), \
                                          str(each[8]), \
                                          str(each[9]), \
                                          str(each[10]), \
                                          str(each[11])])
        elif (not args.var_mode):
            for sample in samples.iterkeys():
                for each in samples[str(sample)]:
                    variants.append(each[0])
                for x in range(0, (args.radius+1)):
                    for key, value in genes.iteritems():
                        if value == x and key in set(variants):
                            k.append(key)
                    if k:
                        print "\t".join([str(sample), str(args.gene), \
                                 str(x)+"_order:",
                                 ",".join(k)])
                    else:
                        print "\t".join([str(sample), str(args.gene), \
                                         str(x)+"_order:", "none"])
                    #initialize keys for next iteration
                    k = []
                #initialize variants list for next iteration
                variants = []
Beispiel #30
0
def sample_lof_interactions(res, args, idx_to_sample, samples):
    lof = get_lof_genes(res, args, idx_to_sample)
    if args.edges is None:
        config = read_gemini_config(args=args)
        path_dirname = config["annotation_dir"]
        file_graph = os.path.join(path_dirname, 'hprd_interaction_edges.gz')
    else:
        file_graph = args.edges

    gr = nx.DiGraph()
    for e in xopen(file_graph):
        pair = e.strip().split("|")
        gr.add_edge(*pair)

    #initialize keys
    k = []
    variants = []

    if (not args.var_mode):
        for sample in lof.iterkeys():
            lofvariants = list(set(lof[str(sample)]))
            for each in samples[str(sample)]:
                variants.append(each[0])
            for gene in lofvariants:
                if gene not in gr: continue
                genes = nx.single_source_shortest_path_length(gr, gene,
                                                              cutoff=args.radius)
                for rad in range(1, (args.radius+1)):
                    for key, value in genes.iteritems():
                        if (value == rad) and key in set(variants):
                            k.append(key)
                    if k:
                        print "\t".join([str(sample),
                                   str(gene),
                                   str(rad)+"_order:",
                                   ",".join(k)])
                    else:
                        print "\t".join([str(sample),
                                   str(gene),
                                   str(rad)+"_order:",
                                   "none"])
                    #initialize k
                    k = []
            #initialize variants list for next iteration
            variants = []
    elif args.var_mode:
        for sample in lof.iterkeys():
            lofvariants = list(set(lof[str(sample)]))
            var = samples[str(sample)]
            for gene in lofvariants:
                if not gene in gr: continue
                genes = nx.single_source_shortest_path_length(gr, gene,
                                                              cutoff=args.radius)

                for rad in range(1, (args.radius+1)):
                    for each in var:
                        for key, value in genes.iteritems():
                            if value == rad and key == each[0]:
                                print "\t".join([str(sample),
                                           str(gene),
                                           str(rad),
                                           str(key),
                                           str(each[1]),
                                           str(each[2]),
                                           str(each[3]),
                                           str(each[4]),
                                           str(each[5]),
                                           str(each[6]),
                                           str(each[7]),
                                           str(each[8]),
                                           str(each[9]),
                                           str(each[10]),
                                           str(each[11])])
Beispiel #31
0
def sample_lof_interactions(c, args, idx_to_sample, samples):
    lof = get_lof_genes(c, args, idx_to_sample)
    #file handle for fetching the hprd graph
    config = read_gemini_config()
    path_dirname = config["annotation_dir"]
    file_graph = os.path.join(path_dirname, 'hprd_interaction_graph')
    #load the graph using cPickle and close file handle
    gr = graph()
    f = open(file_graph, 'rb')
    gr = cPickle.load(f)
    f.close()
    #calculate nodes from the graph
    hprd_genes = gr.nodes()
    #initialize keys
    k = []
    variants = []

    if (not args.var_mode):
        for sample in lof.iterkeys():
            lofvariants = list(set(lof[str(sample)]))
            for each in samples[str(sample)]:
                variants.append(each[0])
            for gene in lofvariants:
                if gene in hprd_genes:
                    x, y = \
                        breadth_first_search(gr,root=gene,\
                        filter=radius(args.radius))

                    gst = digraph()
                    gst.add_spanning_tree(x)
                    st, sd = shortest_path(gst, gene)
                    # for each level return interacting genes
                    # if they are variants in the sample.
                    for rad in range(1, (args.radius+1)):
                        for key, value in sd.iteritems():
                            if (value == rad) and key in set(variants):
                                k.append(key)
                        if k:
                            print "\t".join([str(sample), \
                                       str(gene), \
                                       str(rad)+"_order:",
                                       ",".join(k)])
                        else:
                            print "\t".join([str(sample), \
                                       str(gene), \
                                       str(rad)+"_order:", \
                                       "none"])
                        #initialize k
                        k = []
            #initialize variants list for next iteration
            variants = []
    elif args.var_mode:
        for sample in lof.iterkeys():
            lofvariants = list(set(lof[str(sample)]))
            var = samples[str(sample)]
            for gene in lofvariants:
                if gene in hprd_genes:
                    x, y = \
                         breadth_first_search(gr,root=gene, \
                         filter=radius(args.radius))
                    gst = digraph()
                    gst.add_spanning_tree(x)
                    st, sd = shortest_path(gst, gene)
                    for rad in range(1, (args.radius+1)):
                        for each in var:
                            for key, value in sd.iteritems():
                                if value == rad and key == each[0]:
                                    print "\t".join([str(sample), \
                                               str(gene), \
                                               str(rad), \
                                               str(key), \
                                               str(each[1]), \
                                               str(each[2]), \
                                               str(each[3]), \
                                               str(each[4]), \
                                               str(each[5]), \
                                               str(each[6]), \
                                               str(each[7]), \
                                               str(each[8]), \
                                               str(each[9]), \
                                               str(each[10]), \
                                               str(each[11])])