def get_anno_files():
    """Map each annotation name to the full path of its data file.

    The directory comes from the ``annotation_dir`` entry of the
    configuration returned by ``read_gemini_config()``.  Paths are only
    joined; nothing checks that the files exist.
    """
    anno_dirname = read_gemini_config()["annotation_dir"]
    # (annotation name, file name inside the annotation directory)
    file_names = (
        ('pfam_domain', 'hg19.pfam.ucscgenes.bed.gz'),
        ('cytoband', 'hg19.cytoband.bed.gz'),
        ('dbsnp', 'dbsnp.137.vcf.gz'),
        ('clinvar', 'clinvar_20130118.vcf.gz'),
        ('gwas', 'hg19.gwas.bed.gz'),
        ('rmsk', 'hg19.rmsk.bed.gz'),
        ('segdup', 'hg19.segdup.bed.gz'),
        ('conserved', '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz'),
        ('cpg_island', 'hg19.CpG.bed.gz'),
        ('dgv', 'hg19.dgv.bed.gz'),
        ('esp', 'ESP6500SI.all.snps_indels.vcf.gz'),
        ('1000g', 'ALL.wgs.integrated_phase1_v3.20101123.snps_indels_sv.sites.2012Oct12.vcf.gz'),
        ('recomb', 'genetic_map_HapMapII_GRCh37.gz'),
        ('gms', 'GRCh37-gms-mappability.vcf.gz'),
        ('grc', 'GRC_patch_regions.bed.gz'),
        ('cse', "cse-hiseq-8_4-2013-02-20.bed.gz"),
        ('encode_tfbs', 'wgEncodeRegTfbsClusteredV2.cell_count.20130213.bed.gz'),
        ('encode_dnase1', 'stam.125cells.dnaseI.hg19.bed.gz'),
        ('encode_consensus_segs', 'encode.6celltypes.consensus.bedg.gz'),
        ('gerp_bp', 'hg19.gerp.bw'),
        ('gerp_elements', 'hg19.gerp.elements.bed.gz'),
    )
    return dict((name, os.path.join(anno_dirname, fname))
                for name, fname in file_names)
def install_annotation_files(anno_root_dir):
    """Download the annotation files and record where they were installed.

    The data directory is ``<root>/data`` when the given root already ends
    in ``gemini`` and ``<root>/gemini/data`` otherwise; it is created when
    missing.  Every entry of ``anno_files`` is handed to
    ``_download_to_dir`` together with its wanted and its currently
    configured version (both default to 1).  Afterwards the directory and
    ``anno_versions`` are written back to the gemini configuration.
    """
    # a root that already names the "gemini" directory only needs "data"
    if anno_root_dir.endswith(("gemini", "gemini/")):
        sub_dirs = ("data",)
    else:
        sub_dirs = ("gemini", "data")
    anno_dir = os.path.join(anno_root_dir, *sub_dirs)
    if not os.path.exists(anno_dir):
        os.makedirs(anno_dir)

    cur_config = read_gemini_config(allow_missing=True)
    url_template = "http://people.virginia.edu/~arq5x/files/gemini/annotations/{fname}"
    for orig in anno_files:
        # ".gz" files are fetched together with a companion "<name>.tbi" file
        dls = [orig]
        if orig.endswith(".gz"):
            dls.append("%s.tbi" % orig)
        for dl in dls:
            _download_to_dir(url_template.format(fname=dl), anno_dir,
                             anno_versions.get(orig, 1),
                             cur_config.get("versions", {}).get(orig, 1))

    cur_config["annotation_dir"] = anno_dir
    cur_config["versions"] = anno_versions
    write_gemini_config(cur_config)
def _get_gene_detailed(self):
    """
    Load the detailed gene table into the database.

    Reads the tab-separated file ``detailed_gene_table_v75`` from the
    configured annotation directory, skips lines whose first column starts
    with "Chromosome", parses every other line with
    ``gene_table.gene_detailed`` and passes the collected rows to
    ``database.insert_gene_detailed``.  The first element of each row is a
    running counter (as a string) that starts at 1.
    """
    config = read_gemini_config(args=self.args)
    table_path = os.path.join(config["annotation_dir"],
                              'detailed_gene_table_v75')

    table_contents = []
    # unique identifier for each entry
    uid = 0
    # the with-block closes the annotation file even if parsing fails
    with open(table_path, 'r') as handle:
        for line in handle:
            field = line.strip().split("\t")
            if field[0].startswith("Chromosome"):
                continue
            uid += 1
            table = gene_table.gene_detailed(field)
            table_contents.append([
                str(uid), table.chrom, table.gene, table.is_hgnc,
                table.ensembl_gene_id, table.ensembl_trans_id,
                table.biotype, table.trans_status, table.ccds_id,
                table.hgnc_id, table.entrez, table.cds_length,
                table.protein_length, table.transcript_start,
                table.transcript_end, table.strand, table.synonym,
                table.rvis, table.mam_phenotype,
            ])
    database.insert_gene_detailed(self.c, table_contents)
def _get_gene_summary(self):
    """
    Load the gene summary table into the database.

    Reads the tab-separated file ``summary_gene_table_v75`` from the
    configured annotation directory, skips lines whose first column starts
    with "Chromosome", parses every other line with
    ``gene_table.gene_summary`` and passes the collected rows to
    ``database.insert_gene_summary``.  The first element of each row is a
    running counter (as a string) that starts at 1; the last element is the
    COSMIC census flag, which is always 0 here.
    """
    config = read_gemini_config(args=self.args)
    summary_path = os.path.join(config["annotation_dir"],
                                'summary_gene_table_v75')

    # default cosmic census to False
    cosmic_census = 0
    contents = []
    # unique identifier for each entry
    uid = 0
    # the with-block closes the annotation file even if parsing fails
    with open(summary_path, 'r') as handle:
        for line in handle:
            col = line.strip().split("\t")
            if col[0].startswith("Chromosome"):
                continue
            uid += 1
            table = gene_table.gene_summary(col)
            contents.append([
                str(uid), table.chrom, table.gene, table.is_hgnc,
                table.ensembl_gene_id, table.hgnc_id,
                table.transcript_min_start, table.transcript_max_end,
                table.strand, table.synonym, table.rvis,
                table.mam_phenotype, cosmic_census,
            ])
    database.insert_gene_summary(self.c, contents)
def install_annotation_files(anno_root_dir, dl_files=False, extra=None):
    """Record the annotation directory and optionally download the files.

    The data directory is the root itself when it already ends in
    ``gemini/data`` (or ``gemini_data``), ``<root>/data`` when it ends in
    ``gemini`` and ``<root>/gemini/data`` otherwise.  Its absolute path and
    ``anno_versions`` are always written to the gemini configuration.

    Only when ``dl_files`` is true is the directory created, the
    dependencies checked and the download started; ``extra`` then names
    additional entries of ``extra_anno_files`` to fetch.
    """
    if anno_root_dir.endswith(("gemini/data", "gemini/data/", "gemini_data")):
        sub_dirs = ()
    elif anno_root_dir.endswith(("gemini", "gemini/")):
        sub_dirs = ("data",)
    else:
        sub_dirs = ("gemini", "data")
    anno_dir = os.path.join(anno_root_dir, *sub_dirs) if sub_dirs else anno_root_dir

    cur_config = read_gemini_config(allow_missing=True)
    cur_config["annotation_dir"] = os.path.abspath(anno_dir)
    cur_config["versions"] = anno_versions
    write_gemini_config(cur_config)

    if not dl_files:
        return

    if not os.path.exists(anno_dir):
        os.makedirs(anno_dir)
    if not os.path.isdir(anno_dir):
        sys.exit(anno_dir + " is not a valid directory.")
    _check_dependencies()

    # work on a copy so the module-level list is left untouched
    to_dl = anno_files[:]
    if extra:
        to_dl += [extra_anno_files[x] for x in extra]
    _download_anno_files("https://s3.amazonaws.com/gemini-annotations",
                         to_dl, anno_dir, cur_config)
def get_gemini_files(data):
    """Enumerate available gemini data files in a standard installation.

    ``data`` is accepted but not used by this function.

    Returns an empty dict when the ``gemini`` package cannot be imported.
    Otherwise returns a dict with two keys: ``"base"``, the configured
    annotation directory, and ``"files"``, a list of the annotation file
    paths reported by ``annotations.get_anno_files()``.
    """
    try:
        from gemini import annotations, config
    except ImportError:
        return {}
    # list(...) gives a real list on Python 3 as well, where dict.values()
    # is only a view that can be neither indexed nor serialised
    return {"base": config.read_gemini_config()["annotation_dir"],
            "files": list(annotations.get_anno_files().values())}
def get_anno_files():
    """Return a dict of annotation name -> full path of its data file.

    All paths live below the ``annotation_dir`` entry of the configuration
    returned by ``read_gemini_config()``.  The files are not checked for
    existence.
    """
    anno_dirname = read_gemini_config()["annotation_dir"]
    # (annotation name, file name inside the annotation directory)
    file_names = (
        ('pfam_domain', 'hg19.pfam.ucscgenes.bed.gz'),
        ('cytoband', 'hg19.cytoband.bed.gz'),
        ('dbsnp', 'dbsnp.137.vcf.gz'),
        ('clinvar', 'clinvar_20130118.vcf.gz'),
        ('gwas', 'hg19.gwas.bed.gz'),
        ('rmsk', 'hg19.rmsk.bed.gz'),
        ('segdup', 'hg19.segdup.bed.gz'),
        ('conserved', '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz'),
        ('cpg_island', 'hg19.CpG.bed.gz'),
        ('dgv', 'hg19.dgv.bed.gz'),
        ('esp', 'ESP6500SI.all.snps_indels.vcf.gz'),
        ('1000g', 'ALL.wgs.integrated_phase1_v3.20101123.snps_indels_sv.sites.2012Oct12.vcf.gz'),
        ('recomb', 'genetic_map_HapMapII_GRCh37.gz'),
        ('gms', 'GRCh37-gms-mappability.vcf.gz'),
        ('grc', 'GRC_patch_regions.bed.gz'),
        ('cse', "cse-hiseq-8_4-2013-02-20.bed.gz"),
        ('encode_tfbs', 'wgEncodeRegTfbsClusteredV2.cell_count.20130213.bed.gz'),
        ('encode_dnase1', 'stam.125cells.dnaseI.hg19.bed.gz'),
        ('encode_consensus_segs', 'encode.6celltypes.consensus.bedg.gz'),
        ('gerp_bp', 'hg19.gerp.bw'),
        ('gerp_elements', 'hg19.gerp.elements.bed.gz'),
    )
    return dict((name, os.path.join(anno_dirname, fname))
                for name, fname in file_names)
def get_pathways(args):
    """Build gene/transcript -> pathway lookups from a KEGG pathway file.

    ``args.version`` selects the file (Ensembl versions '66' to '71') in
    the configured annotation directory; any other version ends the
    program through ``sys.exit``.

    Each line of the file is split on tabs.  Columns 1-4 are read as the
    AGN name, HGNC name, Ensembl gene id and Ensembl transcript id, and
    column 6 as the pathway (the literal 'None' becomes ``None``).

    Returns three ``defaultdict(list)`` objects keyed by
    ``(agn, transcript)``, ``(hgnc, transcript)`` and
    ``(ensembl id, transcript)``; each value is the list of pathways seen
    for that key.
    """
    version_dic = {
        '66': 'kegg_pathways_ensembl66',
        '67': 'kegg_pathways_ensembl67',
        '68': 'kegg_pathways_ensembl68',
        '69': 'kegg_pathways_ensembl69',
        '70': 'kegg_pathways_ensembl70',
        '71': 'kegg_pathways_ensembl71',
    }

    config = read_gemini_config(args=args)
    path_dirname = config["annotation_dir"]
    if args.version in version_dic:
        path_file = os.path.join(path_dirname, version_dic[args.version])
    else:
        sys.exit("Unsupported Ensembl gene version.\n")

    agn_paths = defaultdict(list)
    hgnc_paths = defaultdict(list)
    ensembl_paths = defaultdict(list)

    # the with-block closes the pathway file once it has been read
    with open(path_file, 'r') as handle:
        for line in handle:
            fields = line.strip().split("\t")
            agn = fields[1]
            hgnc = fields[2]
            ensid = fields[3]
            ens_transcript = fields[4]
            path = fields[6] if fields[6] != 'None' else None

            # clean up the pathways such that this:
            #   path:hsa00260;Glycine_serine_and_threonine_metabolism
            # becomes this:
            #   hsa00260:Glycine_serine_and_threonine_metabolism
            if path is not None and path.startswith("path:"):
                path = path[5:]
                path = path.replace(";", ":")

            # build gene/transcript -> pathway mappings using
            # all three gene naming conventions
            agn_paths[(agn, ens_transcript)].append(path)
            hgnc_paths[(hgnc, ens_transcript)].append(path)
            ensembl_paths[(ensid, ens_transcript)].append(path)

    return agn_paths, hgnc_paths, ensembl_paths
def _get_gene_detailed(self): """ define a gene detailed table """ #unique identifier for each entry i = 0 table_contents = detailed_list = [] config = read_gemini_config() path_dirname = config["annotation_dir"] file_handle = os.path.join(path_dirname, 'detailed_gene_table_v75') header= ['uid','chrom','gene','is_hgnc','ensembl_gene_id','transcript','biotype','transcript_status','ccds_id','hgnc_id',\ 'entrez_id','cds_length','protein_length','transcript_start','transcript_end','strand','synonym','rvis_pct','mam_phenotype_id'] import csv with open('../gene_detailed.csv', 'wb') as csvfile: rowwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL) rowwriter.writerow(header) for line in open(file_handle, 'r'): field = line.strip().split("\t") if not field[0].startswith("Chromosome"): i += 1 table = gene_table.gene_detailed(field) detailed_list = [str(i),table.chrom,table.gene,table.is_hgnc, table.ensembl_gene_id,table.ensembl_trans_id, table.biotype,table.trans_status,table.ccds_id, table.hgnc_id,table.entrez,table.cds_length,table.protein_length, table.transcript_start,table.transcript_end, table.strand,table.synonym,table.rvis,table.mam_phenotype] rowwriter.writerow(detailed_list) # if(i==5): # print detailed_list # table_contents.append(detailed_list) #database.insert_gene_detailed(self.c, table_contents) """
def install_annotation_files(anno_root_dir):
    """Create the annotation directory and download the annotation files.

    The data directory is the root itself when it already ends in
    ``gemini/data``, ``<root>/data`` when it ends in ``gemini`` and
    ``<root>/gemini/data`` otherwise.  It is created when missing, and
    ``_download_anno_files`` is then called with the current configuration
    (which may be missing).
    """
    if anno_root_dir.endswith(("gemini/data", "gemini/data/")):
        sub_dirs = ()
    elif anno_root_dir.endswith(("gemini", "gemini/")):
        sub_dirs = ("data",)
    else:
        sub_dirs = ("gemini", "data")
    anno_dir = os.path.join(anno_root_dir, *sub_dirs) if sub_dirs else anno_root_dir

    if not os.path.exists(anno_dir):
        os.makedirs(anno_dir)

    cur_config = read_gemini_config(allow_missing=True)
    _download_anno_files("https://s3.amazonaws.com/gemini-annotations",
                         anno_files, anno_dir, cur_config)
def update_cosmic_census_genes(cursor, args):
    """
    Update the gene summary table with whether or not a given gene is
    in the COSMIC cancer gene census.

    Reads ``cancer_gene_census.20140120.tsv`` from the configured
    annotation directory.  Each line is split on tabs and contributes one
    ``(1, gene, chrom)`` tuple: ``gene`` is the first column and ``chrom``
    is "chr" followed by the fourth column.  The tuples are passed to
    ``database.update_gene_summary_w_cancer_census``.
    """
    config = read_gemini_config(args=args)
    census_path = os.path.join(config["annotation_dir"],
                               'cancer_gene_census.20140120.tsv')

    cosmic_census_genes = []
    # the with-block closes the census file once it has been read
    with open(census_path, 'r') as handle:
        for line in handle:
            fields = line.strip().split("\t")
            gene = fields[0]
            chrom = "chr" + fields[3]
            cosmic_census_genes.append((1, gene, chrom))

    database.update_gene_summary_w_cancer_census(cursor, cosmic_census_genes)
def update_cosmic_census_genes(session, metadata, args):
    """
    Update the gene summary table with whether or not a given gene is
    in the COSMIC cancer gene census.

    Reads ``cancer_gene_census.20140120.tsv`` from the configured
    annotation directory.  Each line is split on tabs and contributes one
    ``(1, gene, chrom)`` tuple: ``gene`` is the first column and ``chrom``
    is "chr" followed by the fourth column.  The tuples are passed, with
    ``session`` and ``metadata``, to
    ``database.update_gene_summary_w_cancer_census``.
    """
    config = read_gemini_config(args=args)
    census_path = os.path.join(config["annotation_dir"],
                               'cancer_gene_census.20140120.tsv')

    cosmic_census_genes = []
    # the with-block closes the census file once it has been read
    with open(census_path, 'r') as handle:
        for line in handle:
            fields = line.strip().split("\t")
            gene = fields[0]
            chrom = "chr" + fields[3]
            cosmic_census_genes.append((1, gene, chrom))

    database.update_gene_summary_w_cancer_census(session, metadata,
                                                 cosmic_census_genes)
def get_anno_files(args):
    """Map each annotation name to the full path of its data file.

    The directory comes from the ``annotation_dir`` entry of the
    configuration returned by ``read_gemini_config(args=args)``.  The
    default annotations are always included, without an existence check.
    ``gerp_bp`` and ``cadd_score`` are added only when their files exist
    on disk.
    """
    anno_dirname = read_gemini_config(args=args)["annotation_dir"]

    # Default annotations -- always reported
    default_files = (
        ('pfam_domain', 'hg19.pfam.ucscgenes.bed.gz'),
        ('cytoband', 'hg19.cytoband.bed.gz'),
        ('dbsnp', 'dbsnp.b141.20140813.hg19.tidy.vcf.gz'),
        ('clinvar', 'clinvar_20150305.tidy.vcf.gz'),
        ('gwas', 'hg19.gwas.bed.gz'),
        ('rmsk', 'hg19.rmsk.bed.gz'),
        ('segdup', 'hg19.segdup.bed.gz'),
        ('conserved', '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz'),
        ('cpg_island', 'hg19.CpG.bed.gz'),
        ('dgv', 'hg19.dgv.bed.gz'),
        ('esp', 'ESP6500SI.all.snps_indels.tidy.v2.vcf.gz'),
        ('1000g', 'ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.tidy.vcf.gz'),
        ('recomb', 'genetic_map_HapMapII_GRCh37.gz'),
        ('gms', 'GRCh37-gms-mappability.vcf.gz'),
        ('grc', 'GRC_patch_regions.bed.gz'),
        ('cse', "cse-hiseq-8_4-2013-02-20.bed.gz"),
        ('encode_tfbs', 'wgEncodeRegTfbsClusteredV2.cell_count.20130213.bed.gz'),
        ('encode_dnase1', 'stam.125cells.dnaseI.hg19.bed.gz'),
        ('encode_consensus_segs', 'encode.6celltypes.consensus.bedg.gz'),
        ('gerp_elements', 'hg19.gerp.elements.bed.gz'),
        ('vista_enhancers', 'hg19.vista.enhancers.20131108.bed.gz'),
        ('fitcons', "hg19_fitcons_fc-i6-0_V1-01.bed.gz"),
        ('cosmic', 'cosmic-v68-GRCh37.tidy.vcf.gz'),
        ('exac', 'ExAC.r0.3.sites.vep.tidy.vcf.gz'),
        ('geno2mp', 'geno2mp.variants.tidy.vcf.gz'),
    )
    annos = dict((name, os.path.join(anno_dirname, fname))
                 for name, fname in default_files)

    # optional annotations -- reported only when present on disk
    optional_files = (
        ('gerp_bp', 'hg19.gerp.bw'),
        ('cadd_score', 'whole_genome_SNVs.tsv.compressed.gz'),
    )
    for name, fname in optional_files:
        candidate = os.path.join(anno_dirname, fname)
        if os.path.exists(candidate):
            annos[name] = candidate

    return annos
def get_pathways(args):
    """Build gene/transcript -> pathway lookups from a KEGG pathway file.

    ``args.version`` selects the file (Ensembl versions '66' to '71') in
    the configured annotation directory.

    Each line of the file is split on tabs.  Columns 1-4 are read as the
    AGN name, HGNC name, Ensembl gene id and Ensembl transcript id, and
    column 6 as the pathway (the literal 'None' becomes ``None``).

    Returns three ``defaultdict(list)`` objects keyed by
    ``(agn, transcript)``, ``(hgnc, transcript)`` and
    ``(ensembl id, transcript)``; each value is the list of pathways seen
    for that key.

    Raises ``NotImplementedError`` for an unsupported ``args.version``.
    """
    version_dic = {
        '66': 'kegg_pathways_ensembl66',
        '67': 'kegg_pathways_ensembl67',
        '68': 'kegg_pathways_ensembl68',
        '69': 'kegg_pathways_ensembl69',
        '70': 'kegg_pathways_ensembl70',
        '71': 'kegg_pathways_ensembl71',
    }

    config = read_gemini_config(args=args)
    path_dirname = config["annotation_dir"]
    if args.version in version_dic:
        path_file = os.path.join(path_dirname, version_dic[args.version])
    else:
        raise NotImplementedError("Unsupported Ensembl gene version.\n")

    agn_paths = defaultdict(list)
    hgnc_paths = defaultdict(list)
    ensembl_paths = defaultdict(list)

    # the with-block closes the pathway file once it has been read
    with open(path_file, 'r') as handle:
        for line in handle:
            fields = line.strip().split("\t")
            agn = fields[1]
            hgnc = fields[2]
            ensid = fields[3]
            ens_transcript = fields[4]
            path = fields[6] if fields[6] != 'None' else None

            # clean up the pathways such that this:
            #   path:hsa00260;Glycine_serine_and_threonine_metabolism
            # becomes this:
            #   hsa00260:Glycine_serine_and_threonine_metabolism
            if path is not None and path.startswith("path:"):
                path = path[5:]
                path = path.replace(";", ":")

            # build gene/transcript -> pathway mappings using
            # all three gene naming conventions
            agn_paths[(agn, ens_transcript)].append(path)
            hgnc_paths[(hgnc, ens_transcript)].append(path)
            ensembl_paths[(ensid, ens_transcript)].append(path)

    return agn_paths, hgnc_paths, ensembl_paths
def get_anno_files(args):
    """Return a dict of annotation name -> full path of its data file.

    All paths live below the ``annotation_dir`` entry of the configuration
    returned by ``read_gemini_config(args=args)``.  The default
    annotations are always included, without an existence check.
    ``gerp_bp`` and ``cadd_score`` are added only when their files exist
    on disk.
    """
    anno_dirname = read_gemini_config(args=args)["annotation_dir"]

    # Default annotations -- always reported
    default_files = (
        ('pfam_domain', 'hg19.pfam.ucscgenes.bed.gz'),
        ('cytoband', 'hg19.cytoband.bed.gz'),
        ('dbsnp', 'dbsnp.b141.20140813.hg19.tidy.vcf.gz'),
        ('clinvar', 'clinvar_20160203.tidy.vcf.gz'),
        ('gwas', 'hg19.gwas.bed.gz'),
        ('rmsk', 'hg19.rmsk.bed.gz'),
        ('segdup', 'hg19.segdup.bed.gz'),
        ('conserved', '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz'),
        ('cpg_island', 'hg19.CpG.bed.gz'),
        ('dgv', 'hg19.dgv.bed.gz'),
        ('esp', 'ESP6500SI.all.snps_indels.tidy.v2.vcf.gz'),
        ('1000g', 'ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.tidy.vcf.gz'),
        ('recomb', 'genetic_map_HapMapII_GRCh37.gz'),
        ('gms', 'GRCh37-gms-mappability.vcf.gz'),
        ('grc', 'GRC_patch_regions.bed.gz'),
        ('cse', "cse-hiseq-8_4-2013-02-20.bed.gz"),
        ('encode_tfbs', 'wgEncodeRegTfbsClusteredV2.cell_count.20130213.bed.gz'),
        ('encode_dnase1', 'stam.125cells.dnaseI.hg19.bed.gz'),
        ('encode_consensus_segs', 'encode.6celltypes.consensus.bedg.gz'),
        ('gerp_elements', 'hg19.gerp.elements.bed.gz'),
        ('vista_enhancers', 'hg19.vista.enhancers.20131108.bed.gz'),
        ('fitcons', "hg19_fitcons_fc-i6-0_V1-01.bed.gz"),
        ('cosmic', 'cosmic-v68-GRCh37.tidy.vcf.gz'),
        ('exac', 'ExAC.r0.3.sites.vep.tidy.vcf.gz'),
        ('geno2mp', 'geno2mp.variants.tidy.vcf.gz'),
    )
    annos = dict((name, os.path.join(anno_dirname, fname))
                 for name, fname in default_files)

    # optional annotations -- reported only when present on disk
    optional_files = (
        ('gerp_bp', 'hg19.gerp.bw'),
        ('cadd_score', 'whole_genome_SNVs.tsv.compressed.gz'),
    )
    for name, fname in optional_files:
        candidate = os.path.join(anno_dirname, fname)
        if os.path.exists(candidate):
            annos[name] = candidate

    return annos
def load_annos():
    """
    Open a Tabixfile handle for every annotation file and store it in the
    module-level ``annos`` dictionary under the annotation's name.

    Other modules can then look a handle up and query it, e.g.:

        dbsnp_handle = annotations.annos['dbsnp']
        hits = dbsnp_handle.fetch(chrom, start, end)
    """
    anno_dirname = read_gemini_config()["annotation_dir"]
    # (annotation name, file name inside the annotation directory)
    file_names = (
        ('cytoband', 'hg19.cytoband.bed.gz'),
        ('dbsnp', 'dbsnp.137.vcf.gz'),
        ('gwas', 'hg19.gwas.bed.gz'),
        ('rmsk', 'hg19.rmsk.bed.gz'),
        ('segdup', 'hg19.segdup.bed.gz'),
        ('conserved', '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz'),
        ('cpg_island', 'hg19.CpG.bed.gz'),
        ('dgv', 'hg19.dgv.bed.gz'),
        ('esp', 'ESP6500SI.all.snps_indels.vcf.gz'),
        ('1000g', 'ALL.wgs.integrated_phase1_v3.20101123.snps_indels_sv.sites.2012Oct12.vcf.gz'),
        ('recomb', 'genetic_map_HapMapII_GRCh37.gz'),
        ('gms', 'GRCh37-gms-mappability.vcf.gz'),
        ('grc', 'GRC_patch_regions.bed.gz'),
        ('encode_tfbs', 'wgEncodeRegTfbsClusteredV2.cell_count.bed.gz'),
        ('encode_consensus_segs', 'encode.6celltypes.consensus.bedg.gz'),
        ('encode_segway_segs', 'encode.6celltypes.segway.bedg.gz'),
        ('encode_chromhmm_segs', 'encode.6celltypes.chromhmm.bedg.gz'),
    )
    # go through a dict, as before, so the handles are opened in dict order
    paths_by_name = dict((name, os.path.join(anno_dirname, fname))
                         for name, fname in file_names)
    for name, path in paths_by_name.items():
        annos[name] = pysam.Tabixfile(path)
def get_anno_files():
    """Map each annotation name to the full path of its data file.

    The directory comes from the ``annotation_dir`` entry of the
    configuration returned by ``read_gemini_config()``.  The default
    annotations are always included, without an existence check.
    ``gerp_bp`` and ``cadd_score`` are added only when their files exist
    on disk.
    """
    anno_dirname = read_gemini_config()["annotation_dir"]

    # Default annotations -- always reported
    default_files = (
        ('pfam_domain', 'hg19.pfam.ucscgenes.bed.gz'),
        ('cytoband', 'hg19.cytoband.bed.gz'),
        ('dbsnp', 'dbsnp.138.vcf.gz'),
        ('clinvar', 'clinvar_20140303.vcf.gz'),
        ('gwas', 'hg19.gwas.bed.gz'),
        ('rmsk', 'hg19.rmsk.bed.gz'),
        ('segdup', 'hg19.segdup.bed.gz'),
        ('conserved', '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz'),
        ('cpg_island', 'hg19.CpG.bed.gz'),
        ('dgv', 'hg19.dgv.bed.gz'),
        ('esp', 'ESP6500SI.all.snps_indels.vcf.gz'),
        ('1000g', 'ALL.wgs.integrated_phase1_v3.20101123.snps_indels_sv.sites.2012Oct12.vcf.gz'),
        ('recomb', 'genetic_map_HapMapII_GRCh37.gz'),
        ('gms', 'GRCh37-gms-mappability.vcf.gz'),
        ('grc', 'GRC_patch_regions.bed.gz'),
        ('cse', "cse-hiseq-8_4-2013-02-20.bed.gz"),
        ('encode_tfbs', 'wgEncodeRegTfbsClusteredV2.cell_count.20130213.bed.gz'),
        ('encode_dnase1', 'stam.125cells.dnaseI.hg19.bed.gz'),
        ('encode_consensus_segs', 'encode.6celltypes.consensus.bedg.gz'),
        ('gerp_elements', 'hg19.gerp.elements.bed.gz'),
        ('vista_enhancers', 'hg19.vista.enhancers.20131108.bed.gz'),
        ('cosmic', 'hg19.cosmic.v67.20131024.gz'),
    )
    annos = dict((name, os.path.join(anno_dirname, fname))
                 for name, fname in default_files)

    # optional annotations -- reported only when present on disk
    optional_files = (
        ('gerp_bp', 'hg19.gerp.bw'),
        ('cadd_score', 'whole_genome_SNVs.tsv.compressed.gz'),
    )
    for name, fname in optional_files:
        candidate = os.path.join(anno_dirname, fname)
        if os.path.exists(candidate):
            annos[name] = candidate

    return annos
def install_annotation_files(anno_root_dir):
    """Record the annotation directory and download the annotation files.

    The data directory is ``<root>/data`` when the given root already ends
    in ``gemini`` and ``<root>/gemini/data`` otherwise; it is created when
    missing.  The directory is written to the gemini configuration first,
    then every entry of ``anno_files`` is fetched with
    ``_download_to_dir``.
    """
    # a root that already names the "gemini" directory only needs "data"
    if anno_root_dir.endswith(("gemini", "gemini/")):
        sub_dirs = ("data",)
    else:
        sub_dirs = ("gemini", "data")
    anno_dir = os.path.join(anno_root_dir, *sub_dirs)
    if not os.path.exists(anno_dir):
        os.makedirs(anno_dir)

    cur_config = read_gemini_config(allow_missing=True)
    cur_config["annotation_dir"] = anno_dir
    write_gemini_config(cur_config)

    # download and install each of the annotation files
    url_template = "http://people.virginia.edu/~arq5x/files/gemini/annotations/{fname}"
    for dl in anno_files:
        _download_to_dir(url_template.format(fname=dl), anno_dir)
def sample_gene_interactions(c, args, idx_to_sample):
    """Report, per sample, the variant genes that interact with ``args.gene``.

    The pickled graph ``hprd_interaction_graph`` is loaded from the
    configured annotation directory.  A breadth-first search from
    ``args.gene``, filtered by ``radius(args.radius)``, gives a spanning
    tree; that tree is written to ``file.dot`` and used to compute the
    distance of each gene from ``args.gene``.

    Output goes to stdout, tab separated:

    * ``args.var_mode`` set: one line for each variant of the sample whose
      gene lies at a given order (0 .. ``args.radius``), giving sample,
      query gene, order, gene and fields 1-11 of the variant record.
    * otherwise: one line per sample and order, listing the sample's
      variant genes found at that order (comma separated), or "none".

    When ``args.gene`` is ``None`` or is not a node of the graph, a message
    is written to stderr and nothing is reported.  ``file.dot`` is created
    (or truncated) in every case.
    """
    # Created up front on every call, as before; the with-block guarantees
    # the file is flushed and closed on the way out.
    with open("file.dot", 'w') as out:
        # fetch variant gene dict for all samples
        samples = get_variant_genes(c, args, idx_to_sample)

        # locate the hprd graph
        config = read_gemini_config()
        path_dirname = config["annotation_dir"]
        file_graph = os.path.join(path_dirname, 'hprd_interaction_graph')

        # NOTE(review): unpickling can execute arbitrary code; this is only
        # safe while the annotation directory holds trusted files.
        with open(file_graph, 'rb') as f:
            gr = cPickle.load(f)

        hprd_genes = gr.nodes()
        if args.gene is None or args.gene not in hprd_genes:
            sys.stderr.write("Gene name not found or")
            sys.stderr.write(" gene not in p-p interaction file\n")
            return

        spanning_tree, _ = breadth_first_search(
            gr, root=args.gene, filter=radius(args.radius))
        gst = digraph()
        gst.add_spanning_tree(spanning_tree)
        out.write(write(gst))
        _, sd = shortest_path(gst, args.gene)

        # Single-argument print(...) behaves exactly like the Python 2
        # print statement used elsewhere in this file.
        if args.var_mode:
            for sample in samples.iterkeys():
                var = samples[str(sample)]
                # for each level return interacting genes if they are
                # variants in the sample.  0th order is returned if the
                # user-chosen gene is itself a variant in the sample
                for order in range(0, args.radius + 1):
                    for each in var:
                        for key, value in sd.iteritems():
                            if value == order and key == each[0]:
                                row = [str(sample), str(args.gene),
                                       str(order), str(key)]
                                row.extend(str(each[i]) for i in range(1, 12))
                                print("\t".join(row))
        else:
            for sample in samples.iterkeys():
                # built once per sample instead of once per gene tested
                variant_genes = set(each[0] for each in samples[str(sample)])
                for order in range(0, args.radius + 1):
                    k = [key for key, value in sd.iteritems()
                         if value == order and key in variant_genes]
                    print("\t".join([str(sample), str(args.gene),
                                     str(order) + "_order:",
                                     ",".join(k) if k else "none"]))
#!/usr/bin/env python import os import sys import sqlite3 import numpy as np import cPickle import zlib from collections import defaultdict from gemini.config import read_gemini_config import gemini_utils as util from gemini_constants import * config = read_gemini_config() path_dirname = config["annotation_dir"] def get_pathways(args): if args.version == None or args.version == '66': path_file = os.path.join(path_dirname, 'kegg_pathways_ensembl66') elif args.version == '67': path_file = os.path.join(path_dirname, 'kegg_pathways_ensembl67') elif args.version == '68': path_file = os.path.join(path_dirname, 'kegg_pathways_ensembl68') else: sys.exit("Unsupported Ensembl gene version.\n") agn_paths = defaultdict(list) hgnc_paths = defaultdict(list) ensembl_paths = defaultdict(list)
def sample_lof_interactions(res, args, idx_to_sample, samples):
    """Report, per sample, the variant genes interacting with its LoF genes.

    The interaction graph is read from ``args.edges`` or, when that is
    ``None``, from ``hprd_interaction_edges.gz`` in the configured
    annotation directory.  Each line is one edge, its two nodes separated
    by "|".

    For every sample in the result of ``get_lof_genes`` and every one of
    its genes present in the graph, the shortest path length from that
    gene to the other genes (up to ``args.radius``) is computed.  Output
    goes to stdout, tab separated:

    * ``args.var_mode`` set: one line for each variant of the sample whose
      gene lies at a given order (1 .. ``args.radius``), giving sample,
      gene, order, interacting gene and fields 1-11 of the variant record.
    * otherwise: one line per sample, gene and order, listing the sample's
      variant genes found at that order (comma separated), or "none".

    ``samples`` maps a sample name to its variant records; element 0 of a
    record is compared against gene names.
    """
    lof = get_lof_genes(res, args, idx_to_sample)

    if args.edges is None:
        config = read_gemini_config(args=args)
        path_dirname = config["annotation_dir"]
        file_graph = os.path.join(path_dirname, 'hprd_interaction_edges.gz')
    else:
        file_graph = args.edges

    gr = nx.DiGraph()
    for e in xopen(file_graph):
        pair = e.strip().split("|")
        gr.add_edge(*pair)

    # Single-argument print(...) behaves exactly like the Python 2 print
    # statement used elsewhere in this file.
    if args.var_mode:
        for sample in lof.iterkeys():
            lofvariants = list(set(lof[str(sample)]))
            var = samples[str(sample)]
            for gene in lofvariants:
                if gene not in gr:
                    continue
                genes = nx.single_source_shortest_path_length(
                    gr, gene, cutoff=args.radius)
                for rad in range(1, args.radius + 1):
                    for each in var:
                        for key, value in genes.iteritems():
                            if value == rad and key == each[0]:
                                row = [str(sample), str(gene),
                                       str(rad), str(key)]
                                row.extend(str(each[i]) for i in range(1, 12))
                                print("\t".join(row))
    else:
        for sample in lof.iterkeys():
            lofvariants = list(set(lof[str(sample)]))
            # built once per sample instead of once per gene tested
            variant_genes = set(each[0] for each in samples[str(sample)])
            for gene in lofvariants:
                if gene not in gr:
                    continue
                genes = nx.single_source_shortest_path_length(
                    gr, gene, cutoff=args.radius)
                for rad in range(1, args.radius + 1):
                    k = [key for key, value in genes.iteritems()
                         if value == rad and key in variant_genes]
                    print("\t".join([str(sample), str(gene),
                                     str(rad) + "_order:",
                                     ",".join(k) if k else "none"]))
def sample_gene_interactions(res, args, idx_to_sample):
    """Report, per sample, the variant genes that interact with ``args.gene``.

    The interaction graph is read from ``args.edges`` or, when that is
    ``None``, from ``hprd_interaction_edges.gz`` in the configured
    annotation directory.  Each line is one edge, its two nodes separated
    by "|".  Shortest path lengths from ``args.gene`` are computed up to
    ``args.radius``.

    Output goes to stdout, tab separated:

    * ``args.var_mode`` set: one line for each variant of the sample whose
      gene lies at a given order (0 .. ``args.radius``), giving sample,
      query gene, order, gene and fields 1-11 of the variant record.
    * otherwise: one line per sample and order, listing the sample's
      variant genes found at that order (comma separated), or "none".

    When ``args.gene`` is ``None`` or is not in the graph, a message is
    written to stderr and nothing is reported.
    """
    # fetch variant gene dict for all samples
    samples = get_variant_genes(res, args, idx_to_sample)

    # locate the hprd graph
    if args.edges is None:
        config = read_gemini_config(args=args)
        path_dirname = config["annotation_dir"]
        file_graph = os.path.join(path_dirname, 'hprd_interaction_edges.gz')
    else:
        file_graph = args.edges

    gr = nx.DiGraph()
    for e in xopen(file_graph):
        pair = e.strip().split("|")
        gr.add_edge(*pair)

    if args.gene is None or args.gene not in gr:
        sys.stderr.write("Gene name not found or")
        sys.stderr.write(" gene not in interaction file\n")
        return

    genes = nx.single_source_shortest_path_length(gr, args.gene,
                                                  cutoff=args.radius)

    # Single-argument print(...) behaves exactly like the Python 2 print
    # statement used elsewhere in this file.
    if args.var_mode:
        for sample in samples.iterkeys():
            var = samples[str(sample)]
            # for each level return interacting genes if they are
            # variants in the sample.  0th order is returned if the
            # user-chosen gene is itself a variant in the sample
            for order in range(0, args.radius + 1):
                for each in var:
                    for key, dist in genes.iteritems():
                        if dist == order and key == each[0]:
                            row = [str(sample), str(args.gene),
                                   str(order), str(key)]
                            row.extend(str(each[i]) for i in range(1, 12))
                            print("\t".join(row))
    else:
        for sample in samples.iterkeys():
            # built once per sample instead of once per gene tested
            variant_genes = set(each[0] for each in samples[str(sample)])
            for order in range(0, args.radius + 1):
                k = [key for key, dist in genes.iteritems()
                     if dist == order and key in variant_genes]
                print("\t".join([str(sample), str(args.gene),
                                 str(order) + "_order:",
                                 ",".join(k) if k else "none"]))
def load_annos():
    """
    Open a Tabixfile handle for every annotation file and store it in the
    module-level ``annos`` dictionary under the annotation's name.

    Other modules can then look a handle up and query it, e.g.:

        dbsnp_handle = annotations.annos['dbsnp']
        hits = dbsnp_handle.fetch(chrom, start, end)
    """
    anno_dirname = read_gemini_config()["annotation_dir"]
    # (annotation name, file name inside the annotation directory)
    file_names = (
        ('cytoband', 'hg19.cytoband.bed.gz'),
        ('dbsnp', 'dbsnp.135.vcf.gz'),
        ('gwas', 'hg19.gwas.bed.gz'),
        ('rmsk', 'hg19.rmsk.bed.gz'),
        ('segdup', 'hg19.segdup.bed.gz'),
        ('conserved', '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz'),
        ('cpg_island', 'hg19.CpG.bed.gz'),
        ('dgv', 'hg19.dgv.bed.gz'),
        ('esp', 'ESP5400.all.snps.vcf.gz'),
        ('1000g', 'ALL.wgs.phase1_release_v3.20101123.snps_indels_sv.sites.vcf.gz'),
        ('recomb', 'genetic_map_HapMapII_GRCh37.gz'),
        ('gms', 'GRCh37-gms-mappability.vcf.gz'),
        ('grc', 'GRC_patch_regions.bed.gz'),
        ('encode_tfbs', 'wgEncodeRegTfbsClusteredV2.cell_count.bed.gz'),
        ('encode_consensus_segs', 'encode.6celltypes.consensus.bedg.gz'),
        ('encode_segway_segs', 'encode.6celltypes.segway.bedg.gz'),
        ('encode_chromhmm_segs', 'encode.6celltypes.chromhmm.bedg.gz'),
    )
    # go through a dict, as before, so the handles are opened in dict order
    paths_by_name = dict((name, os.path.join(anno_dirname, fname))
                         for name, fname in file_names)
    for name, path in paths_by_name.items():
        annos[name] = pysam.Tabixfile(path)
#!/usr/bin/env python import os import sys import re import sqlite3 import numpy as np import cPickle import zlib from collections import defaultdict from gemini.config import read_gemini_config import gemini_utils as util from gemini_constants import * config = read_gemini_config() path_dirname = config["annotation_dir"] def get_ind_lof(c, args): idx_to_sample = util.map_indicies_to_samples(c) query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \ v.impact, v.aa_change, v.aa_length, \ v.gt_types, v.gts, i.gene, \ i.transcript, i.biotype\ FROM variants v, variant_impacts i \ WHERE v.variant_id = i.variant_id \ AND i.is_lof='1' \ AND v.type = 'snp'"
def sample_gene_interactions(c, args, idx_to_sample):
    """
    Print, for every sample, the variant genes that interact with
    ``args.gene`` within ``args.radius`` steps of the HPRD graph.

    The interaction graph is unpickled from ``hprd_interaction_graph`` in
    the annotation directory.  A breadth-first spanning tree rooted at
    ``args.gene`` (filtered by ``radius(args.radius)``) is written to
    ``file.dot`` and then passed to shortest_path(); the second value it
    returns is used as a gene -> order mapping.

    Parameters:
        c: forwarded unchanged to get_variant_genes()
           (presumably a database cursor -- TODO confirm).
        args: options object; ``gene``, ``radius`` and ``var_mode`` are
              read here and the object is passed to read_gemini_config().
        idx_to_sample: forwarded unchanged to get_variant_genes().

    Output (stdout, tab separated):
        var_mode   -> one row per matching variant: sample, root gene,
                      order, then fields 0..11 of the variant record.
        otherwise  -> one row per sample and order (0..radius) with the
                      comma separated matching genes, or "none".

    If ``args.gene`` is None or not a node of the graph, a message is
    written to stderr and nothing is printed to stdout.
    """
    # file.dot is still opened first and unconditionally, so it is
    # (re)created even when the gene cannot be resolved; the with-block
    # makes sure the handle is closed on every exit path.
    with open("file.dot", 'w') as out:
        #fetch variant gene dict for all samples
        samples = get_variant_genes(c, args, idx_to_sample)

        #locate the pickled hprd graph
        config = read_gemini_config(args=args)
        path_dirname = config["annotation_dir"]
        file_graph = os.path.join(path_dirname, 'hprd_interaction_graph')

        # NOTE(review): unpickling can execute arbitrary code; this is only
        # safe while the annotation directory holds trusted files.
        with open(file_graph, 'rb') as f:
            gr = cPickle.load(f)

        #calculate nodes from the graph
        hprd_genes = gr.nodes()
        if args.gene is None or args.gene not in hprd_genes:
            sys.stderr.write("Gene name not found or")
            sys.stderr.write(" gene not in p-p interaction file\n")
            return

        tree, _ = breadth_first_search(gr, root=args.gene,
                                       filter=radius(args.radius))
        gst = digraph()
        gst.add_spanning_tree(tree)
        out.write(write(gst))

    # sd: gene -> order (distance from args.gene in the spanning tree)
    _, sd = shortest_path(gst, args.gene)
    levels = range(0, (args.radius + 1))

    if args.var_mode:
        for sample in samples:
            var = samples[str(sample)]
            #for each level return interacting genes if they are
            # variants in the sample.
            # 0th order would be returned if the user chosen
            # gene is a variant in the sample
            for level in levels:
                for each in var:
                    # sd has one entry per gene, so a direct lookup
                    # replaces the former scan over every (gene, order).
                    if sd.get(each[0]) == level:
                        row = [str(sample), str(args.gene), str(level)]
                        row.extend([str(each[i]) for i in range(12)])
                        print("\t".join(row))
    else:
        for sample in samples:
            # built once per sample instead of once per graph key
            variant_genes = set(each[0] for each in samples[str(sample)])
            for level in levels:
                hits = [gene for gene, order in sd.items()
                        if order == level and gene in variant_genes]
                print("\t".join([str(sample), str(args.gene),
                                 str(level) + "_order:",
                                 ",".join(hits) if hits else "none"]))
def sample_lof_interactions(c, args, idx_to_sample, samples):
    """
    Print, for every sample, the variant genes that interact with each of
    the sample's loss-of-function genes within ``args.radius`` steps of
    the HPRD graph.

    For every LoF gene that is a node of the unpickled graph, a
    breadth-first spanning tree (filtered by ``radius(args.radius)``) is
    built and passed to shortest_path(); the second value it returns is
    used as a gene -> order mapping.  Orders start at 1, so the LoF gene
    itself (order 0) is never reported.

    Parameters:
        c: forwarded unchanged to get_lof_genes()
           (presumably a database cursor -- TODO confirm).
        args: options object; ``radius`` and ``var_mode`` are read here
              and the object is passed to read_gemini_config().
        idx_to_sample: forwarded unchanged to get_lof_genes().
        samples: mapping indexed by str(sample) whose values are
                 sequences of variant records; field 0 of a record is
                 compared against gene names.

    Output (stdout, tab separated):
        var_mode   -> one row per matching variant: sample, LoF gene,
                      order, then fields 0..11 of the variant record.
        otherwise  -> one row per sample, LoF gene and order with the
                      comma separated matching genes, or "none".
    """
    lof = get_lof_genes(c, args, idx_to_sample)

    #locate the pickled hprd graph
    config = read_gemini_config(args=args)
    path_dirname = config["annotation_dir"]
    file_graph = os.path.join(path_dirname, 'hprd_interaction_graph')

    # NOTE(review): unpickling can execute arbitrary code; this is only
    # safe while the annotation directory holds trusted files.
    with open(file_graph, 'rb') as f:
        gr = cPickle.load(f)

    # a set gives constant-time membership for the per-gene test below
    hprd_genes = set(gr.nodes())
    orders = range(1, (args.radius + 1))

    for sample in lof:
        lofvariants = list(set(lof[str(sample)]))
        var = samples[str(sample)]
        if not args.var_mode:
            # built once per sample instead of once per graph key
            variant_genes = set(each[0] for each in var)

        for gene in lofvariants:
            if gene not in hprd_genes:
                continue

            tree, _ = breadth_first_search(gr, root=gene,
                                           filter=radius(args.radius))
            gst = digraph()
            gst.add_spanning_tree(tree)
            # sd: gene -> order (distance from the LoF gene)
            _, sd = shortest_path(gst, gene)

            # for each level return interacting genes
            # if they are variants in the sample.
            for rad in orders:
                if args.var_mode:
                    for each in var:
                        # sd has one entry per gene, so a direct lookup
                        # replaces the scan over every (gene, order).
                        if sd.get(each[0]) == rad:
                            row = [str(sample), str(gene), str(rad)]
                            row.extend([str(each[i]) for i in range(12)])
                            print("\t".join(row))
                else:
                    hits = [key for key, order in sd.items()
                            if order == rad and key in variant_genes]
                    print("\t".join([str(sample), str(gene),
                                     str(rad) + "_order:",
                                     ",".join(hits) if hits else "none"]))
def sample_gene_interactions(res, args, idx_to_sample):
    """
    Print, for every sample, the variant genes lying within
    ``args.radius`` steps of ``args.gene`` in the interaction graph.

    The graph is a networkx DiGraph built from a file with one edge per
    line, the two node names separated by "|".  The file is
    ``args.edges`` when given, otherwise ``hprd_interaction_edges.gz``
    in the annotation directory.

    ``res`` and ``idx_to_sample`` are forwarded to get_variant_genes();
    ``args`` supplies ``edges``, ``gene``, ``radius`` and ``var_mode``.

    Output is tab separated on stdout: with ``var_mode`` one row per
    matching variant (sample, root gene, order, gene, fields 1..11 of
    the variant record); otherwise one row per sample and order
    (0..radius) with the comma separated matching genes, or "none".
    An unknown or missing gene only produces a message on stderr.
    """
    # fetch variant gene dict for all samples
    samples = get_variant_genes(res, args, idx_to_sample)

    # pick the edge file: user supplied, or the bundled hprd edges
    edge_path = args.edges
    if edge_path is None:
        anno_dir = read_gemini_config(args=args)["annotation_dir"]
        edge_path = os.path.join(anno_dir, 'hprd_interaction_edges.gz')

    gr = nx.DiGraph()
    for line in xopen(edge_path):
        gr.add_edge(*line.strip().split("|"))

    root = args.gene
    if root is None or root not in gr:
        sys.stderr.write("Gene name not found or")
        sys.stderr.write(" gene not in interaction file\n")
        return

    # gene -> number of steps from the root, limited to args.radius
    genes = nx.single_source_shortest_path_length(gr, root,
                                                  cutoff=args.radius)
    orders = range(0, (args.radius + 1))

    if args.var_mode:
        for sample in samples:
            sample_variants = samples[str(sample)]
            # for each level return interacting genes if they are
            # variants in the sample; the 0th order is reported when the
            # chosen gene itself is a variant in the sample
            for order in orders:
                for each in sample_variants:
                    for key, dist in genes.items():
                        if dist == order and key == each[0]:
                            row = [sample, root, order, key]
                            row.extend([each[i] for i in range(1, 12)])
                            print("\t".join([str(col) for col in row]))
        return

    for sample in samples:
        variant_genes = set(each[0] for each in samples[str(sample)])
        for order in orders:
            hits = [key for key, dist in genes.items()
                    if dist == order and key in variant_genes]
            if hits:
                listing = ",".join(hits)
            else:
                listing = "none"
            print("\t".join([str(sample), str(root),
                             str(order) + "_order:", listing]))
def sample_lof_interactions(res, args, idx_to_sample, samples):
    """
    Print, for every sample, the variant genes lying within
    ``args.radius`` steps of each of the sample's loss-of-function genes.

    The graph is a networkx DiGraph built from a file with one edge per
    line, the two node names separated by "|".  The file is
    ``args.edges`` when given, otherwise ``hprd_interaction_edges.gz``
    in the annotation directory.  LoF genes that are not in the graph
    are skipped.  Orders start at 1, so the LoF gene itself is never
    reported.

    ``res`` and ``idx_to_sample`` are forwarded to get_lof_genes();
    ``samples`` is indexed by str(sample) and yields variant records
    whose field 0 is compared against gene names.

    Output is tab separated on stdout: with ``var_mode`` one row per
    matching variant (sample, LoF gene, order, gene, fields 1..11 of the
    variant record); otherwise one row per sample, LoF gene and order
    with the comma separated matching genes, or "none".
    """
    lof = get_lof_genes(res, args, idx_to_sample)

    # pick the edge file: user supplied, or the bundled hprd edges
    edge_path = args.edges
    if edge_path is None:
        anno_dir = read_gemini_config(args=args)["annotation_dir"]
        edge_path = os.path.join(anno_dir, 'hprd_interaction_edges.gz')

    gr = nx.DiGraph()
    for line in xopen(edge_path):
        gr.add_edge(*line.strip().split("|"))

    if args.var_mode:
        for sample in lof:
            lof_genes = set(lof[str(sample)])
            sample_variants = samples[str(sample)]
            for gene in lof_genes:
                if gene not in gr:
                    continue
                # gene -> number of steps from the LoF gene
                genes = nx.single_source_shortest_path_length(
                    gr, gene, cutoff=args.radius)
                for rad in range(1, (args.radius + 1)):
                    for each in sample_variants:
                        for key, dist in genes.items():
                            if dist == rad and key == each[0]:
                                row = [sample, gene, rad, key]
                                row.extend([each[i] for i in range(1, 12)])
                                print("\t".join([str(col) for col in row]))
        return

    for sample in lof:
        lof_genes = set(lof[str(sample)])
        variant_genes = set(each[0] for each in samples[str(sample)])
        for gene in lof_genes:
            if gene not in gr:
                continue
            genes = nx.single_source_shortest_path_length(
                gr, gene, cutoff=args.radius)
            for rad in range(1, (args.radius + 1)):
                hits = [key for key, dist in genes.items()
                        if dist == rad and key in variant_genes]
                if hits:
                    listing = ",".join(hits)
                else:
                    listing = "none"
                print("\t".join([str(sample), str(gene),
                                 str(rad) + "_order:", listing]))
def sample_lof_interactions(c, args, idx_to_sample, samples):
    """
    Print, for every sample, the variant genes that interact with each of
    the sample's loss-of-function genes within ``args.radius`` steps of
    the HPRD graph.

    For every LoF gene that is a node of the unpickled graph, a
    breadth-first spanning tree (filtered by ``radius(args.radius)``) is
    built and passed to shortest_path(); the second value it returns is
    used as a gene -> order mapping.  Orders start at 1, so the LoF gene
    itself (order 0) is never reported.

    Parameters:
        c: forwarded unchanged to get_lof_genes()
           (presumably a database cursor -- TODO confirm).
        args: options object; ``radius`` and ``var_mode`` are read here.
        idx_to_sample: forwarded unchanged to get_lof_genes().
        samples: mapping indexed by str(sample) whose values are
                 sequences of variant records; field 0 of a record is
                 compared against gene names.

    Output (stdout, tab separated):
        var_mode   -> one row per matching variant: sample, LoF gene,
                      order, then fields 0..11 of the variant record.
        otherwise  -> one row per sample, LoF gene and order with the
                      comma separated matching genes, or "none".
    """
    lof = get_lof_genes(c, args, idx_to_sample)

    #locate the pickled hprd graph (configuration is read without args)
    config = read_gemini_config()
    path_dirname = config["annotation_dir"]
    file_graph = os.path.join(path_dirname, 'hprd_interaction_graph')

    # NOTE(review): unpickling can execute arbitrary code; this is only
    # safe while the annotation directory holds trusted files.
    with open(file_graph, 'rb') as f:
        gr = cPickle.load(f)

    # a set gives constant-time membership for the per-gene test below
    hprd_genes = set(gr.nodes())
    orders = range(1, (args.radius + 1))

    for sample in lof:
        lofvariants = list(set(lof[str(sample)]))
        var = samples[str(sample)]
        if not args.var_mode:
            # built once per sample instead of once per graph key
            variant_genes = set(each[0] for each in var)

        for gene in lofvariants:
            if gene not in hprd_genes:
                continue

            tree, _ = breadth_first_search(gr, root=gene,
                                           filter=radius(args.radius))
            gst = digraph()
            gst.add_spanning_tree(tree)
            # sd: gene -> order (distance from the LoF gene)
            _, sd = shortest_path(gst, gene)

            # for each level return interacting genes
            # if they are variants in the sample.
            for rad in orders:
                if args.var_mode:
                    for each in var:
                        # sd has one entry per gene, so a direct lookup
                        # replaces the scan over every (gene, order).
                        if sd.get(each[0]) == rad:
                            row = [str(sample), str(gene), str(rad)]
                            row.extend([str(each[i]) for i in range(12)])
                            print("\t".join(row))
                else:
                    hits = [key for key, order in sd.items()
                            if order == rad and key in variant_genes]
                    print("\t".join([str(sample), str(gene),
                                     str(rad) + "_order:",
                                     ",".join(hits) if hits else "none"]))