def install_annotation_files(anno_root_dir, dl_files=False, extra=None):
    """Record the annotation directory in the GEMINI config and optionally
    download the annotation files into it.

    anno_root_dir -- root directory supplied by the user; it is expanded to
                     the full gemini data path unless it already ends in one.
    dl_files      -- when true, create the directory if needed and download
                     the annotation files.
    extra         -- optional iterable of keys into ``extra_anno_files``
                     naming additional files to download.

    Exits via ``sys.exit`` when the resolved path exists but is not a
    directory.
    """
    data_suffixes = ("gemini/data", "gemini/data/", "gemini_data")
    gemini_suffixes = ("gemini", "gemini/")

    # Expand the user-provided root into the full gemini data path.
    if anno_root_dir.endswith(data_suffixes):
        target_dir = anno_root_dir
    elif anno_root_dir.endswith(gemini_suffixes):
        target_dir = os.path.join(anno_root_dir, "data")
    else:
        target_dir = os.path.join(anno_root_dir, "gemini", "data")

    # The config is rewritten on every call, whether or not we download.
    gemini_config = read_gemini_config(allow_missing=True)
    gemini_config["annotation_dir"] = os.path.abspath(target_dir)
    gemini_config["versions"] = anno_versions
    write_gemini_config(gemini_config)

    if not dl_files:
        return

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    if not os.path.isdir(target_dir):
        sys.exit(target_dir + " is not a valid directory.")
    _check_dependencies()

    # Work on a copy so the module-level list is never mutated.
    downloads = anno_files[:]
    if extra:
        downloads = downloads + [extra_anno_files[key] for key in extra]
    _download_anno_files("https://s3.amazonaws.com/gemini-annotations",
                         downloads, target_dir, gemini_config)
def update_cosmic_census_genes(session, args):
    """
    Update the gene summary table with whether or not a given gene is
    in the COSMIC cancer gene census.

    The census is read from ``cancer_gene_census.20140120.tsv`` inside the
    annotation directory named by the GEMINI config. Every line of the file
    is treated as a tab-separated record: the first column is used as the
    gene and the fourth column, prefixed with "chr", as the chromosome.

    session -- handle passed through unchanged to ``database_cassandra``.
    args    -- passed to ``read_gemini_config`` to locate the config.
    """
    config = read_gemini_config(args=args)
    census_path = os.path.join(config["annotation_dir"],
                               "cancer_gene_census.20140120.tsv")

    cosmic_census_genes = []
    # Use a context manager so the file is closed even when a malformed
    # row raises part-way through the loop.
    with open(census_path, "r") as census_file:
        for line in census_file:
            fields = line.strip().split("\t")
            gene = fields[0]
            chrom = "chr" + fields[3]
            # NOTE(review): the leading 1 is presumably the "in census"
            # flag -- confirm against update_gene_summary_w_cancer_census.
            cosmic_census_genes.append((1, gene, chrom))

    database_cassandra.update_gene_summary_w_cancer_census(
        session, cosmic_census_genes)
def update_cosmic_census_genes(session, args):
    """
    Update the gene summary table with whether or not a given gene is
    in the COSMIC cancer gene census.

    Reads 'cancer_gene_census.20140120.tsv' from the annotation directory
    named by the GEMINI config, builds one ``(1, gene, chrom)`` tuple per
    line (gene from the first tab-separated column, chromosome from the
    fourth column with a "chr" prefix) and hands the list to
    ``database_cassandra.update_gene_summary_w_cancer_census``.

    session -- handle passed through unchanged to ``database_cassandra``.
    args    -- passed to ``read_gemini_config`` to locate the config.
    """
    config = read_gemini_config(args=args)
    path_dirname = config["annotation_dir"]
    census_path = os.path.join(path_dirname, 'cancer_gene_census.20140120.tsv')

    cosmic_census_genes = []
    # The with-block guarantees the file handle is released, including
    # when a short row raises IndexError inside the loop.
    with open(census_path, 'r') as handle:
        for line in handle:
            fields = line.strip().split("\t")
            gene = fields[0]
            chrom = "chr" + fields[3]
            cosmic_census_genes.append((1, gene, chrom))

    database_cassandra.update_gene_summary_w_cancer_census(
        session, cosmic_census_genes)
def get_anno_files(args):
    """Map annotation names to file paths inside the annotation directory.

    The directory is taken from the "annotation_dir" entry of the GEMINI
    config read with ``args``. Default annotations are always included in
    the result without checking that the file exists; the optional ones
    ('gerp_bp', 'cadd_score') are included only when their file is present.
    """
    anno_dirname = read_gemini_config(args=args)["annotation_dir"]

    # Default annotations -- always part of the result.
    default_files = (
        ('pfam_domain', 'hg19.pfam.ucscgenes.bed.gz'),
        ('cytoband', 'hg19.cytoband.bed.gz'),
        ('dbsnp', 'dbsnp.b141.20140813.hg19.vcf.gz'),
        ('clinvar', 'clinvar_20140807.vcf.gz'),
        ('gwas', 'hg19.gwas.bed.gz'),
        ('rmsk', 'hg19.rmsk.bed.gz'),
        ('segdup', 'hg19.segdup.bed.gz'),
        ('conserved',
         '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz'),
        ('cpg_island', 'hg19.CpG.bed.gz'),
        ('dgv', 'hg19.dgv.bed.gz'),
        ('esp', 'ESP6500SI.all.snps_indels.vcf.gz'),
        ('1000g',
         'ALL.autosomes.phase3_shapeit2_mvncall_integrated_v5.20130502.sites.vcf.gz'),
        ('recomb', 'genetic_map_HapMapII_GRCh37.gz'),
        ('gms', 'GRCh37-gms-mappability.vcf.gz'),
        ('grc', 'GRC_patch_regions.bed.gz'),
        ('cse', "cse-hiseq-8_4-2013-02-20.bed.gz"),
        ('encode_tfbs', 'wgEncodeRegTfbsClusteredV2.cell_count.20130213.bed.gz'),
        ('encode_dnase1', 'stam.125cells.dnaseI.hg19.bed.gz'),
        ('encode_consensus_segs', 'encode.6celltypes.consensus.bedg.gz'),
        ('gerp_elements', 'hg19.gerp.elements.bed.gz'),
        ('vista_enhancers', 'hg19.vista.enhancers.20131108.bed.gz'),
        ('fitcons', "hg19_fitcons_fc-i6-0_V1-01.bw"),
        ('cosmic', 'cosmic-v68-GRCh37.vcf.gz'),
        ('exac', 'ExAC.r0.2.sites.vep.vcf.gz'),
    )

    # Optional annotations -- only reported when the file is on disk.
    optional_files = (
        ('gerp_bp', 'hg19.gerp.bw'),
        ('cadd_score', 'whole_genome_SNVs.tsv.compressed.gz'),
    )

    annos = {}
    for label, filename in default_files:
        annos[label] = os.path.join(anno_dirname, filename)

    for label, filename in optional_files:
        candidate = os.path.join(anno_dirname, filename)
        if os.path.exists(candidate):
            annos[label] = candidate

    return annos
#!/usr/bin/env python
"""
Print the path of the GEMINI data-dir recorded in the GEMINI config
"""
from geminicassandra.config import read_gemini_config

config = read_gemini_config()
anno_dirname = config["annotation_dir"]
# Parenthesised single-argument form: identical output on Python 2 and a
# valid call on Python 3, where the bare print statement is a syntax error.
print(anno_dirname)
def get_anno_files(args):
    """Return a dict of annotation name -> path under the annotation dir.

    The annotation directory comes from the "annotation_dir" entry of the
    GEMINI config loaded with ``args``. The default entries are returned
    unconditionally (their existence is not checked); 'gerp_bp' and
    'cadd_score' are added only if the corresponding file exists.
    """
    config = read_gemini_config(args=args)
    anno_dirname = config["annotation_dir"]

    def in_anno_dir(filename):
        # Every annotation lives directly inside the annotation directory.
        return os.path.join(anno_dirname, filename)

    # Default annotations -- always present in the returned mapping.
    annos = {
        'pfam_domain': in_anno_dir('hg19.pfam.ucscgenes.bed.gz'),
        'cytoband': in_anno_dir('hg19.cytoband.bed.gz'),
        'dbsnp': in_anno_dir('dbsnp.b141.20140813.hg19.vcf.gz'),
        'clinvar': in_anno_dir('clinvar_20140807.vcf.gz'),
        'gwas': in_anno_dir('hg19.gwas.bed.gz'),
        'rmsk': in_anno_dir('hg19.rmsk.bed.gz'),
        'segdup': in_anno_dir('hg19.segdup.bed.gz'),
        'conserved': in_anno_dir(
            '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz'),
        'cpg_island': in_anno_dir('hg19.CpG.bed.gz'),
        'dgv': in_anno_dir('hg19.dgv.bed.gz'),
        'esp': in_anno_dir('ESP6500SI.all.snps_indels.vcf.gz'),
        '1000g': in_anno_dir(
            'ALL.autosomes.phase3_shapeit2_mvncall_integrated_v5.20130502.sites.vcf.gz'),
        'recomb': in_anno_dir('genetic_map_HapMapII_GRCh37.gz'),
        'gms': in_anno_dir('GRCh37-gms-mappability.vcf.gz'),
        'grc': in_anno_dir('GRC_patch_regions.bed.gz'),
        'cse': in_anno_dir("cse-hiseq-8_4-2013-02-20.bed.gz"),
        'encode_tfbs': in_anno_dir(
            'wgEncodeRegTfbsClusteredV2.cell_count.20130213.bed.gz'),
        'encode_dnase1': in_anno_dir('stam.125cells.dnaseI.hg19.bed.gz'),
        'encode_consensus_segs': in_anno_dir(
            'encode.6celltypes.consensus.bedg.gz'),
        'gerp_elements': in_anno_dir('hg19.gerp.elements.bed.gz'),
        'vista_enhancers': in_anno_dir('hg19.vista.enhancers.20131108.bed.gz'),
        'fitcons': in_anno_dir("hg19_fitcons_fc-i6-0_V1-01.bw"),
        'cosmic': in_anno_dir('cosmic-v68-GRCh37.vcf.gz'),
        'exac': in_anno_dir('ExAC.r0.2.sites.vep.vcf.gz'),
    }

    # Optional annotations -- added only when the file is on disk.
    gerp_bp_path = in_anno_dir('hg19.gerp.bw')
    if os.path.exists(gerp_bp_path):
        annos['gerp_bp'] = gerp_bp_path

    cadd_path = in_anno_dir('whole_genome_SNVs.tsv.compressed.gz')
    if os.path.exists(cadd_path):
        annos['cadd_score'] = cadd_path

    return annos