Example #1
0
def install_annotation_files(anno_root_dir, dl_files=False, extra=None):
    """Record the annotation directory in the gemini config and,
    on request, download the annotation files into it.

    anno_root_dir -- directory supplied by the user. It is used as-is
        when it already ends in a gemini data path, gets "data"
        appended when it ends in "gemini", and gets "gemini/data"
        appended otherwise.
    dl_files -- when true, create the directory if needed and download
        the annotation files. The config is written either way.
    extra -- optional iterable of keys into extra_anno_files naming
        additional files to download.

    Exits through sys.exit when the resolved path exists but is not
    a directory.
    """
    data_endings = ("gemini/data", "gemini/data/", "gemini_data")
    gemini_endings = ("gemini", "gemini/")

    # Resolve the full gemini data path from the user's root dir.
    if anno_root_dir.endswith(data_endings):
        anno_dir = anno_root_dir
    elif anno_root_dir.endswith(gemini_endings):
        anno_dir = os.path.join(anno_root_dir, "data")
    else:
        anno_dir = os.path.join(anno_root_dir, "gemini", "data")

    # Persist the location and annotation versions before any download.
    cur_config = read_gemini_config(allow_missing=True)
    cur_config["annotation_dir"] = os.path.abspath(anno_dir)
    cur_config["versions"] = anno_versions
    write_gemini_config(cur_config)

    if not dl_files:
        return

    if not os.path.exists(anno_dir):
        os.makedirs(anno_dir)
    if not os.path.isdir(anno_dir):
        sys.exit(anno_dir + " is not a valid directory.")
    _check_dependencies()

    # Work on a copy so the module-level file list is never mutated.
    to_dl = anno_files[:]
    if extra:
        requested_extras = [extra_anno_files[x] for x in extra]
        to_dl += requested_extras
    _download_anno_files("https://s3.amazonaws.com/gemini-annotations",
                         to_dl, anno_dir, cur_config)
Example #2
0
def update_cosmic_census_genes(session, args):
    """
    Update the gene summary table with
    whether or not a given gene is in the
    COSMIC cancer gene census

    session -- passed through unchanged to
        database_cassandra.update_gene_summary_w_cancer_census.
    args -- forwarded to read_gemini_config to locate the
        annotation directory.

    Every line of the census TSV contributes one (1, gene, chrom)
    tuple, where gene is column 0 and chrom is "chr" + column 3.
    """
    config = read_gemini_config(args=args)
    path_dirname = config["annotation_dir"]
    # Named census_path rather than `file` to avoid shadowing the builtin.
    census_path = os.path.join(path_dirname, "cancer_gene_census.20140120.tsv")

    cosmic_census_genes = []
    # The context manager closes the handle even if a row fails to parse;
    # the bare open() in a for-loop left that to the garbage collector.
    with open(census_path, "r") as census:
        for line in census:
            fields = line.strip().split("\t")
            gene = fields[0]
            chrom = "chr" + fields[3]
            # NOTE(review): the leading 1 presumably flags "in census";
            # confirm against update_gene_summary_w_cancer_census.
            cosmic_census_genes.append((1, gene, chrom))
    database_cassandra.update_gene_summary_w_cancer_census(session, cosmic_census_genes)
Example #3
0
def update_cosmic_census_genes(session, args):
    """
    Update the gene summary table with
    whether or not a given gene is in the
    COSMIC cancer gene census

    session -- passed through unchanged to
        database_cassandra.update_gene_summary_w_cancer_census.
    args -- forwarded to read_gemini_config to locate the
        annotation directory.

    Every line of the census TSV contributes one (1, gene, chrom)
    tuple, where gene is column 0 and chrom is "chr" + column 3.
    """
    config = read_gemini_config(args=args)
    path_dirname = config["annotation_dir"]
    # Named census_path rather than `file` to avoid shadowing the builtin.
    census_path = os.path.join(path_dirname, 'cancer_gene_census.20140120.tsv')

    cosmic_census_genes = []
    # The context manager closes the handle even if a row fails to parse;
    # the bare open() in a for-loop left that to the garbage collector.
    with open(census_path, 'r') as census:
        for line in census:
            fields = line.strip().split("\t")
            gene = fields[0]
            chrom = "chr" + fields[3]
            # NOTE(review): the leading 1 presumably flags "in census";
            # confirm against update_gene_summary_w_cancer_census.
            cosmic_census_genes.append((1, gene, chrom))
    database_cassandra.update_gene_summary_w_cancer_census(
        session, cosmic_census_genes)
Example #4
0
def get_anno_files(args):
    """Map each annotation name to the path of its file.

    The annotation directory is read from the gemini config (looked
    up with args). Default annotations are always included in the
    result; the optional ones are added only when their file exists
    on disk.
    """
    anno_dirname = read_gemini_config(args=args)["annotation_dir"]

    def in_anno_dir(filename):
        # Every annotation file sits directly in the annotation dir.
        return os.path.join(anno_dirname, filename)

    # Default annotations -- always found
    default_files = (
        ('pfam_domain', 'hg19.pfam.ucscgenes.bed.gz'),
        ('cytoband', 'hg19.cytoband.bed.gz'),
        ('dbsnp', 'dbsnp.b141.20140813.hg19.vcf.gz'),
        ('clinvar', 'clinvar_20140807.vcf.gz'),
        ('gwas', 'hg19.gwas.bed.gz'),
        ('rmsk', 'hg19.rmsk.bed.gz'),
        ('segdup', 'hg19.segdup.bed.gz'),
        ('conserved', '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz'),
        ('cpg_island', 'hg19.CpG.bed.gz'),
        ('dgv', 'hg19.dgv.bed.gz'),
        ('esp', 'ESP6500SI.all.snps_indels.vcf.gz'),
        ('1000g', 'ALL.autosomes.phase3_shapeit2_mvncall_integrated_v5.20130502.sites.vcf.gz'),
        ('recomb', 'genetic_map_HapMapII_GRCh37.gz'),
        ('gms', 'GRCh37-gms-mappability.vcf.gz'),
        ('grc', 'GRC_patch_regions.bed.gz'),
        ('cse', "cse-hiseq-8_4-2013-02-20.bed.gz"),
        ('encode_tfbs', 'wgEncodeRegTfbsClusteredV2.cell_count.20130213.bed.gz'),
        ('encode_dnase1', 'stam.125cells.dnaseI.hg19.bed.gz'),
        ('encode_consensus_segs', 'encode.6celltypes.consensus.bedg.gz'),
        ('gerp_elements', 'hg19.gerp.elements.bed.gz'),
        ('vista_enhancers', 'hg19.vista.enhancers.20131108.bed.gz'),
        ('fitcons', "hg19_fitcons_fc-i6-0_V1-01.bw"),
        ('cosmic', 'cosmic-v68-GRCh37.vcf.gz'),
        ('exac', 'ExAC.r0.2.sites.vep.vcf.gz'),
    )
    annos = dict((key, in_anno_dir(name)) for key, name in default_files)

    # optional annotations
    optional_files = (
        ('gerp_bp', 'hg19.gerp.bw'),
        ('cadd_score', 'whole_genome_SNVs.tsv.compressed.gz'),
    )
    for key, name in optional_files:
        candidate = in_anno_dir(name)
        if os.path.exists(candidate):
            annos[key] = candidate
    return annos
Example #5
0
#!/usr/bin/env python
"""
Print the path of the GEMINI data-dir recorded in the gemini config
"""

from geminicassandra.config import read_gemini_config

config = read_gemini_config()
anno_dirname = config["annotation_dir"]

# Parenthesised single-argument form: prints the same text under the
# Python 2 print statement and the Python 3 print() function.
print(anno_dirname)



Example #6
0
def get_anno_files(args):
    """Return a dict mapping annotation names to file paths.

    Paths are built under the "annotation_dir" entry of the gemini
    config obtained with args. The default annotations are always
    present in the result; 'gerp_bp' and 'cadd_score' are added only
    if their files exist.
    """
    config = read_gemini_config(args=args)
    anno_dirname = config["annotation_dir"]

    # Default annotations -- always found
    default_names = {
        'pfam_domain': 'hg19.pfam.ucscgenes.bed.gz',
        'cytoband': 'hg19.cytoband.bed.gz',
        'dbsnp': 'dbsnp.b141.20140813.hg19.vcf.gz',
        'clinvar': 'clinvar_20140807.vcf.gz',
        'gwas': 'hg19.gwas.bed.gz',
        'rmsk': 'hg19.rmsk.bed.gz',
        'segdup': 'hg19.segdup.bed.gz',
        'conserved':
        '29way_pi_lods_elements_12mers.chr_specific.fdr_0.1_with_scores.txt.hg19.merged.bed.gz',
        'cpg_island': 'hg19.CpG.bed.gz',
        'dgv': 'hg19.dgv.bed.gz',
        'esp': 'ESP6500SI.all.snps_indels.vcf.gz',
        '1000g':
        'ALL.autosomes.phase3_shapeit2_mvncall_integrated_v5.20130502.sites.vcf.gz',
        'recomb': 'genetic_map_HapMapII_GRCh37.gz',
        'gms': 'GRCh37-gms-mappability.vcf.gz',
        'grc': 'GRC_patch_regions.bed.gz',
        'cse': "cse-hiseq-8_4-2013-02-20.bed.gz",
        'encode_tfbs': 'wgEncodeRegTfbsClusteredV2.cell_count.20130213.bed.gz',
        'encode_dnase1': 'stam.125cells.dnaseI.hg19.bed.gz',
        'encode_consensus_segs': 'encode.6celltypes.consensus.bedg.gz',
        'gerp_elements': 'hg19.gerp.elements.bed.gz',
        'vista_enhancers': 'hg19.vista.enhancers.20131108.bed.gz',
        'fitcons': "hg19_fitcons_fc-i6-0_V1-01.bw",
        'cosmic': 'cosmic-v68-GRCh37.vcf.gz',
        'exac': 'ExAC.r0.2.sites.vep.vcf.gz',
    }
    annos = {}
    for anno_key in default_names:
        annos[anno_key] = os.path.join(anno_dirname, default_names[anno_key])

    # optional annotations
    gerp_bp_path = os.path.join(anno_dirname, 'hg19.gerp.bw')
    if os.path.exists(gerp_bp_path):
        annos['gerp_bp'] = gerp_bp_path
    cadd_path = os.path.join(anno_dirname,
                             'whole_genome_SNVs.tsv.compressed.gz')
    if os.path.exists(cadd_path):
        annos['cadd_score'] = cadd_path
    return annos