def get_genes_from_gencode_gtf(gtf_file):
    """
    Parse a gencode GTF stream and yield one dict per 'gene' feature.

    Each dict carries: gene_id (version suffix stripped), gene_name,
    gene_name_upper, chrom, start, stop, strand, xstart, xstop.
    """
    for raw in gtf_file:
        if raw.startswith('#'):
            continue
        cols = raw.strip('\n').split('\t')
        if cols[2] != 'gene':
            continue
        chrom = cols[0][3:]  # drop the first 3 chars ('chr' prefix)
        begin = int(cols[3]) + 1  # bed files are 0-indexed
        end = int(cols[4]) + 1
        # Column 9 is 'key "value"; key "value"; ...'
        attrs = {}
        for chunk in cols[8].split(';'):
            if chunk == '':
                continue
            key, value = chunk.strip().split()
            attrs[key] = value.strip('"')
        name = attrs['gene_name']
        yield {
            'gene_id': attrs['gene_id'].split('.')[0],
            'gene_name': name,
            'gene_name_upper': name.upper(),
            'chrom': chrom,
            'start': begin,
            'stop': end,
            'strand': cols[6],
            'xstart': xbrowse.get_xpos(chrom, begin),
            'xstop': xbrowse.get_xpos(chrom, end),
        }
def get_genes_from_gencode_gtf(gtf_file):
    """
    Parse a gencode GTF file and yield a dict for every 'gene' record.
    """
    for line in gtf_file:
        if line.startswith('#'):
            continue
        fields = line.strip('\n').split('\t')
        if fields[2] != 'gene':
            continue
        chrom = fields[0][3:]  # strip 3-char 'chr' prefix
        # NOTE(review): GTF coordinates are already 1-based; the +1 was
        # commented as a BED adjustment in the original — preserved as-is.
        start = int(fields[3]) + 1
        stop = int(fields[4]) + 1
        pairs = (x.strip().split() for x in fields[8].split(';') if x != '')
        info = {key: value.strip('"') for key, value in pairs}
        yield {
            'gene_id': info['gene_id'].split('.')[0],
            'gene_name': info['gene_name'],
            'chrom': chrom,
            'start': start,
            'stop': stop,
            'strand': fields[6],
            'xstart': xbrowse.get_xpos(chrom, start),
            'xstop': xbrowse.get_xpos(chrom, stop),
        }
def get_exons_from_gencode_gtf(gtf_file):
    """
    Parse a gencode GTF file and yield a dict for every exon / CDS / UTR
    feature, keyed by transcript and gene (version suffixes stripped).
    """
    wanted = ('exon', 'CDS', 'UTR')
    for line in gtf_file:
        if line.startswith('#'):
            continue
        fields = line.strip('\n').split('\t')
        feature_type = fields[2]
        if feature_type not in wanted:
            continue
        chrom = fields[0][3:]  # strip 3-char 'chr' prefix
        start = int(fields[3]) + 1  # bed files are 0-indexed
        stop = int(fields[4]) + 1
        pairs = (x.strip().split() for x in fields[8].split(';') if x != '')
        info = {key: value.strip('"') for key, value in pairs}
        yield {
            'feature_type': feature_type,
            'transcript_id': info['transcript_id'].split('.')[0],
            'gene_id': info['gene_id'].split('.')[0],
            'chrom': chrom,
            'start': start,
            'stop': stop,
            'strand': fields[6],
            'xstart': xbrowse.get_xpos(chrom, start),
            'xstop': xbrowse.get_xpos(chrom, stop),
        }
def get_genes_in_region(db, chrom, start, stop):
    """
    Return a list of gene docs that overlap [start, stop] on chrom.

    Overlap test: gene.xstart <= region.xstop AND gene.xstop >= region.xstart.
    """
    region_start = get_xpos(chrom, start)
    region_stop = get_xpos(chrom, stop)
    query = {
        'xstart': {'$lte': region_stop},
        'xstop': {'$gte': region_start},
    }
    return [gene for gene in db.genes.find(query, fields={'_id': False})]
def get_variants_in_region(db, chrom, start, stop):
    """
    Variants that overlap a region (capped at SEARCH_LIMIT results).

    Unclear if this will include CNVs.
    """
    region_start = get_xpos(chrom, start)
    region_stop = get_xpos(chrom, stop)
    cursor = db.variants.find(
        {
            # start of variant before (or equal to) end of region,
            # end of variant after (or equal to) start of region
            'xstart': {'$lte': region_stop},
            'xstop': {'$gte': region_start},
        },
        fields={'_id': False},
        limit=SEARCH_LIMIT)
    return [v for v in cursor]
def get_variants_in_region(db, chrom, start, stop):
    """
    Variants whose xpos falls inside the region (capped at SEARCH_LIMIT),
    with consequence annotations attached. Unclear if this includes CNVs.
    """
    lo = get_xpos(chrom, start)
    hi = get_xpos(chrom, stop)
    variants = list(
        db.variants.find(
            {'xpos': {'$gte': lo, '$lte': hi}},
            fields={'_id': False},
            limit=SEARCH_LIMIT))
    # Mutates the variant dicts in place.
    add_consequence_to_variants(variants)
    # A shallow copy of the list is returned (matches original behavior).
    return list(variants)
def get_base_coverage_from_file(base_coverage_file):
    """
    Read a base coverage file and yield dicts that look like:
    {
        'xpos': 1e9+1,
        'mean': 0.0, 'median': 0.0,
        '1': 0.0, '5': 0.0, '10': 0.0, '15': 0.0, '20': 0.0,
        '25': 0.0, '30': 0.0, '50': 0.0, '100': 0.0,
    }
    """
    depth_columns = ['mean', 'median',
                     '1', '5', '10', '15', '20', '25', '30', '50', '100']
    for line in base_coverage_file:
        if line.startswith('#'):
            continue
        fields = line.strip('\n').split('\t')
        pos = int(fields[1])
        record = {
            'xpos': xbrowse.get_xpos(fields[0], pos),
            'pos': pos,
        }
        # Depth stats start at column 3 (index 2).
        for col, name in enumerate(depth_columns, start=2):
            record[name] = float(fields[col])
        yield record
def get_base_coverage_from_file(base_coverage_file):
    """
    Read a base coverage file and yield dicts of the form:
    {'xpos': ..., 'pos': ..., 'mean': ..., 'median': ...,
     '1': ..., '5': ..., '10': ..., '15': ..., '20': ...,
     '25': ..., '30': ..., '50': ..., '100': ...}
    """
    header = [
        'mean', 'median',
        '1', '5', '10', '15', '20', '25', '30', '50', '100',
    ]
    for row in base_coverage_file:
        if row.startswith('#'):
            continue
        cells = row.strip('\n').split('\t')
        out = {
            'xpos': xbrowse.get_xpos(cells[0], int(cells[1])),
            'pos': int(cells[1]),
        }
        # Numeric columns follow chrom/pos, in header order.
        for idx, label in enumerate(header):
            out[label] = float(cells[idx + 2])
        yield out
def get_snp_from_dbsnp_file(dbsnp_file):
    """
    Yield {'xpos', 'rsid'} dicts from a dbSNP flat file.

    Pseudo-autosomal ('PAR') records are skipped.
    """
    for line in dbsnp_file:
        fields = line.split('\t')
        rsid = int(fields[0])
        # rstrip('T') maps e.g. 'MT' -> 'M' — presumably to match xbrowse
        # chromosome naming; TODO confirm against xbrowse.get_xpos.
        chrom = fields[1].rstrip('T')
        if chrom == 'PAR':
            continue
        pos = int(fields[2]) + 1  # file positions are 0-based
        yield {'xpos': xbrowse.get_xpos(chrom, pos), 'rsid': rsid}
def region_page(region_id): db = get_db() try: region = region_id.split('-') cache_key = 't-region-{}'.format(region_id) t = cache.get(cache_key) if t is None: chrom = region[0] start = None stop = None if len(region) == 3: chrom, start, stop = region start = int(start) stop = int(stop) if start is None or stop - start > REGION_LIMIT: return render_template( 'region.html', genes_in_region=None, variants_in_region=None, chrom=chrom, start=start, stop=stop, coverage=None ) genes_in_region = lookups.get_genes_in_region(db, chrom, start, stop) variants_in_region = lookups.get_variants_in_region(db, chrom, start, stop) xstart = xbrowse.get_xpos(chrom, start) xstop = xbrowse.get_xpos(chrom, stop) coverage_array = lookups.get_coverage_for_bases(db, xstart, xstop) t = render_template( 'region.html', genes_in_region=genes_in_region, variants_in_region=variants_in_region, chrom=chrom, start=start, stop=stop, coverage=coverage_array ) print 'Rendering region: %s' % region_id return t except Exception, e: print 'Failed on region:', region_id, ';Error=', e abort(404)
def region_page(region_id): db = get_db() try: region = region_id.split('-') cache_key = 't-region-{}'.format(region_id) t = cache.get(cache_key) if t is None: chrom = region[0] start = None stop = None if len(region) == 3: chrom, start, stop = region start = int(start) stop = int(stop) if start is None or stop - start > REGION_LIMIT: return render_template('region.html', genes_in_region=None, variants_in_region=None, chrom=chrom, start=start, stop=stop, coverage=None) genes_in_region = lookups.get_genes_in_region( db, chrom, start, stop) variants_in_region = lookups.get_variants_in_region( db, chrom, start, stop) xstart = xbrowse.get_xpos(chrom, start) xstop = xbrowse.get_xpos(chrom, stop) coverage_array = lookups.get_coverage_for_bases(db, xstart, xstop) t = render_template('region.html', genes_in_region=genes_in_region, variants_in_region=variants_in_region, chrom=chrom, start=start, stop=stop, coverage=coverage_array) print 'Rendering region: %s' % region_id return t except Exception, e: print 'Failed on region:', region_id, ';Error=', e abort(404)
def get_snp_from_dbsnp_file(dbsnp_file):
    """
    Parse a dbSNP flat file, yielding one {'xpos', 'rsid'} dict per line.

    Lines whose chromosome field is 'PAR' (pseudo-autosomal) are skipped.
    """
    for row in dbsnp_file:
        cells = row.split('\t')
        rsid = int(cells[0])
        # Trailing-'T' strip turns 'MT' into 'M' — presumably to match the
        # xbrowse chromosome convention; verify against get_xpos.
        chrom = cells[1].rstrip('T')
        if chrom == 'PAR':
            continue
        start = int(cells[2]) + 1  # positions in the file are 0-based
        snp = {
            'xpos': xbrowse.get_xpos(chrom, start),
            'rsid': rsid,
        }
        yield snp
def variant_page(variant_str): db = get_db() try: chrom, pos, ref, alt = variant_str.split('-') pos = int(pos) # pos, ref, alt = get_minimal_representation(pos, ref, alt) xpos = xbrowse.get_xpos(chrom, pos) variant = lookups.get_variant(db, xpos, ref, alt) if variant is None: variant = { 'chrom': chrom, 'pos': pos, 'xpos': xpos, 'ref': ref, 'alt': alt } consequences = None ordered_csqs = None if 'vep_annotations' in variant: variant['vep_annotations'] = order_vep_by_csq( variant['vep_annotations']) # Adds major_consequence ordered_csqs = [ x['major_consequence'] for x in variant['vep_annotations'] ] ordered_csqs = reduce( lambda x, y: ','.join([x, y]) if y not in x else x, ordered_csqs, '').split(',') # Close but not quite there consequences = defaultdict(lambda: defaultdict(list)) for annotation in variant['vep_annotations']: annotation['HGVS'] = get_proper_hgvs(annotation) consequences[annotation['major_consequence']][ annotation['Gene']].append(annotation) base_coverage = lookups.get_coverage_for_bases(db, xpos, xpos + len(ref) - 1) any_covered = any([x['has_coverage'] for x in base_coverage]) metrics = lookups.get_metrics(db, variant) print 'Rendering variant: %s' % variant_str return render_template('variant.html', variant=variant, base_coverage=base_coverage, consequences=consequences, any_covered=any_covered, ordered_csqs=ordered_csqs, metrics=metrics) except Exception, e: print 'Failed on variant:', variant_str, '; Error=', traceback.format_exc( ) abort(404)
def variant_page(variant_str): db = get_db() try: chrom, pos, ref, alt = variant_str.split('-') pos = int(pos) # pos, ref, alt = get_minimal_representation(pos, ref, alt) xpos = xbrowse.get_xpos(chrom, pos) variant = lookups.get_variant(db, xpos, ref, alt) if variant is None: variant = { 'chrom': chrom, 'pos': pos, 'xpos': xpos, 'ref': ref, 'alt': alt } consequences = None ordered_csqs = None if 'vep_annotations' in variant: variant['vep_annotations'] = order_vep_by_csq(variant['vep_annotations']) # Adds major_consequence ordered_csqs = [x['major_consequence'] for x in variant['vep_annotations']] ordered_csqs = reduce(lambda x, y: ','.join([x, y]) if y not in x else x, ordered_csqs, '').split(',') # Close but not quite there consequences = defaultdict(lambda: defaultdict(list)) for annotation in variant['vep_annotations']: annotation['HGVS'] = get_proper_hgvs(annotation) consequences[annotation['major_consequence']][annotation['Gene']].append(annotation) base_coverage = lookups.get_coverage_for_bases(db, xpos, xpos + len(ref) - 1) any_covered = any([x['has_coverage'] for x in base_coverage]) metrics = lookups.get_metrics(db, variant) print 'Rendering variant: %s' % variant_str return render_template( 'variant.html', variant=variant, base_coverage=base_coverage, consequences=consequences, any_covered=any_covered, ordered_csqs=ordered_csqs, metrics=metrics ) except Exception, e: print 'Failed on variant:', variant_str, '; Error=', traceback.format_exc() abort(404)
def get_variants_from_sites_vcf(sites_vcf):
    """
    Parse an exac sites VCF and yield one variant dict per alt allele.

    sites_vcf is an open (gzipped) file object, not a file path. The CSQ
    header line must precede the data lines, since it declares the layout
    of the per-annotation VEP fields.
    """
    vep_field_names = None
    for line in sites_vcf:
        line = line.strip('\n')
        if line.startswith('##INFO=<ID=CSQ'):
            # Header declares the pipe-separated VEP annotation fields.
            vep_field_names = line.split('Format: ')[-1].strip('">').split('|')
        if line.startswith('#'):
            continue

        # Variant line. This elegant parsing code below is copied from
        # https://github.com/konradjk/loftee
        fields = line.split('\t')
        info_field = dict(
            (x.split('=', 1)) if '=' in x else (x, x)
            for x in re.split(';(?=\w)', fields[7]))
        consequence_array = info_field['CSQ'].split(',') if 'CSQ' in info_field else []
        # Only keep annotations whose field count matches the header.
        annotations = [
            dict(zip(vep_field_names, x.split('|')))
            for x in consequence_array
            if len(vep_field_names) == len(x.split('|'))
        ]
        coding_annotations = [a for a in annotations if a['Feature'].startswith('ENST')]

        alt_alleles = fields[4].split(',')
        # Emit a separate variant for each alt allele.
        for i, alt_allele in enumerate(alt_alleles):
            vep_annotations = [
                a for a in coding_annotations if int(a['ALLELE_NUM']) == i + 1
            ]
            pos, ref, alt = get_minimal_representation(fields[1], fields[3], alt_allele)
            chrom = fields[0]
            xpos = xbrowse.get_xpos(chrom, pos)
            variant = {
                'chrom': chrom,
                'pos': pos,
                'rsid': fields[2],
                'xpos': xpos,
                'ref': ref,
                'alt': alt,
                'xstart': xpos,
                'xstop': xpos + len(alt) - len(ref),
                'variant_id': '{}-{}-{}-{}'.format(chrom, pos, ref, alt),
                'orig_alt_alleles': [
                    '{}-{}-{}-{}'.format(
                        chrom, *get_minimal_representation(fields[1], fields[3], x))
                    for x in alt_alleles
                ],
                'site_quality': float(fields[5]),
                'filter': fields[6],
                'vep_annotations': vep_annotations,
            }
            variant['allele_count'] = int(info_field['AC_Adj'].split(',')[i])
            if not variant['allele_count'] and variant['filter'] == 'PASS':
                variant['filter'] = 'AC_Adj0'  # Temporary filter
            variant['allele_num'] = int(info_field['AN_Adj'])
            if variant['allele_num'] > 0:
                variant['allele_freq'] = variant['allele_count'] / float(info_field['AN_Adj'])
            else:
                variant['allele_freq'] = None
            # Per-population counts; anything not in POPS is bucketed as Other.
            variant['pop_acs'] = dict(
                (POPS[x], int(info_field['AC_%s' % x].split(',')[i])) for x in POPS)
            variant['pop_ans'] = dict(
                (POPS[x], int(info_field['AN_%s' % x])) for x in POPS)
            variant['pop_homs'] = dict(
                (POPS[x], int(info_field['Hom_%s' % x].split(',')[i])) for x in POPS)
            variant['pop_acs']['Other'] = (
                int(info_field['AC_Adj'].split(',')[i]) - sum(variant['pop_acs'].values()))
            variant['pop_ans']['Other'] = (
                int(info_field['AN_Adj']) - sum(variant['pop_ans'].values()))
            variant['pop_homs']['Other'] = (
                int(info_field['AC_Hom']) - sum(variant['pop_homs'].values()))
            variant['genes'] = list({a['Gene'] for a in vep_annotations})
            variant['transcripts'] = list({a['Feature'] for a in vep_annotations})
            # Depth / quality histograms: entry 0 is all samples, entry i+1
            # is this alt allele.
            if 'DP_MID' in info_field:
                mids_all = info_field['DP_MID'].split(',')[0]
                hists_all = info_field['DP_HIST'].split(',')[0]
                mids = info_field['DP_MID'].split(',')[i + 1]
                hists = info_field['DP_HIST'].split(',')[i + 1]
                variant['genotype_depths'] = [
                    zip(map(float, mids_all.split('|')), map(int, hists_all.split('|'))),
                    zip(map(float, mids.split('|')), map(int, hists.split('|'))),
                ]
            if 'GQ_MID' in info_field:
                mids_all = info_field['GQ_MID'].split(',')[0]
                hists_all = info_field['GQ_HIST'].split(',')[0]
                mids = info_field['GQ_MID'].split(',')[i + 1]
                hists = info_field['GQ_HIST'].split(',')[i + 1]
                variant['genotype_qualities'] = [
                    zip(map(float, mids_all.split('|')), map(int, hists_all.split('|'))),
                    zip(map(float, mids.split('|')), map(int, hists.split('|'))),
                ]
            yield variant
def get_variants_from_sites_vcf(sites_vcf):
    """
    Parse an exac sites VCF and yield one variant dict per alt allele.

    sites_vcf is an open (gzipped) file object, not a file path. Header
    lines supply the VEP field layout (CSQ) and the histogram midpoints
    (DP_HIST / GQ_HIST). On the first line that fails to parse, the error
    is printed and iteration stops.
    """
    vep_field_names = None
    for line in sites_vcf:
        try:
            line = line.strip('\n')
            if line.startswith('##INFO=<ID=CSQ'):
                vep_field_names = line.split('Format: ')[-1].strip('">').split('|')
            if line.startswith('##INFO=<ID=DP_HIST'):
                dp_mids = map(float, line.split('Mids: ')[-1].strip('">').split('|'))
            if line.startswith('##INFO=<ID=GQ_HIST'):
                gq_mids = map(float, line.split('Mids: ')[-1].strip('">').split('|'))
            if line.startswith('#'):
                continue

            # Data line — the CSQ header must have been seen by now.
            if vep_field_names is None:
                raise Exception("VEP_field_names is None. Make sure VCF header is present.")

            # This elegant parsing code below is copied from
            # https://github.com/konradjk/loftee
            fields = line.split('\t')
            info_field = dict(
                (x.split('=', 1)) if '=' in x else (x, x)
                for x in re.split(';(?=\w)', fields[7]))
            consequence_array = info_field['CSQ'].split(',') if 'CSQ' in info_field else []
            annotations = [
                dict(zip(vep_field_names, x.split('|')))
                for x in consequence_array
                if len(vep_field_names) == len(x.split('|'))
            ]
            coding_annotations = [a for a in annotations if a['Feature'].startswith('ENST')]

            alt_alleles = fields[4].split(',')
            # Emit a separate variant for each alt allele.
            for i, alt_allele in enumerate(alt_alleles):
                vep_annotations = [
                    a for a in coding_annotations if int(a['ALLELE_NUM']) == i + 1
                ]
                pos, ref, alt = get_minimal_representation(fields[1], fields[3], alt_allele)
                chrom = fields[0]
                xpos = xbrowse.get_xpos(chrom, pos)
                variant = {
                    'chrom': chrom,
                    'pos': pos,
                    'rsid': fields[2],
                    'xpos': xpos,
                    'ref': ref,
                    'alt': alt,
                    'xstart': xpos,
                    'xstop': xpos + len(alt) - len(ref),
                    'variant_id': '{}-{}-{}-{}'.format(chrom, pos, ref, alt),
                    'orig_alt_alleles': [
                        '{}-{}-{}-{}'.format(
                            chrom, *get_minimal_representation(fields[1], fields[3], x))
                        for x in alt_alleles
                    ],
                    'site_quality': float(fields[5]),
                    'filter': fields[6],
                    'vep_annotations': vep_annotations,
                }
                variant['allele_count'] = int(info_field['AC_Adj'].split(',')[i])
                if not variant['allele_count'] and variant['filter'] == 'PASS':
                    variant['filter'] = 'AC_Adj0'  # Temporary filter
                variant['allele_num'] = int(info_field['AN_Adj'])
                if variant['allele_num'] > 0:
                    variant['allele_freq'] = variant['allele_count'] / float(info_field['AN_Adj'])
                else:
                    variant['allele_freq'] = None
                # Per-population allele counts / numbers / homozygote counts.
                variant['pop_acs'] = dict(
                    (POPS[x], int(info_field['AC_%s' % x].split(',')[i])) for x in POPS)
                variant['pop_ans'] = dict(
                    (POPS[x], int(info_field['AN_%s' % x])) for x in POPS)
                variant['pop_homs'] = dict(
                    (POPS[x], int(info_field['Hom_%s' % x].split(',')[i])) for x in POPS)
                variant['hom_count'] = sum(variant['pop_homs'].values())
                # Hemizygote counts only exist on the sex chromosomes.
                if variant['chrom'] in ('X', 'Y'):
                    variant['pop_hemis'] = dict(
                        (POPS[x], int(info_field['Hemi_%s' % x].split(',')[i])) for x in POPS)
                    variant['hemi_count'] = sum(variant['pop_hemis'].values())
                variant['quality_metrics'] = dict(
                    (x, info_field[x]) for x in METRICS if x in info_field)
                variant['genes'] = list({a['Gene'] for a in vep_annotations})
                variant['transcripts'] = list({a['Feature'] for a in vep_annotations})
                # Histograms: entry 0 covers all samples, entry i+1 this allele.
                if 'DP_HIST' in info_field:
                    dp_hists = [
                        info_field['DP_HIST'].split(',')[0],
                        info_field['DP_HIST'].split(',')[i + 1],
                    ]
                    variant['genotype_depths'] = [
                        zip(dp_mids, map(int, x.split('|'))) for x in dp_hists
                    ]
                if 'GQ_HIST' in info_field:
                    gq_hists = [
                        info_field['GQ_HIST'].split(',')[0],
                        info_field['GQ_HIST'].split(',')[i + 1],
                    ]
                    variant['genotype_qualities'] = [
                        zip(gq_mids, map(int, x.split('|'))) for x in gq_hists
                    ]
                yield variant
        except Exception:
            print("Error parsing vcf line: " + line)
            traceback.print_exc()
            break
parser = argparse.ArgumentParser(description=description)
parser.add_argument('frq')
args = parser.parse_args()

filename = args.frq
if not os.path.exists(filename):
    raise Exception('File does not exist')
if '.' not in filename:
    raise Exception('Filename must have an extension.')
out_filename = filename + '.xbrowse.freqs'

# Bug fix: both files were previously opened without being reliably closed
# (the input was never closed; the output leaked if a line failed to parse).
# Context managers guarantee cleanup.
with open(filename) as infile, open(out_filename, 'w') as outfile:
    for line in infile:
        if line.startswith('CHROM'):
            continue
        fields = line.strip('\n').split('\t')
        xpos = get_xpos(fields[0], int(fields[1]))
        # Columns 5+ are ALLELE:FREQ pairs; the most frequent allele is
        # treated as the reference, and every other allele is written out
        # with its frequency.
        allele_af = {}
        for field in fields[4:]:
            allele, af = field.split(':')
            allele_af[allele] = float(af)
        ref_allele = max(allele_af, key=allele_af.get)
        for allele, af in allele_af.items():
            if allele != ref_allele:
                outfile.write('\t'.join([
                    str(xpos), ref_allele, allele, str(af)
                ]) + '\n')