def variant_discard_by_position(vcf_path, pos_path):
    """Copy a VCF to stdout, leaving out variants at blacklisted positions.

    `pos_path` is read as a tab-separated file whose first two columns are
    chromosome and position; rows with fewer than two columns are ignored.
    A leading 'chr' is stripped from chromosome names on both sides before
    they are compared. Leading '##' lines of the VCF are not written; the
    first other line is treated as the column header and echoed unchanged.
    """
    def strip_chr(name):
        return name[3:] if name.startswith('chr') else name

    info('Reading list of blacklisted positions...')
    blacklist = set()
    for entry in zopen(pos_path):
        fields = entry.rstrip().split('\t')
        if len(fields) >= 2:
            blacklist.add(strip_chr(fields[0]) + ':' + fields[1])

    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('##'):
            break
    headers = line.rstrip().split('\t')
    # The result is not used, but the lookup still raises ValueError when the
    # header has neither an 'ESP6500' nor an 'ALT' column.
    headers.index('ESP6500' if 'ESP6500' in headers else 'ALT')
    sys.stdout.write(line)

    for line in vcf_file:
        fields = line.rstrip().split('\t')
        if strip_chr(fields[0]) + ':' + fields[1] not in blacklist:
            sys.stdout.write(line)
def variant_allele_fractions(vcf_path, pos_path):
    """Print alternate-allele fractions for variants at listed positions.

    `pos_path` is a tab-separated file whose first two columns are chromosome
    and position. Variants in `vcf_path` whose chromosome:position is listed
    are written to stdout with every sample column replaced by alt/total
    ('%.2f'), or 'NaN' when the total is zero. The two numbers are taken from
    the second and third ':'-separated fields of each sample column.

    Chromosome names are given a 'chr' prefix on BOTH sides before they are
    compared, so a VCF without the prefix still matches the position list.
    """
    def position_key(fields):
        key = ':'.join(fields[0:2])
        return key if key.startswith('chr') else 'chr' + key

    snps = set()
    for line in zopen(pos_path):
        # rstrip('\n') rather than [:-1]: an unterminated last line must not
        # lose its final character.
        snps.add(position_key(line.rstrip('\n').split('\t')))

    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'):
            break
    headers = line.rstrip('\n').split('\t')
    # Sample columns start right after 'ESP6500' if present, else after 'ALT'.
    sample_col = headers.index('ESP6500' if 'ESP6500' in headers else 'ALT') + 1
    sys.stdout.write(line)

    for line in vcf_file:
        cols = line.rstrip('\n').split('\t')
        if position_key(cols) not in snps:
            continue
        out = cols[:sample_col]
        for gt in cols[sample_col:]:
            fields = gt.split(':')
            alt, total = float(fields[1]), int(fields[2])
            out.append('NaN' if total == 0 else '%.2f' % (alt / total))
        sys.stdout.write('\t'.join(out))
        sys.stdout.write('\n')
def variant_heterozygous_bases(vcf_path, kgenomes_path):
    """Print known-SNP variants that are heterozygous with enough reads.

    Positions are taken from the second column of `kgenomes_path`; the
    chromosome column of that file is not consulted. A variant is printed
    (first four columns only) when at least one sample has genotype index 2
    in `gt_symbols` and a third ':'-separated field of 15 or more.
    """
    # One flag per coordinate; positions of 300,000,000 or above do not fit.
    known_snp = bytearray(int(300e6))
    for record in zopen(kgenomes_path):
        known_snp[int(record[:-1].split('\t')[1])] = True

    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'):
            break
    headers = line[:-1].split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in headers else 'ALT') + 1
    print('\t'.join(headers[0:4]))

    for line in vcf_file:
        fields = line[:-1].split('\t')
        if not known_snp[int(fields[1])]:
            continue
        calls = fields[sample_col:]
        genotypes = [gt_symbols.index(call[:call.find(':')]) for call in calls]
        depths = [float(call.split(':')[2]) for call in calls]
        for genotype, depth in zip(genotypes, depths):
            if genotype == 2 and depth >= 15:
                print('\t'.join(fields[0:4]))
                break
def swiss_annotate(input_path, bed_path):
    """Append a column of nearby BED features to each line of a table.

    The first column of `input_path` is expected to look like 'chrN:POS'.
    Lines that do not match are echoed unchanged. For matching lines, every
    BED feature on the same chromosome whose `distance(pos, (start, end))`
    is below 50000 is listed as 'NAME (DIST)', closest first and
    comma-separated, in a new last column. A ' (ENSG...)' suffix is removed
    from feature names.
    """
    ensg_suffix_re = re.compile(r' \(ENSG.*?\)')
    locus_re = re.compile(r'(chr.+):(\d+)')

    # Index features by chromosome and clean their names once, instead of
    # scanning and re-substituting the whole feature list for every line.
    features_by_chrom = {}
    for line in zopen(bed_path):
        c = line.rstrip().split('\t')
        name = ensg_suffix_re.sub('', c[3])
        features_by_chrom.setdefault(c[0], []).append(
            ((int(c[1]), int(c[2])), name))

    for line in zopen(input_path):
        m = locus_re.match(line.rstrip().split('\t')[0])
        if m is None:
            sys.stdout.write(line)
            continue
        chrom, pos = m.group(1), int(m.group(2))

        nearby = []
        for interval, name in features_by_chrom.get(chrom, ()):
            dist = distance(pos, interval)
            if dist < 50000:
                nearby.append((name, dist))
        nearby.sort(key=lambda x: x[1])

        # rstrip('\n') rather than [:-1] so an unterminated last line keeps
        # its final character.
        sys.stdout.write(line.rstrip('\n'))
        sys.stdout.write('\t')
        print(','.join(['%s (%d)' % f for f in nearby]))
def sam_reads_raw(bam_path, out_prefix):
    """Extract raw read sequences from a BAM file into three gzipped files.

    Runs "samtools bam2fq" and writes one sequence per line:
    - '<out_prefix>_1.reads.gz' / '<out_prefix>_2.reads.gz' receive paired
      mates (names ending in '/1' and '/2'), in matching order;
    - '<out_prefix>.reads.gz' receives reads without a mate suffix and, at
      the end, every mate whose partner was never seen.
    """
    out_1 = zopen('%s_1.reads.gz' % out_prefix, 'w')
    out_2 = zopen('%s_2.reads.gz' % out_prefix, 'w')
    out = zopen('%s.reads.gz' % out_prefix, 'w')
    try:
        reads_1 = {}
        reads_2 = {}

        # The "samtools bam2fq" command does not output supplementary or
        # secondary alignments. Each read only has one primary alignment.
        options = '-n' if has_mate_suffixes(bam_path) else ''
        bam2fq = shell_stdout('samtools bam2fq %s %s' % (options, bam_path))
        for line in bam2fq:
            if line[0] != '@':
                error('Invalid bam2fq output.')
            line = line[:-1]
            if line.endswith('/1'):
                segname = line[1:-2]
                mate = reads_2.pop(segname, None)
                # "is not None": a stored zero-length mate sequence is still
                # a mate and must not be dropped.
                if mate is not None:
                    out_1.write(next(bam2fq))
                    out_2.write('%s\n' % mate)
                else:
                    reads_1[segname] = next(bam2fq)[:-1]
            elif line.endswith('/2'):
                segname = line[1:-2]
                mate = reads_1.pop(segname, None)
                if mate is not None:
                    out_1.write('%s\n' % mate)
                    out_2.write(next(bam2fq))
                else:
                    reads_2[segname] = next(bam2fq)[:-1]
            else:
                out.write('%s\n' % next(bam2fq)[:-1])

            # Skip per-base qualities. They can start with '@'.
            next(bam2fq)
            next(bam2fq)

        info('Found %d orphan first mates.' % len(reads_1))
        for read_id in list(reads_1)[:5]:
            info('- Example: %s' % read_id)

        info('Found %d orphan second mates.' % len(reads_2))
        for read_id in list(reads_2)[:5]:
            info('- Example: %s' % read_id)

        for read in reads_1.values():
            out.write('%s\n' % read)
        for read in reads_2.values():
            out.write('%s\n' % read)
    finally:
        # Close the outputs even when parsing fails, so the gzip streams are
        # finalised and the handles are not leaked.
        out_1.close()
        out_2.close()
        out.close()
def swiss_split_wig(wig_path, out_prefix, no_header=False):
    """Split a WIG file into one gzipped file per declaration line.

    Each 'fixedStep' / 'variableStep' line starts a new output file named
    '<out_prefix>_<chrom>.wig.gz', where <chrom> comes from its 'chrom='
    field. The declaration line itself is copied unless `no_header` is true.
    Lines before the first declaration are discarded.

    NOTE(review): a chromosome that appears in more than one declaration
    line reopens the same path in 'w' mode - confirm whether zopen truncates.
    """
    chrom_re = re.compile(r'chrom=(\w+)')
    out = None
    try:
        for line in zopen(wig_path):
            if line.startswith(('fixedS', 'variableS')):
                if out is not None:
                    out.close()
                chrom = chrom_re.search(line).group(1)
                out = zopen('%s_%s.wig.gz' % (out_prefix, chrom), 'w')
                if not no_header:
                    out.write(line)
                continue
            if out is not None:
                out.write(line)
    finally:
        # The last output file used to be left open, which risks an
        # unflushed / unterminated gzip stream.
        if out is not None:
            out.close()
def sam_reads(bam_path, out_prefix):
    """Extract reads from a BAM file into paired and unpaired FASTQ files.

    Alignments with flag bit 0x40 are first mates and those with 0x80 are
    second mates. Mates are paired by read name (an existing '/1' or '/2'
    suffix is removed first) and written in matching order to
    '<out_prefix>_1.fq.gz' and '<out_prefix>_2.fq.gz'. Alignments with
    neither flag, and mates whose partner never shows up, go to
    '<out_prefix>.fq.gz'.
    """
    record = '@%s\n%s\n+\n%s\n'
    fastq_1 = zopen('%s_1.fq.gz' % out_prefix, 'w')
    fastq_2 = zopen('%s_2.fq.gz' % out_prefix, 'w')
    fastq = zopen('%s.fq.gz' % out_prefix, 'w')
    try:
        reads_1 = {}
        reads_2 = {}

        # FIXME: We assume that each read only has one alignment in the BAM
        # file.
        for al in read_sam(bam_path):
            flags = int(al[1])
            if flags & 0x40:
                rname = al[0][:-2] if al[0].endswith('/1') else al[0]
                mate = reads_2.pop(rname, None)
                if mate is not None:
                    fastq_1.write(record % (rname + '/1', al[9], al[10]))
                    fastq_2.write(record % (rname + '/2', mate[0], mate[1]))
                else:
                    reads_1[rname] = (al[9], al[10])
            elif flags & 0x80:
                rname = al[0][:-2] if al[0].endswith('/2') else al[0]
                mate = reads_1.pop(rname, None)
                if mate is not None:
                    fastq_1.write(record % (rname + '/1', mate[0], mate[1]))
                    fastq_2.write(record % (rname + '/2', al[9], al[10]))
                else:
                    reads_2[rname] = (al[9], al[10])
            else:
                fastq.write(record % (al[0], al[9], al[10]))

        info('Found %d orphan first mates.' % len(reads_1))
        for read_id in list(reads_1)[:5]:
            info('- Example: %s' % read_id)

        info('Found %d orphan second mates.' % len(reads_2))
        for read_id in list(reads_2)[:5]:
            info('- Example: %s' % read_id)

        # Orphans are written without a mate suffix.
        for rname, read in reads_1.items():
            fastq.write(record % (rname, read[0], read[1]))
        for rname, read in reads_2.items():
            fastq.write(record % (rname, read[0], read[1]))
    finally:
        # Close the outputs even when reading fails, so the gzip streams are
        # finalised and the handles are not leaked.
        fastq_1.close()
        fastq_2.close()
        fastq.close()
def variant_annotate(vcf_path, genome='~/tools/annovar-2016-02-01/humandb/hg38'):
    """Annotate variants with ANNOVAR and write 'annotated.vcf.gz'.

    `genome` is split into the ANNOVAR database directory and the build
    name (its last path component). The input is converted with
    `format_annovar`, annotated with `table_annovar.pl`, and the resulting
    table is rewritten with a fixed set of leading columns followed by the
    pass-through columns from index 20 onwards. Temporary files are removed
    afterwards; the two '*.invalid_input' reports are removed only when
    they have at most one line.
    """
    format_annovar(vcf_path, 'anno_tmp.vcf')
    humandb_dir, genome_version = os.path.split(genome)
    shell('table_annovar.pl anno_tmp.vcf %s -buildver %s --remove --otherinfo '
          '--outfile annotated -operation g,f,f,f '
          '-protocol refGene,cosmic70,1000g2014oct_all,exac03' %
          (humandb_dir, genome_version))

    multianno_path = 'annotated.%s_multianno.txt' % genome_version
    anno = open(multianno_path)
    try:
        out = zopen('annotated.vcf.gz', 'w')
        try:
            # The first line of the table is skipped; the pass-through
            # column names are taken from the second line.
            next(anno)
            line = next(anno)
            headers = [
                'CHROM', 'POSITION', 'REF', 'ALT', 'FUNCTION', 'GENE',
                'EXONIC_FUNCTION', 'AA_CHANGE', 'COSMIC', '1000G', 'EXAC'
            ]
            headers += line.rstrip('\n').split('\t')[20:]
            out.write('\t'.join(headers) + '\n')
            for line in anno:
                c = line.rstrip('\n').split('\t')
                out.write('\t'.join(c[0:2] + c[3:7] + c[8:13] + c[20:]))
                out.write('\n')
        finally:
            out.close()
    finally:
        # The table handle used to be left open while the file was deleted.
        anno.close()

    os.remove('anno_tmp.vcf')
    os.remove(multianno_path)
    if num_lines('annotated.invalid_input') <= 1:
        os.remove('annotated.invalid_input')
    if num_lines('annotated.refGene.invalid_input') <= 1:
        os.remove('annotated.refGene.invalid_input')
def ensembl_transcript_bed(gtf_path):
    """Print one BED line per transcript found in a GTF file.

    Only 'exon' records whose chromosome is in `human_chr` and whose source
    column is in `accepted_gene_types` are used. Each transcript spans from
    its smallest exon start (made zero-based) to its largest exon end, and
    is named 'gene_id:gene_name:transcript_id'. The fifth BED column is
    left empty.
    """
    tx_id_to_gene = {}
    tx_exons = {}
    for record in zopen(gtf_path):
        if record.startswith('#'):
            continue
        fields = record.rstrip('\n').split('\t')
        if (fields[0] not in human_chr or
                fields[1] not in accepted_gene_types or
                fields[2] != 'exon'):
            continue

        chrom = fields[0]
        start = int(fields[3])
        end = int(fields[4])
        strand = fields[6]
        if not chrom.startswith('chr'):
            chrom = 'chr' + chrom

        tx_id = re.search(r'transcript_id "(.+?)"', record).group(1)
        gene_id = re.search(r'gene_id "(.+?)"', record).group(1)
        gene_name = re.search(r'gene_name "(.+?)"', record).group(1)
        tx_exons.setdefault(tx_id, []).append((chrom, strand, start, end))
        tx_id_to_gene[tx_id] = (gene_id, gene_name)

    for tx_id, exons in tx_exons.iteritems():
        gene_id, gene_name = tx_id_to_gene[tx_id]
        # Chromosome and strand are taken from the transcript's first exon.
        chrom, strand = exons[0][0], exons[0][1]
        tx_start = min(ex[2] for ex in exons)
        tx_end = max(ex[3] for ex in exons)
        print('%s\t%d\t%d\t%s:%s:%s\t\t%s' % (
            chrom, tx_start - 1, tx_end, gene_id, gene_name, tx_id, strand))
def partition(samples_path, num_partitions):
    """Split a sample list into batches without splitting a patient.

    Sample names are read one per line from `samples_path`, ordered by
    their TCGA patient ID (names without one sort last, under 'zzz' + the
    name) and cut into `num_partitions` consecutive batches of roughly
    equal size. A batch is extended past its nominal end while the next
    sample belongs to the same patient. Batches are written to
    'batch_1.txt', 'batch_2.txt', ...; the nominal end indices are printed
    to stdout first.
    """
    samples = [line.strip() for line in zopen(samples_path)]
    # Integer arithmetic: the last boundary is exactly len(samples), with no
    # dependence on floating-point rounding.
    partition_ends = [(p + 1) * len(samples) // num_partitions
                      for p in range(num_partitions)]
    print(partition_ends)

    patient_ids = []
    num_without_pid = 0
    for s in samples:
        m = re.search('TCGA-..-....', s)
        if not m:
            num_without_pid += 1
        patient_ids.append(m.group(0) if m else 'zzz' + s)

    if num_without_pid:
        info('WARNING: %d sample names did not contain a TCGA patient ID.' %
             num_without_pid)

    # Stable sort by patient ID; also works for an empty sample list.
    order = sorted(range(len(samples)), key=lambda i: patient_ids[i])
    samples = [samples[i] for i in order]
    patient_ids = [patient_ids[i] for i in order]

    partitions = []
    first = 0
    for nominal_end in partition_ends:
        # Never start before what earlier batches already took; otherwise a
        # batch that was extended past this boundary would have its tail
        # samples assigned a second time.
        end = max(nominal_end, first)
        while 0 < end < len(samples) and \
                patient_ids[end] == patient_ids[end - 1]:
            end += 1
        partitions.append(samples[first:end])
        first = end

    for idx, part in enumerate(partitions):
        with open('batch_%d.txt' % (idx + 1), 'w') as out:
            for s in part:
                out.write('%s\n' % s)
def read_fixed_wig(wig_path):
    """Read the fixedStep tracks of a WIG file.

    Returns a dict mapping the 'chrom=' name of each 'fixedStep' line to an
    object with attributes `start`, `step`, `span` (-1 when the declaration
    has no 'span=') and `values` (a NumPy float array of the data lines that
    follow). Lines before the first 'fixedStep' line are ignored. If a
    chromosome is declared more than once, the last declaration wins.
    """
    tracks = {}
    track = None
    values = None
    count = 0
    for line in zopen(wig_path):
        if line.startswith('fixedStep'):
            if track is not None:
                track.values = values[:count].copy()
            track = Object()
            values = np.zeros(1000000)
            count = 0
            chrom = re.search(r'chrom=(\w+)', line).group(1)
            track.start = int(re.search(r'start=(\d+)', line).group(1))
            track.step = int(re.search(r'step=(\d+)', line).group(1))
            m = re.search(r'span=(\d+)', line)
            track.span = int(m.group(1)) if m else -1
            tracks[chrom] = track
            continue

        if track is None:
            continue
        if count == len(values):
            # Double the buffer instead of overflowing the initial
            # 1,000,000-value preallocation.
            values = np.concatenate((values, np.zeros(len(values))))
        values[count] = float(line)
        count += 1

    if track is not None:
        # Copy so the unused tail of the buffer is not kept alive.
        track.values = values[:count].copy()
    return tracks
def coverage_cds(bam_path, gtf_path):
    """Print a histogram of per-base read coverage over coding regions.

    Coding positions are the 'CDS' records of `gtf_path` on chromosomes
    that are present in the BAM header and whose name is at most five
    characters long. Coverage comes from "bedtools genomecov -d -split".
    The histogram has 200 bins; the last bin collects every depth of 199 or
    more.
    """
    chr_sizes = ref_sequence_sizes(bam_path)

    info('Constructing a map of coding regions...')
    # One boolean per base. A NumPy array accepts the scalar slice
    # assignment below, which a plain list rejects with TypeError.
    coding = {}
    for chrom, size in chr_sizes.items():
        coding[chrom] = np.zeros(size, np.bool_)
    for line in zopen(gtf_path):
        if line.startswith('#'):
            continue
        cols = line.split('\t')
        if cols[2] != 'CDS':
            continue
        if len(cols[0]) > 5:
            continue  # Ignore chromosomes other than chrXX
        if cols[0] not in coding:
            continue
        # GTF coordinates are 1-based and inclusive.
        coding[cols[0]][int(cols[3]) - 1:int(cols[4])] = True

    info('Calculating a coverage histogram...')
    coverage_hist = [0] * 200
    last_bin = len(coverage_hist) - 1
    chrom = ''
    cds = None
    for line in shell_stdout('bedtools genomecov -d -split -ibam %s' % bam_path):
        cols = line.split('\t')
        if cols[0] != chrom:
            chrom = cols[0]
            cds = coding[chrom]
            info('%s...' % chrom)
        # Use the reported 1-based position directly instead of assuming
        # that the rows are contiguous.
        if cds[int(cols[1]) - 1]:
            coverage_hist[min(int(cols[2]), last_bin)] += 1

    print('Coverage histogram:')
    print('===================')
    for cov, count in enumerate(coverage_hist):
        print('%d: %d' % (cov, count))
def variant_filter(vcf_path, nonsynonymous, no_1000g):
    """Filter an annotated VCF by exonic function and 1000 Genomes status.

    Leading '#' lines are not written. The first other line is taken as the
    column header and echoed. After that:
    - with `nonsynonymous`, only variants whose EXONIC_FUNCTION starts with
      nonsynonymous / frameshift / stopgain / stoploss / nonframeshift are
      kept;
    - with `no_1000g`, only variants with an empty 1000G column are kept.

    A column is only required when the filter that uses it is enabled.
    """
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'):
            break
    sys.stdout.write(line)
    headers = line.rstrip('\n').split('\t')

    # Look up each column only when its filter is active, so a file without
    # e.g. a '1000G' column can still be filtered by exonic function.
    col_exonic_func = None
    if nonsynonymous:
        if 'EXONIC_FUNCTION' not in headers:
            error('Cannot find exonic function column.')
        col_exonic_func = headers.index('EXONIC_FUNCTION')
    col_1000g = None
    if no_1000g:
        if '1000G' not in headers:
            error('Cannot find 1000 Genomes column.')
        col_1000g = headers.index('1000G')

    protein_altering = ('nonsynonymous', 'frameshift', 'stopgain',
                        'stoploss', 'nonframeshift')
    for line in vcf_file:
        cols = line.rstrip('\n').split('\t')
        if nonsynonymous and \
                not cols[col_exonic_func].startswith(protein_altering):
            continue
        if no_1000g and cols[col_1000g]:
            continue
        sys.stdout.write(line)
def discard_if_in_controls(vcf_path, control_samples, threshold):
    """Drop variants seen in at least `threshold` control samples.

    `control_samples` is a collection of regular expressions; a sample
    column is a control when any of them matches its header name. A control
    counts towards the threshold when its genotype index in `gt_symbols` is
    greater than 1. Leading '##' lines are not written; the header line and
    the surviving variants are echoed to stdout.
    """
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('##'):
            break
    headers = line.rstrip().split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in headers else 'ALT') + 1
    sample_names = headers[sample_col:]

    control = []
    for name in sample_names:
        control.append(any(re.search(rx, name) for rx in control_samples))
    if True not in control:
        error('No control samples found.')
    info('Using these %d control samples:' % control.count(True))
    for name, is_control in zip(sample_names, control):
        if is_control:
            info('- %s' % name)

    sys.stdout.write(line)
    for line in vcf_file:
        calls = line.rstrip('\n').split('\t')[sample_col:]
        genotypes = [gt_symbols.index(call[:call.find(':')]) for call in calls]
        mutated_controls = 0
        for is_control, genotype in zip(control, genotypes):
            if is_control and genotype > 1:
                mutated_controls += 1
        if mutated_controls >= threshold:
            continue
        sys.stdout.write(line)
def swiss_igv(tsv_path, data_col, one_based=True):
    """Convert a tab-separated table into IGV-style rows on stdout.

    Among the header columns before index `data_col`, exactly one must start
    with 'chrom' and exactly one with 'pos' (case-insensitive). Every row
    becomes CHROMOSOME, START, END, '-' followed by the columns from
    `data_col` onwards, where END is START + 1. With `one_based` the
    position is decremented by one first.
    """
    tsv_file = zopen(tsv_path)
    headers = next(tsv_file)[:-1].split('\t')

    def locate(prefix, message):
        hits = [idx for idx, name in enumerate(headers[:data_col])
                if re.match(prefix, name, re.I)]
        if len(hits) != 1:
            error(message)
        return hits[0]

    chrom_col = locate('chrom', 'Cannot find chromosome column.')
    pos_col = locate('pos', 'Cannot find position column.')

    print('CHROMOSOME\tSTART\tEND\tFEATURE\t' + '\t'.join(headers[data_col:]))
    offset = 1 if one_based else 0
    for row in tsv_file:
        tokens = row[:-1].split('\t')
        chrom = tokens[chrom_col]
        start = int(tokens[pos_col]) - offset
        sys.stdout.write('%s\t%d\t%d\t-\t' % (chrom, start, start + 1))
        print('\t'.join(tokens[data_col:]))
def summarize(exon_expr_path):
    """Merge per-exon expression rows that share a name and print totals.

    The input has a header line and rows of chromosome, start, end, name
    and one expression value per sample (columns 5 onwards). Rows with the
    same name are merged: expression values and lengths are summed and the
    span is widened to cover every row. Rows with fewer columns than there
    are samples are skipped. Starts are shifted by one on input.
    """
    expr_file = zopen(exon_expr_path)
    header = next(expr_file)
    samples = header.rstrip('\n').split('\t')[4:]
    num_samples = len(samples)

    features = {}
    for row in expr_file:
        fields = row.rstrip('\n').split('\t')
        if len(fields) < num_samples:
            continue
        chrom = fields[0]
        start = int(fields[1]) + 1
        end = int(fields[2])
        expr = [float(value) for value in fields[4:]]

        merged = features.setdefault(fields[3], MergedFeature(num_samples))
        merged.expr = [old + new for old, new in zip(merged.expr, expr)]
        merged.total_len += end - start + 1
        merged.chromosome = chrom
        # -1 marks a coordinate that has not been set yet.
        if merged.start == -1 or start < merged.start:
            merged.start = start
        if merged.end == -1 or end > merged.end:
            merged.end = end

    print('CHROM\tSTART\tEND\tNAME\tLENGTH\t' + '\t'.join(samples))
    for name, merged in features.iteritems():
        expr_text = '\t'.join(str(e) for e in merged.expr)
        print('%s\t%d\t%d\t%s\t%d\t%s' % (
            merged.chromosome, merged.start, merged.end, name,
            merged.total_len, expr_text))
def fasta_remove_adapters(fasta_path, adapter):
    """Trim an adapter, or any prefix of it at least 5 bases long, off reads.

    Works on single-line FASTA ('>') and four-line FASTQ ('@') records.
    Everything from the first adapter match onwards is removed from the
    sequence, and in FASTQ the quality line is cut to the same length.
    Lines starting with '#' are copied through. The result goes to stdout.
    """
    # Convert the adapter into a regular expression: the first five bases
    # are mandatory, each further base is optional but only when all bases
    # before it matched, e.g. 'ACGTACG' -> 'ACGTA(?:C(?:G)?)?'.
    if len(adapter) < 5:
        error('Adapter sequence is too short.')
    tail = adapter[5:]
    pattern = (adapter[:5] + ''.join('(?:' + base for base in tail) +
               ')?' * len(tail))
    adapter_re = re.compile(pattern)
    # Log the pattern text, not the repr of the compiled pattern object.
    info('Adapter regular expression: %s' % pattern)

    fasta = zopen(fasta_path)
    for line in fasta:
        if line[0] == '#':
            sys.stdout.write(line)
        elif line[0] == '>':
            sys.stdout.write(line)
            seq = next(fasta)[:-1]
            m = adapter_re.search(seq)
            if m:
                seq = seq[:m.start()]
            print(seq)
        elif line[0] == '@':
            sys.stdout.write(line)
            seq = next(fasta)[:-1]
            m = adapter_re.search(seq)
            trim_len = m.start() if m else len(seq)
            print(seq[:trim_len])
            sys.stdout.write(next(fasta))     # '+' separator line
            print(next(fasta)[:trim_len])     # qualities, trimmed to match
def variant_conservation(vcf_path):
    """Print a variableStep WIG track of per-variant genotype agreement.

    For every variant where no sample has genotype index 0 in `gt_symbols`,
    the size of the larger of the two groups "index >= 2" and "index < 2"
    is rescaled so that an even split gives 0 and full agreement gives 1.
    A 'variableStep chrom=...' line is printed whenever the chromosome
    changes. Leading '##' lines and the header line are not written.

    NOTE(review): index 0 presumably means "no call" and indices >= 2
    "carries the alternate allele" - confirm against gt_symbols.
    """
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('##'):
            break
    headers = line.rstrip().split('\t')
    sample_col = headers.index('ALT') + 1
    samples = headers[sample_col:]

    chrom = None
    half = math.ceil(len(samples) / 2.0)
    span = float(len(samples) - half)
    for line in vcf_file:
        cols = line.rstrip().split('\t')
        if cols[0] != chrom:
            chrom = cols[0]
            print('variableStep chrom=%s' % chrom)

        genotypes = [gt_symbols.index(gt[:gt.find(':')])
                     for gt in cols[sample_col:]]
        # Plain-list logic: comparing the list itself with an integer (as
        # the code used to do) does not work element-wise.
        if 0 in genotypes:
            continue
        num_alt = sum(1 for g in genotypes if g >= 2)
        majority = max(num_alt, len(genotypes) - num_alt)
        # With fewer than two samples there is nothing to disagree with.
        conserved = (majority - half) / span if span else 1.0   # -> [0,1]
        print('%s\t%.2f' % (cols[1], conserved))
def fasta_rna_to_dna(fasta_path):
    """Write a FASTA/FASTQ file to stdout with sequences converted to DNA.

    Lines starting with '#', '>', '@' or '+' are copied unchanged. Every
    other line is upper-cased and has 'U' replaced by 'T'.
    """
    passthrough = ('#', '>', '@', '+')
    fasta = zopen(fasta_path)
    for line in fasta:
        if line[0] not in passthrough:
            line = line.upper().replace('U', 'T')
        sys.stdout.write(line)
    fasta.close()
def fasta_c2t(fasta_path):
    """Write a FASTA/FASTQ file to stdout with C converted to T.

    Only the single line that follows a '>' or '@' line is converted: it is
    upper-cased and every 'C' becomes 'T'. All other lines are copied
    unchanged.
    """
    fasta = zopen(fasta_path)
    for header in fasta:
        sys.stdout.write(header)
        if header[0] not in ('>', '@'):
            continue
        sequence = next(fasta)   # Read sequence
        converted = sequence.upper().replace('C', 'T')
        sys.stdout.write(converted)
    fasta.close()
def coverage_grid(genome_path, winsize, step):
    """Print BED windows tiling every chromosome of a genome file.

    `genome_path` lists a chromosome name and its length per line, separated
    by a tab; blank lines are skipped. Windows are `winsize` long, start
    every `step` bases and are printed with zero-based starts. A window is
    only printed while its zero-based start plus `winsize` plus one is below
    the chromosome length.
    """
    for record in zopen(genome_path):
        if record.strip() == '':
            continue
        fields = record.rstrip('\n').split('\t')
        chrom = fields[0]
        chrom_len = int(fields[1])
        begin = 0   # zero-based window start
        while begin + 1 + winsize < chrom_len:
            print('%s\t%d\t%d' % (chrom, begin, begin + winsize))
            begin += step
def fasta_flatten(fasta_path, output_dir):
    """Write each FASTA sequence to its own newline-free '.seq' file.

    For every '>' header a file '<output_dir>/<header text>.seq' is created
    and the following sequence lines are written to it with surrounding
    whitespace removed, giving one continuous string. Sequence lines that
    come before the first header are ignored, and an input without any
    header produces no files.
    """
    fasta = zopen(fasta_path)
    flat_file = None
    try:
        for line in fasta:
            if line.startswith('>'):
                if flat_file is not None:
                    flat_file.close()
                flat_file = open(
                    output_dir + '/' + line[1:].strip() + '.seq', 'w')
            elif flat_file is not None:
                flat_file.write(line.strip())
    finally:
        # flat_file is still None when the input had no header line.
        if flat_file is not None:
            flat_file.close()
        fasta.close()
def fasta_trim(fasta_path, trim_len):
    """Write a FASTA/FASTQ file to stdout with reads cut to `trim_len`.

    A line starting with '@', '+' or '>' is copied, and the line after it
    (the sequence or the qualities) is shortened to `trim_len` characters
    when it is longer than that. Other lines are not written.
    """
    fasta = zopen(fasta_path)
    for marker in fasta:
        if marker[0] not in ('@', '+', '>'):
            continue
        sys.stdout.write(marker)
        payload = next(fasta)
        # len() includes the trailing newline, hence the "- 1".
        if len(payload) - 1 <= trim_len:
            sys.stdout.write(payload)
        else:
            print(payload[:trim_len])
def ensembl_to_hugo(expr_path, gtf_path):
    """Prefix feature names in an expression table with their gene names.

    A gene_id -> gene_name map is built from the attributes in `gtf_path`.
    In `expr_path` (tab-separated, with a 'NAME' header column) every name
    that has a non-empty translation is rewritten as 'gene_name:NAME'.
    Other rows are printed unchanged.
    """
    attr_re = re.compile('gene_id "(.*?)";.*; gene_name "(.*?)"')
    translations = {}
    for record in zopen(gtf_path):
        m = attr_re.search(record)
        if m:
            gene_id, gene_name = m.groups()
            translations[gene_id] = gene_name

    expr_file = zopen(expr_path)
    header = next(expr_file)
    feature_col = header[:-1].split('\t').index('NAME')
    sys.stdout.write(header)
    for row in expr_file:
        fields = row[:-1].split('\t')
        original = fields[feature_col]
        hugo = translations.get(original)
        if hugo:
            fields[feature_col] = hugo + ':' + original
        print('\t'.join(fields))
def ensembl_cleanup(gtf_path):
    """Print the GTF lines on standard chromosomes, prefixed with 'chr'.

    A line is kept when its first column is one of 1-22, X, Y or MT and the
    line does not contain 'nonsense_mediated'. Everything else, including
    comment lines, is dropped.
    """
    valid_chr = set(str(number) for number in range(1, 23))
    valid_chr.update(['X', 'Y', 'MT'])

    for record in zopen(gtf_path):
        chrom = record[:record.find('\t')]
        if chrom not in valid_chr:
            continue
        if 'nonsense_mediated' in record:
            continue
        sys.stdout.write('chr' + record)
def filter_distance(sv_path, min_distance):
    """Drop rearrangements whose breakpoints are too close together.

    Lines that do not start with 'chr' are copied through. A rearrangement
    line is dropped only when both breakpoints are on the same chromosome
    (columns 1 and 6) and their positions (columns 3 and 8) are less than
    `min_distance` apart.
    """
    for record in zopen(sv_path):
        if record.startswith('chr'):
            tokens = record[:-1].split('\t')
            if tokens[0] == tokens[5]:
                gap = abs(int(tokens[2]) - int(tokens[7]))
                if gap < min_distance:
                    continue
        sys.stdout.write(record)
def variant_heterozygous_concordance(vcf_path, kgenomes_path, test_rx, ref_rx):
    """Report how often a test sample agrees on heterozygous reference calls.

    Only variants whose position is listed in the second column of
    `kgenomes_path` are considered; the chromosome column of that file is
    not consulted. `test_rx` and `ref_rx` are regular expressions that must
    each match exactly one sample column name. Among variants where the
    reference sample has genotype index 2 in `gt_symbols`, the share where
    the test sample also has index 2 is printed.
    """
    # One flag per coordinate; positions of 300,000,000 or above do not fit.
    is_snp = np.zeros(300 * 1000 * 1000, np.bool_)
    for line in zopen(kgenomes_path):
        pos = int(line[:-1].split('\t')[1])
        is_snp[pos] = True

    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'):
            break
    headers = line[:-1].split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in headers else 'ALT') + 1

    # enumerate() supplies the column index; iterating over the bare header
    # strings cannot be unpacked into (index, name) pairs.
    test_col = [i for i, h in enumerate(headers)
                if i >= sample_col and re.search(test_rx, h)]
    ref_col = [i for i, h in enumerate(headers)
               if i >= sample_col and re.search(ref_rx, h)]
    if len(test_col) != 1:
        error('Test sample not found.')
    if len(ref_col) != 1:
        error('Reference sample not found.')
    # Reduce the one-element lists to plain column indices.
    test_col, ref_col = test_col[0], ref_col[0]

    total_hetz_in_ref = 0
    total_concordant = 0
    for line in vcf_file:
        cols = line[:-1].split('\t')
        if not is_snp[int(cols[1])]:
            continue
        test = cols[test_col]
        test_gt = gt_symbols.index(test[:test.find(':')])
        ref = cols[ref_col]
        ref_gt = gt_symbols.index(ref[:ref.find(':')])
        if ref_gt == 2:
            total_hetz_in_ref += 1
            total_concordant += (test_gt == 2)

    # Without any heterozygous reference site the ratio is undefined.
    if total_hetz_in_ref:
        concordance = float(total_concordant) / total_hetz_in_ref * 100
    else:
        concordance = float('nan')
    print('Concordance was %.1f%% (%d / %d).' % (
        concordance, total_concordant, total_hetz_in_ref))
def fasta_split(fasta_path, tag_length, anchors_5p_path, anchors_3p_path):
    """Write the two end tags of every read to separate FASTA files.

    Header lines ('>', '@', '#') are skipped, and so is a '+' line together
    with the line after it. Every remaining line is counted as a read. Reads
    whose line (newline included) is longer than 2 * `tag_length` get their
    first and last `tag_length` bases written as '>N_SEQUENCE/1' and
    '>N_SEQUENCE/2' records, where N is the running read number.
    """
    fasta = zopen(fasta_path, 'r')
    tags_5p = zopen(anchors_5p_path, 'w')
    tags_3p = zopen(anchors_3p_path, 'w')

    read_number = 0
    for line in fasta:
        first_char = line[0]
        if first_char == '+':
            next(fasta)   # skip the quality line as well
            continue
        if first_char in ('>', '@', '#'):
            continue

        read_number += 1
        if len(line) > 2 * tag_length:
            name = '%d_%s' % (read_number, line[:-1])
            head = line[0:tag_length]
            tail = line[-tag_length - 1:-1]
            tags_5p.write('>%s/1\n%s\n' % (name, head))
            tags_3p.write('>%s/2\n%s\n' % (name, tail))

    tags_5p.close()
    tags_3p.close()
def variant_statistics(vcf_path):
    """Print mutation counts per sample, per chromosome and per gene.

    A sample counts as mutated at a variant when its genotype index in
    `gt_symbols` is greater than 1. Three reports are printed:
    - total mutations per sample;
    - per-sample counts and percentages by chromosome (samples without any
      mutation are left out);
    - among the 100 genes mutated in the most samples, those hit in at
      least two samples. This needs a 'NEARBY_GENES' column with
      comma-separated names.
    """
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'):
            break
    headers = line.rstrip('\n').split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in headers else 'ALT') + 1
    samples = headers[sample_col:]
    nearby_gene_col = headers.index('NEARBY_GENES') \
        if 'NEARBY_GENES' in headers else None

    mutations_per_sample = np.zeros(len(samples))
    mutations_per_chr = defaultdict(lambda: np.zeros(len(samples)))
    mutations_per_gene = defaultdict(lambda: np.zeros(len(samples)))

    for line in vcf_file:
        cols = line.rstrip('\n').split('\t')
        gtypes = np.array([gt_symbols.index(gt.split(':')[0])
                           for gt in cols[sample_col:]])
        mutated = gtypes > 1
        mutations_per_sample += mutated
        mutations_per_chr[cols[0]] += mutated
        # "is not None": column index 0 is a valid position too.
        if nearby_gene_col is not None:
            for nearby in cols[nearby_gene_col].split(','):
                # An empty column would otherwise be counted as a gene
                # with an empty name.
                if nearby:
                    mutations_per_gene[nearby] += mutated

    print('Sample mutation counts:')
    for s, sample_name in enumerate(samples):
        print('%s: %d' % (sample_name, mutations_per_sample[s]))

    print('Mutations per chromosome:')
    chrs = natural_sorted(mutations_per_chr.keys())
    print('SAMPLE\t%s' % '\t'.join(chrs))
    for s, sample_name in enumerate(samples):
        total = sum(mutations_per_chr[chrom][s] for chrom in chrs)
        if total == 0:
            continue
        sys.stdout.write(sample_name)
        for chrom in chrs:
            count = mutations_per_chr[chrom][s]
            sys.stdout.write('\t%d (%.1f)' % (count, float(count) / total * 100))
        sys.stdout.write('\n')

    print('Top mutated genes:')
    top_genes = sorted(mutations_per_gene.items(),
                       key=lambda x: sum(x[1] > 0), reverse=True)
    for gene, counts in top_genes[0:100]:
        mut_samples = sum(counts > 0)
        if mut_samples < 2:
            continue
        print('%s\t%d samples' % (gene, mut_samples))
def annotate_variants(sv_path, bed_path):
    """Fill in the nearby-gene columns of a rearrangement file.

    Prints `sv_file_header`, then every line of `sv_path` that starts with
    'chr'. Columns 4 and 9 of those lines are replaced by the BED features
    within 100000 (as measured by `distance_to_gene`) of the first and
    second breakpoint. Each list is formatted as 'NAME (DIST)', closest
    first and joined with ', '. A ' (ENSG...)' suffix is removed from
    feature names.
    """
    ensg_suffix_re = re.compile(r' \(ENSG.*?\)')

    # Index features by chromosome and clean their names once, instead of
    # scanning and re-substituting the whole list twice per input line.
    features_by_chrom = {}
    bed_file = zopen(bed_path)
    try:
        for line in bed_file:
            c = line.rstrip().split('\t')
            name = ensg_suffix_re.sub('', c[3])
            features_by_chrom.setdefault(c[0], []).append(
                ((int(c[1]), int(c[2])), name))
    finally:
        bed_file.close()

    def describe_nearby(chrom, pos):
        nearby = []
        for interval, name in features_by_chrom.get(chrom, ()):
            dist = distance_to_gene(pos, interval)
            if dist < 100000:
                nearby.append((name, dist))
        nearby.sort(key=lambda x: x[1])
        return ', '.join(['%s (%d)' % f for f in nearby])

    print(sv_file_header)
    sv_file = zopen(sv_path)
    try:
        for line in sv_file:
            if not line.startswith('chr'):
                continue
            tokens = line[:-1].split('\t')
            # Parse both positions before rewriting any column.
            pos_1 = int(tokens[2])
            pos_2 = int(tokens[7])
            tokens[3] = describe_nearby(tokens[0], pos_1)
            tokens[8] = describe_nearby(tokens[5], pos_2)
            print('%s' % '\t'.join(tokens))
    finally:
        sv_file.close()