def variant_filter(vcf_path, nonsynonymous, no_1000g):
    """Print the rows of an annotated variant table that pass the filters.

    Leading lines starting with '#' are skipped. The first line that does
    not start with '#' is taken as the column header and echoed to standard
    output, followed by every data row that survives filtering.

    Args:
        vcf_path: Path of the tab-delimited variant file (opened with zopen).
        nonsynonymous: If true, keep only rows whose EXONIC_FUNCTION value
            starts with one of the accepted prefixes listed below.
        no_1000g: If true, discard rows whose 1000G column is non-empty.

    error() is called when a requested filter needs a column that the
    header does not contain.
    """
    accepted_functions = ('nonsynonymous', 'frameshift', 'stopgain',
                          'stoploss', 'nonframeshift')

    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'):
            break
    sys.stdout.write(line)
    headers = line[:-1].split('\t')

    # Only look up the columns that the requested filters actually use.
    # Looking them up unconditionally raised ValueError for files lacking a
    # column that was never going to be consulted.
    col_exonic_func = None
    if nonsynonymous:
        if 'EXONIC_FUNCTION' not in headers:
            error('Cannot find exonic function column.')
        col_exonic_func = headers.index('EXONIC_FUNCTION')

    col_1000g = None
    if no_1000g:
        if '1000G' not in headers:
            error('Cannot find 1000 Genomes column.')
        col_1000g = headers.index('1000G')

    for line in vcf_file:
        cols = line[:-1].split('\t')
        if nonsynonymous and \
                not cols[col_exonic_func].startswith(accepted_functions):
            continue
        if no_1000g and cols[col_1000g]:
            continue
        sys.stdout.write(line)
def swiss_igv(tsv_path, data_col, one_based=True):
    """Convert a tab-delimited table into IGV-style rows on standard output.

    The chromosome and position columns are located among the first
    `data_col` header fields by case-insensitive prefix match ('chrom' and
    'pos'); exactly one match of each is required. Every input row becomes
    'chromosome, start, end, -' followed by the columns from `data_col`
    onwards, where end is start + 1. When `one_based` is true the input
    position is decremented by one before being written.
    """
    tsv_file = zopen(tsv_path)
    headers = next(tsv_file)[:-1].split('\t')
    annotation_headers = headers[:data_col]

    def locate(prefix, message):
        # Index of the single annotation column whose name starts with
        # `prefix`, ignoring case.
        hits = []
        for idx, title in enumerate(annotation_headers):
            if re.match(prefix, title, re.I):
                hits.append(idx)
        if len(hits) != 1:
            error(message)
        return hits[0]

    chrom_col = locate('chrom', 'Cannot find chromosome column.')
    pos_col = locate('pos', 'Cannot find position column.')

    print('CHROMOSOME\tSTART\tEND\tFEATURE\t' + '\t'.join(headers[data_col:]))
    for row in tsv_file:
        fields = row[:-1].split('\t')
        chrom = fields[chrom_col]
        start = int(fields[pos_col])
        if one_based:
            start -= 1
        sys.stdout.write('%s\t%d\t%d\t-\t' % (chrom, start, start + 1))
        print('\t'.join(fields[data_col:]))
def discard_if_in_controls(vcf_path, control_samples, threshold):
    """Drop variants that are seen in at least `threshold` control samples.

    Lines starting with '##' are skipped; the next line is the header. A
    sample column is a control if any regular expression in
    `control_samples` matches its name. A control counts towards the
    threshold when its genotype index in `gt_symbols` is greater than 1.
    The header and all retained rows are written to standard output.
    """
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('##'):
            break

    headers = line.rstrip().split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in headers else 'ALT') + 1
    sample_names = headers[sample_col:]

    control = []
    for sample in sample_names:
        control.append(any(re.search(rx, sample) for rx in control_samples))

    if not any(control):
        error('No control samples found.')
    info('Using these %d control samples:' % sum(control))
    for sample, is_control in zip(sample_names, control):
        if is_control:
            info('- %s' % sample)

    sys.stdout.write(line)
    for line in vcf_file:
        calls = line.rstrip('\n').split('\t')[sample_col:]
        # The genotype symbol is everything before the first ':'.
        genotypes = [gt_symbols.index(call[:call.find(':')]) for call in calls]
        mutated_controls = 0
        for is_control, gt in zip(control, genotypes):
            if is_control and gt > 1:
                mutated_controls += 1
        if mutated_controls < threshold:
            sys.stdout.write(line)
def fasta_remove_adapters(fasta_path, adapter):
    """Trim an adapter sequence from the reads of a FASTA or FASTQ file.

    The first five adapter bases must be present for a match; each later
    base is optional, but only as an unbroken prefix of the remaining
    adapter. A read is cut at the start of the first match. For FASTQ
    records (header starting with '@') the quality line is cut to the same
    length. Lines starting with '#' are copied through unchanged. Output
    goes to standard output.

    Args:
        fasta_path: Path of the input file (opened with zopen).
        adapter: Adapter sequence; error() is called if it is shorter than
            five characters.
    """
    # Convert the adapter into a regular expression of the form
    # AAAAA(?:B(?:C(?:D)?)?)? so that a truncated adapter still matches.
    if len(adapter) < 5:
        error('Adapter sequence is too short.')
    optional_bases = adapter[5:]
    pattern = (adapter[:5] +
               ''.join('(?:' + base for base in optional_bases) +
               ')?' * len(optional_bases))
    adapter_re = re.compile(pattern)
    # Log the pattern text; formatting the compiled object would print its
    # repr rather than the regular expression itself.
    info('Adapter regular expression: %s' % pattern)

    fasta = zopen(fasta_path)
    for line in fasta:
        if line[0] == '#':
            sys.stdout.write(line)
        elif line[0] == '>':
            sys.stdout.write(line)
            seq = next(fasta)[:-1]
            m = adapter_re.search(seq)
            if m:
                seq = seq[:m.start()]
            print(seq)
        elif line[0] == '@':
            sys.stdout.write(line)
            seq = next(fasta)[:-1]
            m = adapter_re.search(seq)
            trim_len = m.start() if m else len(seq)
            print(seq[:trim_len])
            sys.stdout.write(next(fasta))      # separator line
            print(next(fasta)[:trim_len])      # qualities, same length
def swiss_download_sra(sra_study):
    """Recursively download an SRA study from the NCBI FTP server.

    The study identifier must start with 'SRP'. The files are fetched with
    ncftpget into the current directory from the ByStudy tree, whose
    intermediate directory is the first six characters of the identifier.
    """
    if not sra_study.startswith('SRP'):
        error('SRA study identifier must begin with "SRP".')

    ncftpget = '/data/csb/tools/ncftp-3.2.5/bin/ncftpget -R -v '
    server_and_target = 'ftp-trace.ncbi.nlm.nih.gov ./ '
    remote_dir = '/sra/sra-instant/reads/ByStudy/sra/SRP/%s/%s' % (
        sra_study[:6], sra_study)
    shell(ncftpget + server_and_target + remote_dir)
def sam_reads_raw(bam_path, out_prefix):
    """Extract raw read sequences from a BAM file into gzipped text files.

    Paired reads (names ending in '/1' and '/2') are written line-aligned
    to '<out_prefix>_1.reads.gz' and '<out_prefix>_2.reads.gz'. Reads
    without a mate suffix, and mates whose partner never shows up, are
    written to '<out_prefix>.reads.gz'. Only sequences are written, one
    per line; names and qualities are dropped.
    """
    out_1 = zopen('%s_1.reads.gz' % out_prefix, 'w')
    out_2 = zopen('%s_2.reads.gz' % out_prefix, 'w')
    out = zopen('%s.reads.gz' % out_prefix, 'w')

    # Mates still waiting for their partner, keyed by read name.
    reads_1 = {}
    reads_2 = {}

    # The "samtools bam2fq" command does not output supplementary or
    # secondary alignments. Each read only has one primary alignment.
    options = '-n' if has_mate_suffixes(bam_path) else ''
    bam2fq = shell_stdout('samtools bam2fq %s %s' % (options, bam_path))
    for line in bam2fq:
        if line[0] != '@':
            error('Invalid bam2fq output.')
        line = line[:-1]
        if line.endswith('/1'):
            segname = line[1:-2]
            mate = reads_2.pop(segname, None)
            if mate:
                out_1.write(next(bam2fq))
                out_2.write('%s\n' % mate)
            else:
                reads_1[segname] = next(bam2fq)[:-1]
        elif line.endswith('/2'):
            segname = line[1:-2]
            mate = reads_1.pop(segname, None)
            if mate:
                out_1.write('%s\n' % mate)
                out_2.write(next(bam2fq))
            else:
                reads_2[segname] = next(bam2fq)[:-1]
        else:
            out.write('%s\n' % next(bam2fq)[:-1])

        # Skip per-base qualities. They can start with '@'.
        next(bam2fq)
        next(bam2fq)

    # list(d) and d.values() work on both Python 2 and 3, unlike
    # d.keys()[:5] and d.itervalues().
    info('Found %d orphan first mates.' % len(reads_1))
    for read_id in list(reads_1)[:5]:
        info('- Example: %s' % read_id)
    info('Found %d orphan second mates.' % len(reads_2))
    for read_id in list(reads_2)[:5]:
        info('- Example: %s' % read_id)

    for read in reads_1.values():
        out.write('%s\n' % read)
    for read in reads_2.values():
        out.write('%s\n' % read)

    out_1.close()
    out_2.close()
    out.close()
def read_length(bam_path):
    """Return the common read length of a SAM/BAM file.

    Only the first 100 alignments returned by read_sam() are inspected.
    error() is called if they differ in length, or if the file yields no
    alignments at all.
    """
    read_lens = []
    for al in read_sam(bam_path, 'a'):
        if len(read_lens) >= 100:
            break
        read_lens.append(len(al[9]))    # Field 10 is the read sequence

    if not read_lens:
        # Report empty input explicitly instead of failing on read_lens[0].
        error('SAM file contains no reads.')
        return None
    if len(set(read_lens)) > 1:
        error('SAM file contains reads of varying length.')
        return None
    return read_lens[0]
def filter_by_region(sv_path, region):
    """Print rearrangements that have an endpoint inside a genomic region.

    Args:
        sv_path: Path of the tab-delimited rearrangement file. Columns 1
            and 3 hold the chromosome and position of the first endpoint,
            columns 6 and 8 those of the second endpoint.
        region: Region of the form 'chrN:start-end' (spaces around the
            numbers are tolerated). Both bounds are inclusive.

    Lines that do not start with 'chr' (such as the header) are copied to
    standard output unchanged.
    """
    m = re.match(r'(chr.+): *(\d+) *- *(\d+)', region.strip())
    if not m:
        error('Invalid region specified.')
    chrom = m.group(1)
    start = int(m.group(2))
    end = int(m.group(3))

    for line in zopen(sv_path):
        if not line.startswith('chr'):
            sys.stdout.write(line)
            continue

        c = line.rstrip().split('\t')
        # Each endpoint must match on both chromosome and position. Testing
        # them independently let an endpoint on another chromosome satisfy
        # the position range.
        first_inside = c[0] == chrom and start <= int(c[2]) <= end
        second_inside = c[5] == chrom and start <= int(c[7]) <= end
        if first_inside or second_inside:
            sys.stdout.write(line)
def variant_merge(vcf_paths):
    """Merge several variant tables into one table on standard output.

    All rows are tagged with the index of the file they came from and
    piped through an external 'sort'. Rows sharing the same first four
    columns are then collapsed into one output row that carries the
    annotation columns of the first such row plus the genotype columns of
    every input file. Samples of files that lack the variant get ':0:0'.

    The annotation columns are the first four, or everything up to and
    including ESP6500 when that column exists. They must be identical in
    every input file, otherwise error() is called.
    """
    sort_in, sort_out = shell_stdinout('sort -k2,2 -k3,3n -k4,4 -k5,5')
    cons_headers = []    # Consensus headers
    vcf_samples = []     # Sample names of each VCF
    for vcf_index, vcf_path in enumerate(vcf_paths):
        info('Merging VCF file %s...' % vcf_path)
        vcf = zopen(vcf_path)
        for line in vcf:
            if not line.startswith('#'):
                break
        headers = line.rstrip('\n').split('\t')
        gtype_col = (4 if 'ESP6500' not in headers
                     else headers.index('ESP6500') + 1)
        if not cons_headers:
            cons_headers = headers[:gtype_col]
        if cons_headers != headers[:gtype_col]:
            error('Header mismatch!')
        vcf_samples.append(headers[gtype_col:])
        for line in vcf:
            sort_in.write('%d\t%s' % (vcf_index, line))
    sort_in.close()

    print('\t'.join(cons_headers + sum(vcf_samples, [])))
    vcf_sample_counts = [len(samples) for samples in vcf_samples]
    S = sum(vcf_sample_counts)
    # Offset of each file's first sample within the merged genotype columns.
    vcf_sample_col = [sum(vcf_sample_counts[0:k])
                      for k in range(len(vcf_samples))]

    info('Merged VCF will contain:')
    info('- %d header columns' % len(cons_headers))
    for samples, path in zip(vcf_samples, vcf_paths):
        info('- %d columns from %s' % (len(samples), path))

    prev = None        # Annotation columns of the variant being assembled
    prev_key = None    # Its identity: the first four columns
    calls = [':0:0'] * S
    for line in sort_out:
        cols = line.rstrip('\n').split('\t')
        vcf_index = int(cols[0])
        call_col = vcf_sample_col[vcf_index]
        # Compare only the four identity columns. Comparing them against
        # the full annotation list never matched when ESP6500 annotations
        # were present, so no rows were ever merged.
        key = cols[1:5]
        if key != prev_key:
            if prev is not None:
                print('\t'.join(prev + calls))
            prev = cols[1:gtype_col + 1]
            prev_key = key
            calls = [':0:0'] * S
        calls[call_col:call_col + vcf_sample_counts[vcf_index]] = \
            cols[gtype_col + 1:]

    # Handle the last line (there is none if the inputs had no variants).
    if prev is not None:
        print('\t'.join(prev + calls))
def variant_signature(vcf_path, genome_path):
    """Tabulate single-base substitutions by trinucleotide context.

    For every single-base substitution, the reference base and its two
    neighbours are read from the genome. When the reference base is A or
    G, both the context and the substitution are reverse-complemented, so
    the table only has C and T as the reference base (96 rows). A sample
    contributes to a row when its genotype index in `gt_symbols` is
    greater than 1. The table is printed with one column per sample.

    Args:
        vcf_path: Path of the variant file (opened with zopen).
        genome_path: Path of the reference genome FASTA file.
    """
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'):
            break

    chromosomes = read_fasta(genome_path)
    headers = line.rstrip().split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in headers else 'ALT') + 1
    samples = headers[sample_col:]

    substitutions = []
    for ref in 'CT':
        for alt in ('AGT' if ref == 'C' else 'ACG'):
            for pre in 'ACGT':
                for post in 'ACGT':
                    substitutions.append(pre + ref + post + '>' +
                                         pre + alt + post)
    sub_count = np.zeros((len(substitutions), len(samples)))

    bases = ('A', 'C', 'G', 'T')
    for line in vcf_file:
        cols = line[:-1].split('\t')
        # Keep single-base substitutions only. A substring test against
        # 'ACGT' also accepted '' and multi-base alleles such as 'AC',
        # which then tripped the reference check below.
        if cols[2] not in bases or cols[3] not in bases:
            continue
        chrom_seq = chromosomes[cols[0]]
        pos = int(cols[1])
        if chrom_seq[pos - 1] != cols[2]:
            error('Reference mismatch!')
        ref = chrom_seq[pos - 2:pos + 1]
        alt = ref[0] + cols[3] + ref[2]
        if ref[1] in 'AG':
            ref = revcomplement(ref)
            alt = revcomplement(alt)
        for s, gt in enumerate(cols[sample_col:]):
            if gt_symbols.index(gt.split(':')[0]) > 1:
                sub_count[substitutions.index(ref + '>' + alt), s] += 1

    print('SUBSTITUTION\t%s' % '\t'.join(samples))
    for row, sub in enumerate(substitutions):
        sys.stdout.write(sub)
        for count in sub_count[row, :]:
            sys.stdout.write('\t%d' % count)
        sys.stdout.write('\n')
def sam_unaligned_reads(bam_path):
    """Write the unaligned reads of a BAM file to standard output.

    Reads with flag 0x4 set and flags 0x900 unset are selected with
    samtools. If the BAM file has base qualities, the bam2fq output is
    passed through as is; otherwise each four-line record is reduced to a
    two-line FASTA record.
    """
    # The "samtools bam2fq" command does not output supplementary or
    # secondary alignments. Each read has max 1 primary alignment.
    if has_mate_suffixes(bam_path):
        options = '-n'
    else:
        options = ''
    pipeline = 'samtools view -u -f 0x4 -F 0x900 %s | samtools bam2fq %s -' % (
        bam_path, options)

    if has_base_qualities(bam_path):
        shell(pipeline)
        return

    records = shell_stdout(pipeline)
    for name_line in records:
        if name_line[0] != '@':
            error('Invalid bam2fq output.')
        sys.stdout.write('>')
        sys.stdout.write(name_line[1:])
        sys.stdout.write(next(records))
        # Skip per-base qualities. They can start with '@'.
        next(records)
        next(records)
def variant_heterozygous_concordance(vcf_path, kgenomes_path, test_rx, ref_rx):
    """Report how often a test sample agrees with heterozygous reference calls.

    Only variants whose position is listed in the known-SNP file are
    considered. Among those where the reference sample has genotype index
    2 in `gt_symbols`, the share where the test sample also has index 2 is
    printed.

    Args:
        vcf_path: Path of the variant file (opened with zopen).
        kgenomes_path: Tab-delimited file of known SNPs with the position
            in its second column.
        test_rx: Regular expression matching exactly one sample column.
        ref_rx: Regular expression matching exactly one sample column.
    """
    # Positions are looked up without regard to chromosome.
    is_snp = np.zeros(300 * 1000 * 1000, np.bool_)
    for line in zopen(kgenomes_path):
        pos = int(line[:-1].split('\t')[1])
        is_snp[pos] = True

    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'):
            break

    headers = line[:-1].split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in headers else 'ALT') + 1

    # enumerate() is required to get (index, name) pairs; iterating over
    # the header list directly tried to unpack each column name.
    test_col = [i for i, h in enumerate(headers)
                if i >= sample_col and re.search(test_rx, h)]
    ref_col = [i for i, h in enumerate(headers)
               if i >= sample_col and re.search(ref_rx, h)]
    if len(test_col) != 1:
        error('Test sample not found.')
    if len(ref_col) != 1:
        error('Reference sample not found.')
    # Collapse the one-element lists into plain column indices.
    test_col = test_col[0]
    ref_col = ref_col[0]

    total_hetz_in_ref = 0
    total_concordant = 0
    for line in vcf_file:
        cols = line[:-1].split('\t')
        if not is_snp[int(cols[1])]:
            continue
        test = cols[test_col]
        test_gt = gt_symbols.index(test[:test.find(':')])
        ref = cols[ref_col]
        ref_gt = gt_symbols.index(ref[:ref.find(':')])
        if ref_gt == 2:
            total_hetz_in_ref += 1
            total_concordant += (test_gt == 2)

    # Avoid a ZeroDivisionError when no heterozygous sites were found.
    if total_hetz_in_ref:
        percent = float(total_concordant) / total_hetz_in_ref * 100
    else:
        percent = float('nan')
    print('Concordance was %.1f%% (%d / %d).' % (
        percent, total_concordant, total_hetz_in_ref))
def parallel(command, job_name, max_workers, cpus, memory, partition,
             time_limit):
    """Run a command once per target, locally or through SLURM.

    Targets are read from standard input as whitespace-delimited items. If
    standard input is a terminal, the command is run once with an empty
    target and must not reference $x. The command and the target list are
    written to '~/.jobs/<job_name>_<timestamp>/tasks', after which up to
    `max_workers` workers are started and waited for.

    Args:
        command: Shell command; newlines are replaced with spaces.
        job_name: Name used for the log directory and the SLURM job.
        max_workers: Upper limit on concurrently started workers.
        cpus: CPUs per job (SLURM only).
        memory: Memory per job in GB (SLURM only).
        partition: SLURM partition name, or 'local' for this machine.
        time_limit: Passed to the sbatch template multiplied by 60.
    """
    # Allow splitting the command string onto multiple lines.
    command = command.replace('\n', ' ')

    if sys.stdin.isatty():
        # If the user did not provide any input, just run the command once.
        # The command must not contain $x.
        if '$x' in command or '${x' in command:
            error('Command contains $x but no targets provided.')
        targets = ['']
    else:
        # Parse whitespace-delimited target items from standard input.
        # str.split() without arguments handles tabs and repeated spaces
        # and never yields empty items for blank lines.
        targets = []
        for line in sys.stdin:
            targets += line.split()
        if not targets:
            error('Command requires targets but none provided.')

    if len(set(targets)) < len(targets):
        # Collect duplicates in one pass, in order of first repetition.
        seen = set()
        repeated = []
        for target in targets:
            if target in seen and target not in repeated:
                repeated.append(target)
            seen.add(target)
        error('Target list contains multiple instances of the following targets:\n' +
              '\n'.join(repeated))

    if max_workers > len(targets):
        max_workers = len(targets)

    if partition != 'local':
        info('Distributing %d %s named "%s" on %s partition '
             '(with %d %s and %d GB of memory per job).' % (
                 len(targets), 'jobs' if len(targets) != 1 else 'job',
                 job_name, partition, cpus,
                 'CPUs' if cpus != 1 else 'CPU', memory))
    else:
        info('Starting %d %s named "%s" on local machine.' % (
            len(targets), 'jobs' if len(targets) != 1 else 'job', job_name))

    log_dir = os.path.expanduser('~/.jobs/%s_%s' % (
        job_name, datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')))
    os.makedirs(log_dir)

    # The task file holds the command on its first line and then one
    # target per line.
    with open('%s/tasks' % log_dir, 'w') as f:
        f.write('%s\n' % command)
        for target in targets:
            f.write('%s\n' % target)

    if partition == 'local':
        worker_cmd = ['parallel', 'worker', log_dir]
        workers = [subprocess.Popen(worker_cmd) for w in range(max_workers)]
        for w in workers:
            w.wait()
    else:
        # Run the job steps on a SLURM cluster using sbatch.
        # Required memory is given in GB per job step. Convert to MB per CPU.
        mem_per_cpu = round(float(memory) / cpus * 1000)
        sbatch_script = sbatch_template % (
            partition, job_name, cpus, mem_per_cpu, 60 * time_limit,
            log_dir, log_dir, log_dir)
        workers = [subprocess.Popen(['sbatch', '-Q'], stdin=subprocess.PIPE)
                   for p in range(max_workers)]
        for w in workers:
            w.stdin.write(sbatch_script.encode('utf-8'))
            w.stdin.close()
        for w in workers:
            w.wait()
def bed_composite(bed_path):
    """Merge the overlapping segments of each named BED feature.

    Rows sharing the same name (fourth column) are grouped together, and
    segments of a group that overlap or touch are fused into one. The
    resulting segments are printed as 'chromosome, start, end, name'. All
    rows of a feature must be on the same chromosome, otherwise error()
    is called. Lines starting with '#' are ignored.
    """
    features = {}    # name -> [chromosome, list of (start, end) segments]
    for line in zopen(bed_path):
        if line.startswith('#'):
            continue
        c = line.rstrip('\n').split('\t')
        chrom, start, end, name = c[0], int(c[1]), int(c[2]), c[3]
        feature = features.setdefault(name, [chrom, []])
        if chrom != feature[0]:
            error('Chromosome mismatch.')

        # Fuse the new segment with every stored segment it overlaps and
        # keep the remaining segments untouched.
        disjoint = []
        for seg in feature[1]:
            if end >= seg[0] and start <= seg[1]:
                start = min(start, seg[0])
                end = max(end, seg[1])
            else:
                disjoint.append(seg)
        disjoint.append((start, end))
        feature[1] = disjoint

    # dict.items() works on both Python 2 and 3, unlike iteritems().
    for name, feature in features.items():
        for seg in feature[1]:
            print('%s\t%d\t%d\t%s' % (feature[0], seg[0], seg[1], name))
def filter_variants(sv_path, min_reads, blacklist_path=None):
    """Filter rearrangements by read support and blacklisted loci.

    Args:
        sv_path: Path of the uncompressed rearrangement file. The first
            line is a header. Columns 11 and 12 hold the two read counts
            that the rules are applied to.
        min_reads: Rules of the form 'A-B-C'. A row passes a rule if
            column 11 >= A, column 12 >= B and their sum >= C. A row is
            kept if it passes at least one rule.
        blacklist_path: Optional file with one locus identifier per line.

    The header and all retained rows are written to standard output.
    """
    read_rules = [r.split('-') for r in min_reads]
    for k, r in enumerate(read_rules):
        if len(r) != 3:
            error('Invalid minimum read rule %s specified.' % min_reads[k])
    # Parse the thresholds once instead of once per input row.
    thresholds = [[int(x) for x in rule] for rule in read_rules]

    blacklist = set()
    if blacklist_path:
        with open(blacklist_path) as blacklist_file:
            blacklist = set(x.rstrip('\n') for x in blacklist_file)

    # The with-statement closes the input even if a row fails to parse.
    with open(sv_path) as sv_file:
        sys.stdout.write(next(sv_file))    # Header
        for line in sv_file:
            tokens = line.rstrip('\n').split('\t')
            count_a = int(tokens[10])
            count_b = int(tokens[11])
            if not any(count_a >= t[0] and count_b >= t[1] and
                       count_a + count_b >= t[2] for t in thresholds):
                continue

            loci_1 = set(sv_locus_identifiers(tokens[0], int(tokens[2])))
            loci_2 = set(sv_locus_identifiers(tokens[5], int(tokens[7])))

            # We discard a rearrangement if *both* endpoints are located
            # in blacklisted regions.
            if loci_1.isdisjoint(blacklist) or loci_2.isdisjoint(blacklist):
                sys.stdout.write(line)
def somatic(vcf_path, sample_pairs):
    """Keep variants that look somatic in at least one test/control pair.

    Each element of `sample_pairs` is a 'test,control' string naming two
    sample columns. A variant is somatic for a pair when the genotype
    index of the test sample in `gt_symbols` is at least 2 and that of
    the control sample is exactly 1. The header and all retained rows are
    written to standard output.
    """
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('##'):
            break

    headers = line.rstrip().split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in headers else 'ALT') + 1
    samples = headers[sample_col:]

    # Convert sample pair names into index 2-tuples.
    sample_pairs = [pair.split(',') for pair in sample_pairs]
    malformed = [pair for pair in sample_pairs if len(pair) != 2]
    if malformed:
        info(malformed)
        error('Test and control samples must be in "test,control" format.')
    for test_name, control_name in ((p[0], p[1]) for p in sample_pairs):
        if test_name not in samples:
            error('Test sample %s was not found in VCF file.' % test_name)
        if control_name not in samples:
            error('Control sample %s was not found in VCF file.' % control_name)
    sample_pairs = [(samples.index(p[0]), samples.index(p[1]))
                    for p in sample_pairs]

    sys.stdout.write(line)
    for line in vcf_file:
        gt_cols = line.rstrip('\n').split('\t')[sample_col:]
        # The genotype symbol is everything before the first ':'.
        genotypes = [gt_symbols.index(g[:g.find(':')]) for g in gt_cols]
        is_somatic = [genotypes[test] >= 2 and genotypes[control] == 1
                      for test, control in sample_pairs]
        if any(is_somatic):
            sys.stdout.write(line)
def calculate_BAF(bam_paths, genome_path, kgenomes_path, options):
    """Print B-allele fractions of a tumor at the normal's heterozygous sites.

    The first BAM file is the tumor sample and the second the normal
    sample. For every allele at which the normal sample is called
    heterozygous (genotype 2 from call_genotypes) with at least 15 total
    reads, one row 'CHROM, POSITION, REF, ALT, BAF' is written, where BAF
    is the tumor's alternate read count divided by its total read count,
    or 'NaN' if the tumor has no reads there.

    Args:
        bam_paths: BAM files; tumor first, normal second.
        genome_path: Path of the genome FASTA file.
        kgenomes_path: Passed through to simple_pileup().
        options: Object providing region, ignore_mapq, min_mapq, keep_all
            and min_hetz_reads; it is also passed to call_genotypes().
    """
    min_normal_reads = 15    # Minimum total reads in the normal sample

    if not os.path.exists(genome_path):
        error('Could not find genome FASTA file %s.' % genome_path)
    if len(bam_paths) < 2:
        # Both samples are indexed below; fail early with a clear message.
        error('A tumor and a normal BAM file are required, in that order.')

    if options.region:
        for bam_path in bam_paths:
            if not os.path.exists(bam_path + '.bai'):
                error('No index found for BAM file %s.' % bam_path)

    samples = [os.path.basename(p).replace('.bam', '') for p in bam_paths]
    print('CHROM\tPOSITION\tREF\tALT\t%s' % samples[0])

    ignore_mapq = [False] * len(samples)
    if options.ignore_mapq:
        for s, sample in enumerate(samples):
            if re.search(options.ignore_mapq, sample) is not None:
                ignore_mapq[s] = True
                info('Ignoring mapping quality for sample %s.' % sample)

    for line in simple_pileup(bam_paths, genome_path, kgenomes_path,
                              min_mapq=options.min_mapq,
                              min_alt_alleles=(0 if options.keep_all
                                               else options.min_hetz_reads),
                              region=options.region):
        if isinstance(line, bytes):
            line = line.decode('utf8')

        tokens = line[:-1].split('\t')
        if len(tokens) < 3:
            error('Invalid spileup line:\n%s' % line)
        if tokens[2] == 'N':
            continue

        # Each sample's pileup is a flat list of (allele, count, count)
        # triplets. The second count is only added for samples whose
        # mapping quality is ignored.
        pileups = [p.split(' ') for p in tokens[3:]]
        total_reads = [0] * len(samples)
        allele_reads = defaultdict(lambda: [0] * len(samples))
        for s, pileup in enumerate(pileups):
            if len(pileup) < 3:
                continue
            for a in range(0, len(pileup), 3):
                count = int(pileup[a + 1]) + \
                    (int(pileup[a + 2]) if ignore_mapq[s] else 0)
                total_reads[s] += count
                if pileup[a] != '.':
                    allele_reads[pileup[a]][s] = count

        # Call genotypes for each allele.
        for alt, reads in allele_reads.items():
            genotypes = call_genotypes(reads, total_reads, options)

            # Keep only sites where the normal sample is heterozygous and
            # sufficiently covered.
            if genotypes[1] != 2:
                continue
            if total_reads[1] < min_normal_reads:
                continue

            # Reformat indels in VCF4 format
            ref = tokens[2]
            if len(alt) >= 2:
                if alt[1] == '+':      # Insertion
                    alt = (ref if alt[0] == '.' else alt[0]) + alt[2:]
                elif alt[1] == '-':    # Deletion
                    ref += alt[2:]
                    alt = (ref[0] if alt[0] == '.' else alt[0])

            # B-allele fraction of the tumor sample.
            tumor_total = total_reads[0]
            if tumor_total == 0:
                baf = 'NaN'
            else:
                baf = '%.2f' % (float(reads[0]) / tumor_total)
            sys.stdout.write('\t'.join(
                [tokens[0], tokens[1], ref, alt.upper(), baf]))
            sys.stdout.write('\n')
def visualize_splicing(genes, fastq_prefix, out_prefix):
    """Align reads against candidate exon junctions and draw their usage.

    For the exons of the requested genes, every donor/acceptor combination
    is turned into a junction sequence. A Bowtie index is built from these
    sequences, the reads of '<fastq_prefix>_1.fq.gz' and
    '<fastq_prefix>_2.fq.gz' are aligned against it, and the reads per
    junction are counted. Junction usage is printed, and an exon map with
    the junctions drawn as arcs is saved to '<out_prefix>.svg'.

    Args:
        genes: Comma-separated gene names (spaces are ignored). All genes
            must be located on the same chromosome.
        fastq_prefix: Prefix of the paired gzipped FASTQ files.
        out_prefix: Prefix for the intermediate and output files.

    NOTE(review): the genome and exon annotation paths, the read length
    (90) and the minimum anchor length (15) are hard-coded.
    """
    genome_path = '/data/csb/organisms/homo_sapiens/hg19_flat'
    bed_path = '/data/csb/organisms/homo_sapiens/ensembl_68/exons.bed'
    genes = genes.replace(' ', '').split(',')
    min_anchor = 15
    read_len = 90
    # Bases taken from each side of a junction.
    trim = read_len - min_anchor
    # NOTE(review): genome_path above is never used; the same path is
    # repeated here as a literal.
    chromosomes = read_flat_seq('/data/csb/organisms/homo_sapiens/hg19_flat')
    donors = []
    acceptors = []
    exons = []
    for line in zopen(bed_path):
        cols = line[:-1].split('\t')
        if cols[3] in genes:
            chr = cols[0] if cols[0].startswith('chr') else 'chr'+cols[0]
            chr_seq = chromosomes[chr]
            # The BED start is incremented by one, giving (first, last)
            # exon positions that are used as 1-based below.
            pos = (int(cols[1])+1, int(cols[2]))
            # Each entry is (chromosome, strand, position, sequence). On
            # the '-' strand the roles of the two exon ends are swapped
            # and the sequence is reverse-complemented.
            if cols[5] == '+':
                acceptors.append((chr, '+', pos[0], chr_seq[pos[0]-1:pos[0]-1+trim]))
                donors.append((chr, '+', pos[1], chr_seq[pos[1]-trim:pos[1]]))
            elif cols[5] == '-':
                acceptors.append((chr, '-', pos[1], revcomplement(chr_seq[pos[1]-trim:pos[1]])))
                donors.append((chr, '-', pos[0], revcomplement(chr_seq[pos[0]-1:pos[0]-1+trim])))
            exons.append(pos)

    # Remove duplicate acceptors and donors.
    acceptors = list(set(acceptors))
    donors = list(set(donors))
    exons = list(set(exons))

    # Calculate the contiguous genomic sequence
    chr = acceptors[0][0]
    if any(a[0] != chr for a in acceptors):
        error('Genes must be in the same chromosome!')
    # Drawing window: the acceptor positions padded by 2000 on both sides.
    genome_window = (min(a[2] for a in acceptors)-2000, max(a[2] for a in acceptors)+2000)
    #contig = chromosomes[chr][genome_window[0]:genome_window[1]]

    # Calculate junction sequences
    class Junction(object):
        # Candidate junction: name, sequence, aligned read count and share
        # of its donor's reads.
        def __init__(self, name, seq):
            self.name = name
            self.sequence = seq
            self.reads = 0
            self.ratio = 0

    junctions = defaultdict(list)   # Group junctions by donor
    for left in donors:
        for right in acceptors:
            # Name format: '<donor pos>[<strand>]_<acceptor pos>[<strand>]'.
            name = '%d[%s]_%d[%s]' % (left[2], left[1], right[2], right[1])
            junctions[left].append(Junction(name, left[3] + right[3]))
    print('Generated %d junctions.' % (len(donors) * len(acceptors)))

    # Build Bowtie index
    index_fasta_path = '%s_ref.fa' % out_prefix
    index = open(index_fasta_path, 'w')
    #index.write('>contig\n%s\n' % contig)
    for donor in junctions:
        for junc in junctions[donor]:
            index.write('>%s\n%s\n' % (junc.name, junc.sequence))
    index.close()
    shell('/data/csb/tools/bowtie-0.12.9/bowtie-build -q %s %s_index' % (index_fasta_path, out_prefix))

    # Align reads against junctions and tally junction read counts.
    shell('bowtie -v1 -B1 -p8 %s_index <(gunzip -c %s_1.fq.gz %s_2.fq.gz) '
        '> %s.bowtie' % (out_prefix, fastq_prefix, fastq_prefix, out_prefix))
    junction_by_name = {}
    for donor in junctions:
        for j in junctions[donor]:
            junction_by_name[j.name] = j
    for line in open('%s.bowtie' % out_prefix):
        cols = line[:-1].split('\t')
        # The third column is used as the junction name; names without an
        # underscore are skipped.
        if not '_' in cols[2]: continue
        junction_by_name[cols[2]].reads += 1

    # Calculate junction power relative to all outgoing links from donor
    for donor in junctions:
        total = sum(j.reads for j in junctions[donor])
        if total <= 0: continue
        for j in junctions[donor]:
            j.ratio = float(j.reads) / total
            if j.reads > 0:
                print('%s: %.1f%% (%d)' % (j.name, j.ratio*100, j.reads))

    # Check which exons actually participate in the mature transcripts
    active_edges = []
    for donor in junctions:
        for j in junctions[donor]:
            # Junctions with under 5% of their donor's reads are ignored.
            if j.ratio < 0.05: continue
            # x[:-3] drops the three-character '[+]' / '[-]' strand suffix.
            active_edges += [int(x[:-3]) for x in j.name.split('_')]
    # Each exon becomes [first, last, active flag].
    exons = [[ex[0], ex[1], False] for ex in exons]
    ties = []
    for edge in set(active_edges):
        matches = [ex for ex in exons if edge in ex]
        if len(matches) == 1: matches[0][2] = True   # Unique match, mark active
        if len(matches) > 1: ties.append(matches)
    for tie in ties:
        if not any(ex[2] for ex in tie):
            for ex in tie: ex[2] = True   # If still tied, mark all tied active

    # Print exon map
    from svgfig import Rect, Frame, Poly
    # Inactive exons are drawn in 'whitesmoke', active exons in black.
    rects = [Rect(ex[0], 1, ex[1], 2, stroke='none', fill='whitesmoke', stroke_linejoin='miter') for ex in exons if not ex[2]]
    rects += [Rect(ex[0], 1, ex[1], 2, stroke='none', fill='black', stroke_linejoin='miter') for ex in exons if ex[2]]
    lines = []
    for donor in junctions:
        for j in junctions[donor]:
            start, end = [int(x[:-3]) for x in j.name.split('_')]
            # One polyline per junction, peaking midway; its opacity is
            # the junction's read share.
            lines.append(Poly([(start,2), ((start+end)/2,3), (end,2)], stroke_opacity=j.ratio))
    Frame(genome_window[0], genome_window[1], 0, 10, *(rects+lines), width=500).SVG().save('%s.svg' % out_prefix)

    # Remove the temporary Bowtie index and junction FASTA file.
    shell('rm %s_index.* %s_ref.fa' % (out_prefix, out_prefix))
def variant_top_mutated_regions(vcf_path, region_size):
    """Print the regions mutated in the largest number of samples.

    The genome is covered with windows of `region_size` bases that overlap
    by half their size. A sample is mutated in a window if it has a
    genotype index above 1 in `gt_symbols` for any variant inside it.
    Windows holding variants at two or more distinct positions are
    printed in descending order of their mutated sample count; windows
    with fewer than two mutated samples are not reported.

    Args:
        vcf_path: Path of the variant file (opened with zopen; read twice).
        region_size: Window size in bases; must be even.
    """
    if region_size % 2:
        error('Region size must be divisible by two.')
    # Floor division keeps bin indices and array shapes integral on
    # Python 3 as well as Python 2.
    step = region_size // 2

    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'):
            break
    headers = line.rstrip('\n').split('\t')
    sample_col = headers.index('ESP6500' if 'ESP6500' in headers else 'ALT') + 1
    samples = headers[sample_col:]

    # Construct chromosome map
    chr_sizes = defaultdict(int)
    for line in vcf_file:
        cols = line.rstrip('\n').split('\t')
        chr_sizes[cols[0]] = max(chr_sizes[cols[0]], int(cols[1]))
    vcf_file.close()

    mutated = {}        # Which samples are mutated in each bin
    variant_pos = {}    # Position of variant in bin, -1 if various
    for chrom in chr_sizes:
        n_bins = chr_sizes[chrom] // step + 1
        mutated[chrom] = np.zeros((n_bins, len(samples)), dtype=bool)
        variant_pos[chrom] = np.zeros(n_bins, dtype=np.int32)

    def note_site(vpos, idx, pos):
        # Record a variant position in a bin: 0 means no variant so far,
        # -1 means several distinct positions. Once a bin is -1 it stays
        # -1; previously a third variant reset it to a single position.
        if vpos[idx] == 0:
            vpos[idx] = pos
        elif vpos[idx] != pos:
            vpos[idx] = -1

    # Reopen VCF file (might be compressed), identify columns
    vcf_file = zopen(vcf_path)
    for line in vcf_file:
        if not line.startswith('#'):
            break

    # Tally mutated samples in each region
    print('Tallying mutated samples...')
    for line in vcf_file:
        cols = line.rstrip('\n').split('\t')
        pos = int(cols[1])
        idx = (pos - 1) // step
        # Windows overlap by half, so a variant falls into the window
        # starting at its bin and into the one starting at the bin before.
        vpos = variant_pos[cols[0]]
        note_site(vpos, idx, pos)
        if idx > 0:
            note_site(vpos, idx - 1, pos)

        mut = mutated[cols[0]]
        for s, gt in enumerate(cols[sample_col:]):
            if gt_symbols.index(gt.split(':')[0]) <= 1:
                continue
            mut[idx, s] = True
            if idx > 0:
                mut[idx - 1, s] = True

    # Convert mutation bitmasks into counts
    print('Convert to counts...')
    for chrom in mutated:
        mutated[chrom] = mutated[chrom].sum(axis=1)

    # Print regions in descending order starting with highest recurrence
    print('Find maximum...')
    highest = 0
    for chrom in mutated:
        highest = max(highest, int(mutated[chrom].max()))

    print('Top regions with two or more mutated sites:')
    for n in range(highest, 1, -1):
        for chrom in mutated:
            hits = np.nonzero((mutated[chrom] == n) &
                              (variant_pos[chrom] == -1))[0]
            for idx in hits:
                print('%s:%d-%d\t%d samples' % (
                    chrom, idx * step + 1, idx * step + region_size, n))
def backup(rules_path, interactive):
    """Mirror local directories to remote hosts with lftp over SFTP.

    Each non-empty, non-comment line of the rules file has the form
    '<local directory> [user@]host:<remote directory>'. Rules whose local
    directory does not exist are skipped with a message. A password is
    requested once per host. A dry run is performed first for every rule
    and its output summarized as ADD / UPDATE / DELETE lines; the actual
    mirroring follows, after confirmation if `interactive` is true.

    Args:
        rules_path: Path of the rules file.
        interactive: If true, ask for confirmation after the dry run.
    """
    script_path = '.lftp_script'

    passwords = {}
    rules = []
    with open(rules_path) as rules_file:
        for line in rules_file:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            tokens = line.split()
            if len(tokens) != 2:
                error('Invalid rule: "%s"' % line)
            if ':' not in tokens[1]:
                error('Missing host: "%s"' % line)
            # Split at the first colon only, so that remote paths may
            # themselves contain colons.
            host, path = tokens[1].split(':', 1)
            username = getpass.getuser()
            if '@' in host:
                username, host = host.split('@')

            if not os.path.isdir(tokens[0]):
                print('Directory %s does not exist. Ignoring rule...' % tokens[0])
                continue

            rule = Object()
            rule.src_dir = tokens[0]
            rule.dst_host = host
            rule.dst_dir = path
            rule.username = username
            rule.password = passwords[host] if host in passwords else \
                getpass.getpass('Password for %s: ' % host)
            passwords[host] = rule.password
            rules.append(rule)

    # Dry-run output lines of lftp and the label each one is reported as.
    dry_run_actions = [
        (re.compile('get -O sftp://(.+) file:/.+/(.+)'), 'ADD'),
        (re.compile('get -e -O sftp://(.+) file:/.+/(.+)'), 'UPDATE'),
        (re.compile('rm .*sftp://(.+)'), 'DELETE'),
    ]

    def summarize(line, userpass, host):
        # Print a one-line summary for a recognized dry-run line. Returns
        # False if the line was not recognized.
        for pattern, label in dry_run_actions:
            m = pattern.match(line)
            if not m:
                continue
            dst = m.group(1)
            if dst.startswith(userpass):
                dst = dst[len(userpass):]
            if dst.startswith(host):
                dst = dst[len(host):]
            if m.lastindex == 2:
                print('%s %s/%s' % (label, dst, m.group(2)))
            else:
                print('%s %s' % (label, dst))
            return True
        return False

    def lftp_mirror(rule, dry_run=False):
        # The script contains the password in plain text, so it is created
        # readable by the owner only and removed even if lftp fails.
        fd = os.open(script_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, 'w') as cmds:
            cmds.write('open -u %s,%s sftp://%s\n' % (
                rule.username, rule.password, rule.dst_host))
            cmds.write('mirror -P3 -Rae %s %s %s\n' % (
                '--dry-run' if dry_run else '-v', rule.src_dir, rule.dst_dir))

        try:
            if dry_run:
                userpass = rule.username + ':' + rule.password + '@'
                host = rule.dst_host
                out = shell_stdout('lftp -f .lftp_script')
                for line in out:
                    if line.startswith('chmod'):
                        continue
                    if line.startswith('mkdir'):
                        continue
                    if not summarize(line, userpass, host):
                        sys.stdout.write(line)
            else:
                shell('lftp -f .lftp_script')
        finally:
            os.remove(script_path)

    for rule in rules:
        lftp_mirror(rule, dry_run=True)

    if interactive:
        # raw_input() only exists on Python 2; input() replaces it on 3.
        try:
            ask = raw_input
        except NameError:
            ask = input
        if ask('Proceed with backup? [y/N] ').lower() not in ('y', 'yes'):
            error('Backup canceled.')

    for rule in rules:
        lftp_mirror(rule)
def detect_rearrangements(sam_path, genome_path, out_prefix, anchor_len,
        min_mapq, orientation, max_frag_len, discard_duplicates='both-ends'):
    """Detect rearrangements from clusters of discordant evidence.

    Discordant read pairs (and, if `anchor_len` is positive, discordant
    split reads) are collected, sorted by the chromosome and position of
    their first endpoint, and clustered: evidence joins every open
    rearrangement with the same chromosomes and strands whose second
    endpoint lies within `max_frag_len`. Rearrangements that still have at
    least two supporting reads after duplicate removal are written to
    '<out_prefix>.sv'.

    Args:
        sam_path: Path of the alignment file.
        genome_path: Genome path used for the split read analysis.
        out_prefix: Prefix of all intermediate and output files.
        anchor_len: Split read anchor length; 0 disables that analysis.
        min_mapq: Minimum mapping quality for discordant pairs.
        orientation: Read pair orientation ('fr', 'rf' or 'ff').
        max_frag_len: Maximum fragment length.
        discard_duplicates: 'no', 'both-ends' or 'one-end'.
    """
    if not os.path.exists(sam_path):
        error('File %s does not exist.' % sam_path)

    if discard_duplicates not in ('no', 'both-ends', 'one-end'):
        error('Invalid duplicate discard method: %s' % discard_duplicates)

    detect_discordant_pairs(sam_path, out_prefix, max_frag_len=max_frag_len,
        min_mapq=min_mapq, orientation=orientation)

    # Execute split read analysis if the user has specified an anchor length.
    if anchor_len > 0:
        detect_discordant_reads(sam_path, genome_path, out_prefix, anchor_len)

    info('Sorting discordant pairs by chromosomal position...')
    sort_inputs = '<(gunzip -c %s.discordant_pairs.tsv.gz)' % out_prefix
    if anchor_len > 0:
        sort_inputs += ' <(gunzip -c %s.discordant_reads.tsv.gz)' % out_prefix

    sort_tmp_dir = os.path.dirname(out_prefix)
    if not sort_tmp_dir:
        sort_tmp_dir = './'

    shell('sort -k1,1 -k3,3n -T %s %s | gzip -c > %s.sorted_pairs.tsv.gz' %
        (sort_tmp_dir, sort_inputs, out_prefix))

    def report_rearrangement(out, r):
        # Write one rearrangement unless it has fewer than two supporting
        # reads after duplicate removal. Returns the number written.
        if discard_duplicates == 'both-ends':
            discard_duplicates_both_ends(r)
        elif discard_duplicates == 'one-end':
            discard_duplicates_one_end(r)

        if len(r.reads) < 2:
            return 0

        # Reads without a sequence are mate pairs; the others are split
        # reads, whose sequences are listed in the last column.
        split_seqs = [read[2] for read in r.reads if read[2] is not None]
        out.write('%s\t%s\t%d\t\t\t%s\t%s\t%d\t\t\t%d\t%d\t%s\n' % (
            r.chr, r.strand, r.pos, r.mchr, r.mstrand, r.mpos,
            len(r.reads) - len(split_seqs), len(split_seqs),
            ';'.join(split_seqs)))
        return 1

    info('Identifying rearrangements based on clusters of discordant reads...')

    N = 0
    # The with-statement flushes and closes the output file, which was
    # previously never closed.
    with open('%s.sv' % out_prefix, 'w') as out:
        out.write(sv_file_header + '\n')

        rearrangements = []
        for line in zopen('%s.sorted_pairs.tsv.gz' % out_prefix):
            al = line[:-1].split('\t')

            chr = al[0]
            strand = al[1]
            pos = int(al[2])
            mchr = al[3]
            mstrand = al[4]
            mpos = int(al[5])
            seq = None if al[6] == '-' else al[6]

            # Rearrangements that are too far need not be considered in
            # the future
            reachable = []
            for r in rearrangements:
                if pos - r.pos > max_frag_len:
                    N += report_rearrangement(out, r)
                else:
                    reachable.append(r)
            rearrangements = reachable

            # Check if we already have a rearrangement that matches the
            # new pair. We don't check the distance for the first mate
            # because we already know from above the rearrangements near it.
            matches = [r for r in rearrangements
                       if abs(mpos - r.mpos) <= max_frag_len and
                       chr == r.chr and mchr == r.mchr and
                       strand == r.strand and mstrand == r.mstrand]

            read = (pos, mpos, seq)
            if matches:
                for match in matches:
                    match.reads.append(read)
            else:
                # No suitable rearrangements, create a new one.
                rearrangements.append(
                    Rearrangement(chr, strand, pos, mchr, mstrand, mpos, read))

        for r in rearrangements:
            N += report_rearrangement(out, r)

    info('Found %d rearrangements with at least 2 reads of evidence.' % N)
def detect_discordant_pairs(sam_path, out_prefix, max_frag_len, min_mapq,
        orientation):
    """Write discordant mate pairs to '<out_prefix>.discordant_pairs.tsv.gz'.

    Alignments from the 'sam discordant pairs' command are sorted by read
    name, and consecutive alignments with the same name are treated as
    mates. Each pair is written as
    'chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, -' with the upstream
    mate first, strands converted to forward-forward orientation and
    positions referring to read starts.

    Args:
        sam_path: Path of the alignment file.
        out_prefix: Prefix of the output file; its directory is also used
            for temporary sort files.
        max_frag_len: Maximum fragment length passed to the sam command.
        min_mapq: Minimum mapping quality passed to the sam command.
        orientation: Read pair orientation: 'fr', 'rf' or 'ff'.
    """
    def flip(strand_symbol):
        return '-' if strand_symbol == '+' else '+'

    out = zopen(out_prefix + '.discordant_pairs.tsv.gz', 'w')
    N = 0

    sort_tmp_dir = os.path.dirname(out_prefix)
    if not sort_tmp_dir:
        sort_tmp_dir = './'

    # Go through all the first mates and look for discordant pairs.
    info('Searching for discordant read pairs...')
    prev = ['']
    for line in shell_stdout(
        'sam discordant pairs --min-mapq=%d %s %d | sort -k1,1 -T %s' %
        (min_mapq, sam_path, max_frag_len, sort_tmp_dir)):

        al = line.split('\t')
        # The read sequence (al[9]) is used below, so at least ten fields
        # are needed; the previous check for nine was off by one.
        if len(al) < 10:
            continue

        # Discard spliced and clipped reads.
        # FIXME: Add support for spliced RNA-seq reads.
        if 'N' in al[5] or 'S' in al[5]:
            continue

        if al[0].endswith('/1') or al[0].endswith('/2'):
            al[0] = al[0][:-2]    # Remove /1 or /2 suffix

        if al[0] != prev[0]:
            prev = al
            continue

        flags = int(al[1])
        chr = al[2]
        mchr = prev[2]
        strand = '-' if flags & 0x10 else '+'
        # The mate's strand comes from this alignment's flag 0x20.
        mstrand = '-' if flags & 0x20 else '+'
        pos = int(al[3])
        mpos = int(prev[3])
        rlen = len(al[9])
        mrlen = len(prev[9])

        if not chr.startswith('chr'):
            chr = 'chr' + chr
        if not mchr.startswith('chr'):
            mchr = 'chr' + mchr

        if chr == 'chrM' or mchr == 'chrM':
            continue    # Discard mitochondrial

        if orientation in ('fr', 'rf', 'ff'):
            # Reorient pairs so that the first mate is always upstream.
            if chr > mchr or (chr == mchr and pos > mpos):
                chr, mchr = mchr, chr
                pos, mpos = mpos, pos
                rlen, mrlen = mrlen, rlen
                if orientation == 'ff':
                    # If mates are swapped, both mates must be reversed.
                    strand, mstrand = flip(mstrand), flip(strand)
                else:
                    strand, mstrand = mstrand, strand

            # Convert to forward-forward orientation.
            if orientation == 'fr':
                mstrand = flip(mstrand)    # Flip second mate
            elif orientation == 'rf':
                strand = flip(strand)      # Flip first mate
        else:
            error('Unsupported read orientation detected.')

        # Make positions represent read starts.
        if strand == '-':
            pos += rlen - 1
        if mstrand == '-':
            mpos += mrlen - 1

        # Each discordant mate pair is written as seven columns
        # (chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, '-').
        # The '-' at the end signifies that this is a mate pair.
        # Positions are 1-based and represent read starts.
        out.write('%s\t%s\t%d\t%s\t%s\t%d\t-\n' %
            (chr, strand, pos, mchr, mstrand, mpos))
        N += 1

    out.close()
    info('Found %d discordant mate pairs.' % N)
def variant_call(bam_paths, genome_path, options):
    """Call genotypes from a multi-sample pileup and print them to stdout.

    Prints a header line followed by one tab-separated line per candidate
    allele: chromosome, position, reference allele, alternate allele and,
    for each sample, ``genotype:allele_reads:total_reads``.

    Args:
        bam_paths: Paths of the BAM files; sample names are the file names
            with '.bam' removed.
        genome_path: Path of the genome FASTA file; must exist.
        options: Object providing the attributes `region`, `ignore_mapq`
            (a regular expression matched against sample names, or a false
            value), `min_mapq`, `keep_all` and `min_hetz_reads`. It is also
            passed on to call_genotypes().
    """
    if not os.path.exists(genome_path):
        error('Could not find genome FASTA file %s.' % genome_path)

    # Region queries need an index file next to every BAM.
    if options.region:
        for bam_path in bam_paths:
            if not os.path.exists(bam_path + '.bai'):
                error('No index found for BAM file %s.' % bam_path)

    samples = [os.path.basename(p).replace('.bam', '') for p in bam_paths]
    print('CHROM\tPOSITION\tREF\tALT\t%s' % '\t'.join(samples))

    ignore_mapq = [False] * len(samples)
    if options.ignore_mapq:
        for s, sample in enumerate(samples):
            if re.search(options.ignore_mapq, sample) is not None:
                ignore_mapq[s] = True
                info('Ignoring mapping quality for sample %s.' % sample)

    # NOTE(review): calculate_BAF() passes an extra kgenomes_path argument
    # to simple_pileup(); confirm that this call matches the helper's
    # current signature.
    for line in simple_pileup(bam_paths, genome_path,
                              min_mapq=options.min_mapq,
                              min_alt_alleles=(0 if options.keep_all
                                               else options.min_hetz_reads),
                              region=options.region):
        # Accept both text and byte lines, as calculate_BAF() does.
        if isinstance(line, bytes):
            line = line.decode('utf8')

        tokens = line[:-1].split('\t')
        if len(tokens) < 3:
            error('Invalid spileup line:\n%s' % line)
        if tokens[2] == 'N':
            continue

        # Columns 3+ hold one space-separated pileup per sample, read as
        # consecutive (allele, count, extra_count) triples. The extra count
        # is only added for samples whose mapping quality is ignored
        # (presumably it counts reads below the quality threshold).
        pileups = [p.split(' ') for p in tokens[3:]]

        total_reads = [0] * len(samples)
        allele_reads = defaultdict(lambda: [0] * len(samples))

        for s, pileup in enumerate(pileups):
            if len(pileup) < 3:
                continue
            for a in range(0, len(pileup), 3):
                count = int(pileup[a + 1])
                if ignore_mapq[s]:
                    count += int(pileup[a + 2])
                total_reads[s] += count
                # '.' is not tallied as an alternate allele.
                if pileup[a] != '.':
                    allele_reads[pileup[a]][s] = count

        # Call genotypes for each allele.
        # dict.items() works on Python 2 and 3; iteritems() is Python 2 only.
        for alt, reads in allele_reads.items():
            genotypes = call_genotypes(reads, total_reads, options)
            if not options.keep_all and all(gt < 2 for gt in genotypes):
                continue

            gtypes = ['%s:%d:%d' % (gt_symbols[g], reads[s], total_reads[s])
                      for s, g in enumerate(genotypes)]

            # Reformat indels in VCF4 format
            ref = tokens[2]
            if len(alt) >= 2:
                if alt[1] == '+':     # Insertion
                    alt = (ref if alt[0] == '.' else alt[0]) + alt[2:]
                elif alt[1] == '-':   # Deletion
                    ref += alt[2:]
                    alt = (ref[0] if alt[0] == '.' else alt[0])

            print('%s\t%s\t%s\t%s\t%s' % (tokens[0], tokens[1], ref,
                                          alt.upper(), '\t'.join(gtypes)))
def calculate_BAF(bam_paths, genome_path, kgenomes_path, options):
    """Print the tumor B-allele fraction at heterozygous sites of the normal.

    The first BAM is treated as the tumor sample and the second as the
    normal sample. For every candidate allele that is called heterozygous
    (genotype code 2) in the normal sample with at least 15 total reads, one
    tab-separated line is written to stdout: chromosome, position, reference
    allele, alternate allele, and the fraction of tumor reads supporting the
    allele ('NaN' when the tumor has no reads at the site).

    Args:
        bam_paths: Paths of the BAM files, tumor first and normal second.
        genome_path: Path of the genome FASTA file; must exist.
        kgenomes_path: Passed through to simple_pileup().
        options: Object providing the attributes `region`, `ignore_mapq`
            (a regular expression matched against sample names, or a false
            value), `min_mapq`, `keep_all` and `min_hetz_reads`. It is also
            passed on to call_genotypes().
    """
    tumor, normal = 0, 1        # Sample indices within bam_paths
    heterozygous = 2            # Genotype code tested for the normal sample
    min_normal_reads = 15       # Minimum depth required in the normal sample

    if not os.path.exists(genome_path):
        error('Could not find genome FASTA file %s.' % genome_path)

    # The normal sample is read from index 1 below; fail with a clear
    # message instead of an IndexError at the first candidate allele.
    if len(bam_paths) < 2:
        error('A tumor BAM file followed by a normal BAM file is required.')

    # Region queries need an index file next to every BAM.
    if options.region:
        for bam_path in bam_paths:
            if not os.path.exists(bam_path + '.bai'):
                error('No index found for BAM file %s.' % bam_path)

    samples = [os.path.basename(p).replace('.bam', '') for p in bam_paths]
    # Only the first (tumor) sample gets an output column.
    print('CHROM\tPOSITION\tREF\tALT\t%s' % samples[0])

    ignore_mapq = [False] * len(samples)
    if options.ignore_mapq:
        for s, sample in enumerate(samples):
            if re.search(options.ignore_mapq, sample) is not None:
                ignore_mapq[s] = True
                info('Ignoring mapping quality for sample %s.' % sample)

    for line in simple_pileup(bam_paths, genome_path, kgenomes_path,
                              min_mapq=options.min_mapq,
                              min_alt_alleles=(0 if options.keep_all
                                               else options.min_hetz_reads),
                              region=options.region):
        if isinstance(line, bytes):
            line = line.decode('utf8')

        tokens = line[:-1].split('\t')
        if len(tokens) < 3:
            error('Invalid spileup line:\n%s' % line)
        if tokens[2] == 'N':
            continue

        # Columns 3+ hold one space-separated pileup per sample, read as
        # consecutive (allele, count, extra_count) triples. The extra count
        # is only added for samples whose mapping quality is ignored
        # (presumably it counts reads below the quality threshold).
        pileups = [p.split(' ') for p in tokens[3:]]

        total_reads = [0] * len(samples)
        allele_reads = defaultdict(lambda: [0] * len(samples))

        for s, pileup in enumerate(pileups):
            if len(pileup) < 3:
                continue
            for a in range(0, len(pileup), 3):
                count = int(pileup[a + 1])
                if ignore_mapq[s]:
                    count += int(pileup[a + 2])
                total_reads[s] += count
                # '.' is not tallied as an alternate allele.
                if pileup[a] != '.':
                    allele_reads[pileup[a]][s] = count

        # Call genotypes for each allele.
        for alt, reads in allele_reads.items():
            genotypes = call_genotypes(reads, total_reads, options)

            # Keep only sites where the normal sample is heterozygous and
            # sufficiently covered.
            if genotypes[normal] != heterozygous:
                continue
            if total_reads[normal] < min_normal_reads:
                continue

            # Reformat indels in VCF4 format
            ref = tokens[2]
            if len(alt) >= 2:
                if alt[1] == '+':     # Insertion
                    alt = (ref if alt[0] == '.' else alt[0]) + alt[2:]
                elif alt[1] == '-':   # Deletion
                    ref += alt[2:]
                    alt = (ref[0] if alt[0] == '.' else alt[0])

            # B-allele fraction of the tumor sample: reads supporting the
            # allele divided by all tumor reads at the site.
            tumor_alt_reads = reads[tumor]
            tumor_total_reads = total_reads[tumor]
            if tumor_total_reads == 0:
                baf = 'NaN'
            else:
                baf = '%.2f' % (float(tumor_alt_reads) / tumor_total_reads)

            sys.stdout.write('%s\t%s\n' % (
                '\t'.join([tokens[0], tokens[1], ref, alt.upper()]), baf))
elif args['reads'] and args['--raw']: sam_reads_raw(args['<bam_file>'], args['<out_prefix>']) elif args['reads']: sam_reads(args['<bam_file>'], args['<out_prefix>']) elif args['compact']: sam_compact(args['<bam_file>']) elif args['discordant'] and args['pairs']: sam_discordant_pairs(args['<bam_file>'], int(args['<max_frag_size>']), orientation=args['--orientation'], min_mapq=int(args['--min-mapq'])) elif args['fragments']: sam_fragments(args['<bam_file>'], int(args['<max_frag_len>'])) elif args['read'] and args['length']: read_len = read_length(args['<bam_file>']) if not read_len: error('Could not determine read length.') else: print('%d' % read_len) elif args['pileup'] and args['each']: sam_pileup_each(args['<vcf_file>'], args['<bam_files>'], min_al_quality=int(args['--min-mapq'])) elif args['pileup']: sam_pileup(args['<region>'], args['<bam_files>'], min_al_quality=int(args['--min-mapq'])) elif args['count']: sam_count(args['<bam_file>'], args['<bed_file>']) elif args['counts'] and args['merge']: sam_merge_counts(args['<bed_file>'], args['<count_files>']) elif args['fragment'] and args['lengths']: sam_fragment_lengths(args['<bam_file>'])