def variant_annotate(vcf_path, genome='~/tools/annovar-2016-02-01/humandb/hg38'):
    format_annovar(vcf_path, 'anno_tmp.vcf')
    humandb_dir, genome_version = os.path.split(genome)
    shell('table_annovar.pl anno_tmp.vcf %s -buildver %s --remove --otherinfo '
        '--outfile annotated -operation g,f,f,f '
        '-protocol refGene,cosmic70,1000g2014oct_all,exac03' %
        (humandb_dir, genome_version))

    anno = open('annotated.%s_multianno.txt' % genome_version)
    out = zopen('annotated.vcf.gz', 'w')
    next(anno)          # Discard the first line of the ANNOVAR output.
    line = next(anno)   # The second line carries the column headers.
    headers = ['CHROM', 'POSITION', 'REF', 'ALT', 'FUNCTION', 'GENE',
        'EXONIC_FUNCTION', 'AA_CHANGE', 'COSMIC', '1000G', 'EXAC']
    headers += line.rstrip('\n').split('\t')[20:]
    out.write('\t'.join(headers) + '\n')
    for line in anno:
        c = line.rstrip('\n').split('\t')
        out.write('\t'.join(c[0:2] + c[3:7] + c[8:13] + c[20:]))
        out.write('\n')
    out.close()

    # Clean up temporary files. The .invalid_input files are only kept
    # if they contain more than a header line.
    os.remove('anno_tmp.vcf')
    os.remove('annotated.%s_multianno.txt' % genome_version)
    if num_lines('annotated.invalid_input') <= 1:
        os.remove('annotated.invalid_input')
    if num_lines('annotated.refGene.invalid_input') <= 1:
        os.remove('annotated.refGene.invalid_input')

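# The zopen() and num_lines() helpers used above are not defined in this
# section. A minimal sketch of their assumed semantics (gzip-aware file
# opening and line counting); on Python 3, text mode is requested so that
# gzip yields str instead of bytes:
import gzip

def zopen(path, mode='r'):
    # Assumed helper: open a file, transparently handling gzip
    # compression based on a .gz suffix.
    if path.endswith('.gz'):
        return gzip.open(path, mode + 't')
    return open(path, mode)

def num_lines(path):
    # Assumed helper: count the number of lines in a file.
    with zopen(path) as f:
        return sum(1 for _ in f)
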
def sam_count(bam_path, bed_path, genome_path='~/organisms/homo_sapiens/hg19.chrom.sizes'):
    # bedtools appends the read count as one extra column after the
    # existing BED columns, so print only that last column.
    bed_cols = len(next(open(bed_path)).split('\t'))
    shell('bedtools coverage -split -sorted -counts -g %s -a %s -b %s | cut -f%d' %
        (genome_path, bed_path, bam_path, bed_cols + 1))

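# The -g genome file above is a two-column chromosome name/length table,
# which bedtools needs together with -sorted to verify that both inputs
# follow the same chromosome order. If the file is missing, one common way
# to derive it from a FASTA index; this helper is not part of the original
# code and its name and paths are illustrative:
def make_chrom_sizes(fasta_path):
    # Index the FASTA, then keep the first two columns (name, length)
    # of the .fai index.
    shell('samtools faidx %s' % fasta_path)
    shell('cut -f1,2 %s.fai > %s.chrom.sizes' %
        (fasta_path, os.path.splitext(fasta_path)[0]))
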
def swiss_download_sra(sra_study):
    if not sra_study.startswith('SRP'):
        error('SRA study identifier must begin with "SRP".')
    shell('/data/csb/tools/ncftp-3.2.5/bin/ncftpget -R -v '
        'ftp-trace.ncbi.nlm.nih.gov ./ '
        '/sra/sra-instant/reads/ByStudy/sra/SRP/%s/%s' %
        (sra_study[:6], sra_study))

def lftp_mirror(rule, dry_run=False):
    cmds = open('.lftp_script', 'w')
    cmds.write('open -u %s,%s sftp://%s\n' % (
        rule.username, rule.password, rule.dst_host))
    cmds.write('mirror -P3 -Rae %s %s %s\n' % (
        '--dry-run' if dry_run else '-v', rule.src_dir, rule.dst_dir))
    cmds.close()

    if dry_run:
        userpass = rule.username + ':' + rule.password + '@'
        host = rule.dst_host

        def strip_prefixes(dst):
            # Remove the user:password@ and host prefixes from paths
            # echoed by lftp.
            if dst.startswith(userpass): dst = dst[len(userpass):]
            if dst.startswith(host): dst = dst[len(host):]
            return dst

        out = shell_stdout('lftp -f .lftp_script')
        for line in out:
            if line.startswith('chmod'): continue
            if line.startswith('mkdir'): continue
            m = re.match('get -O sftp://(.+) file:/.+/(.+)', line)
            if m:
                print('ADD %s/%s' % (strip_prefixes(m.group(1)), m.group(2)))
                continue
            m = re.match('get -e -O sftp://(.+) file:/.+/(.+)', line)
            if m:
                print('UPDATE %s/%s' % (strip_prefixes(m.group(1)), m.group(2)))
                continue
            m = re.match('rm .*sftp://(.+)', line)
            if m:
                print('DELETE %s' % strip_prefixes(m.group(1)))
                continue
            sys.stdout.write(line)
    else:
        shell('lftp -f .lftp_script')
    os.remove('.lftp_script')

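# lftp_mirror() only assumes that rule carries the five attributes
# referenced above; the type itself is not defined in this section. A
# namedtuple is enough (a sketch; the name MirrorRule and the example
# values are illustrative):
import collections

MirrorRule = collections.namedtuple('MirrorRule',
    'username password dst_host src_dir dst_dir')

# Example usage (values are illustrative):
#   rule = MirrorRule('alice', 'secret', 'backup.example.com',
#       '/local/data', '/remote/data')
#   lftp_mirror(rule, dry_run=True)
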
def sam_compact(bam_path):
    # Print the header as-is, then shorten verbose read IDs into small
    # integer IDs and discard per-base qualities.
    shell('samtools view -H %s' % bam_path)
    short_id = {}
    id_counter = 1
    for al in read_sam(bam_path, 'D'):
        rid = al[0]
        if rid[-2] == '/': rid = rid[:-2]   # Strip /1 and /2 mate suffixes.
        new_id = short_id.get(rid)
        if new_id:
            # Second mate seen; the pair is complete, so forget the ID.
            del short_id[rid]
        else:
            new_id = str(id_counter)
            id_counter += 1
            short_id[rid] = new_id
        al[0] = new_id
        al[10] = '*'    # Discard base qualities.
        sys.stdout.write('\t'.join(al))

def parallel_worker(log_dir):
    with open('%s/tasks' % log_dir) as f:
        command = next(f).strip()
        targets = [target.strip() for target in f]
    for target in targets:
        # Claim the target by creating its log file. If another worker
        # has already claimed it, move on to the next target.
        out = open_exclusive('%s/%s.out' % (log_dir, sanitize_path(target)))
        if not out: continue
        cmd_with_target = 'export x=%s; %s' % (target, command)
        out.write('%s\n%s\n' % (cmd_with_target, '-'*80))
        out.flush()
        start_time = datetime.datetime.now()
        shell(cmd_with_target, stdout=out, stderr=out)
        end_time = datetime.datetime.now()
        out.write('%s\nJOB FINISHED. ELAPSED TIME WAS %s.\n' %
            ('-'*80, end_time - start_time))
        out.close()

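# The claim-by-file-creation trick above relies on open_exclusive()
# returning None when the log file already exists. Neither helper is
# defined in this section; a minimal sketch of the assumed semantics:
def open_exclusive(path):
    # Assumed helper: atomically create the file, or return None if it
    # already exists (i.e. another worker claimed the target first).
    try:
        fd = os.open(path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
    except OSError:
        return None
    return os.fdopen(fd, 'w')

def sanitize_path(s):
    # Assumed helper: make a target string safe to use as a filename.
    return re.sub(r'[^\w.-]', '_', s)
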
def cghub_download(samples):
    for sample in samples:
        # Don't redownload files that are already present.
        existing = {}
        for root, dirnames, filenames in os.walk('.'):
            for f in filenames:
                path = os.path.join(root, f)
                existing[f] = os.stat(path).st_size
        filename = sample.files[0]
        filesize = sample.filesizes[0]
        if filename in existing and existing[filename] == filesize:
            info('%s has already been downloaded...' % filename)
            continue
        info('Downloading %s...' % filename)
        shell('gtdownload -v -d %s -c ~/tools/genetorrent*/cghub_2016.key' %
            sample.analysis_data_uri)

def sam_unaligned_reads(bam_path):
    # The "samtools bam2fq" command does not output supplementary or
    # secondary alignments. Each read has at most one primary alignment.
    options = '-n' if has_mate_suffixes(bam_path) else ''
    if has_base_qualities(bam_path):
        shell('samtools view -u -f 0x4 -F 0x900 %s | samtools bam2fq %s -' %
            (bam_path, options))
    else:
        # No base qualities, so convert the FASTQ output into FASTA.
        bam2fq = shell_stdout(
            'samtools view -u -f 0x4 -F 0x900 %s | samtools bam2fq %s -' %
            (bam_path, options))
        for line in bam2fq:
            if line[0] != '@': error('Invalid bam2fq output.')
            sys.stdout.write('>')
            sys.stdout.write(line[1:])
            sys.stdout.write(next(bam2fq))
            # Skip the separator and per-base quality lines. Qualities
            # can start with '@', so they cannot mark record boundaries.
            next(bam2fq)
            next(bam2fq)

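# Both predicates above only need to peek at the first alignment of the
# BAM file; neither is defined in this section. A sketch of the assumed
# behavior, reusing the shell_stdout() helper:
def has_base_qualities(bam_path):
    # Assumed helper: BAMs lacking base qualities store '*' in the QUAL
    # column (column 11 of the SAM format).
    for line in shell_stdout('samtools view %s' % bam_path):
        return line.rstrip('\n').split('\t')[10] != '*'
    return False

def has_mate_suffixes(bam_path):
    # Assumed helper: check whether read identifiers carry /1 and /2
    # style mate suffixes.
    for line in shell_stdout('samtools view %s' % bam_path):
        return line.split('\t')[0][-2:] in ('/1', '/2')
    return False
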
def fasta_from_sra(sra_path):
    shell('~/tools/sratoolkit*/fastq-dump --split-3 --gzip %s' % sra_path)

def visualize_splicing(genes, fastq_prefix, out_prefix):
    genome_path = '/data/csb/organisms/homo_sapiens/hg19_flat'
    bed_path = '/data/csb/organisms/homo_sapiens/ensembl_68/exons.bed'
    genes = genes.replace(' ', '').split(',')
    min_anchor = 15
    read_len = 90
    trim = read_len - min_anchor

    chromosomes = read_flat_seq(genome_path)

    # Collect donor and acceptor flanks for every exon of the genes.
    donors = []
    acceptors = []
    exons = []
    for line in zopen(bed_path):
        cols = line[:-1].split('\t')
        if cols[3] not in genes: continue
        chr = cols[0] if cols[0].startswith('chr') else 'chr' + cols[0]
        chr_seq = chromosomes[chr]
        pos = (int(cols[1]) + 1, int(cols[2]))
        if cols[5] == '+':
            acceptors.append((chr, '+', pos[0],
                chr_seq[pos[0]-1:pos[0]-1+trim]))
            donors.append((chr, '+', pos[1], chr_seq[pos[1]-trim:pos[1]]))
        elif cols[5] == '-':
            acceptors.append((chr, '-', pos[1],
                revcomplement(chr_seq[pos[1]-trim:pos[1]])))
            donors.append((chr, '-', pos[0],
                revcomplement(chr_seq[pos[0]-1:pos[0]-1+trim])))
        exons.append(pos)

    # Remove duplicate acceptors, donors and exons.
    acceptors = list(set(acceptors))
    donors = list(set(donors))
    exons = list(set(exons))

    # Calculate the contiguous genomic window covering all breakpoints.
    chr = acceptors[0][0]
    if any(a[0] != chr for a in acceptors):
        error('Genes must be in the same chromosome!')
    genome_window = (min(a[2] for a in acceptors) - 2000,
        max(a[2] for a in acceptors) + 2000)
    #contig = chromosomes[chr][genome_window[0]:genome_window[1]]

    # Calculate junction sequences.
    class Junction(object):
        def __init__(self, name, seq):
            self.name = name
            self.sequence = seq
            self.reads = 0
            self.ratio = 0

    junctions = defaultdict(list)   # Group junctions by donor.
    for left in donors:
        for right in acceptors:
            name = '%d[%s]_%d[%s]' % (left[2], left[1], right[2], right[1])
            junctions[left].append(Junction(name, left[3] + right[3]))
    print('Generated %d junctions.' % (len(donors) * len(acceptors)))

    # Build the Bowtie index.
    index_fasta_path = '%s_ref.fa' % out_prefix
    index = open(index_fasta_path, 'w')
    #index.write('>contig\n%s\n' % contig)
    for donor in junctions:
        for junc in junctions[donor]:
            index.write('>%s\n%s\n' % (junc.name, junc.sequence))
    index.close()
    shell('/data/csb/tools/bowtie-0.12.9/bowtie-build -q %s %s_index' %
        (index_fasta_path, out_prefix))

    # Align reads against junctions and tally junction read counts.
    shell('bowtie -v1 -B1 -p8 %s_index <(gunzip -c %s_1.fq.gz %s_2.fq.gz) '
        '> %s.bowtie' % (out_prefix, fastq_prefix, fastq_prefix, out_prefix))
    junction_by_name = {}
    for donor in junctions:
        for j in junctions[donor]:
            junction_by_name[j.name] = j
    for line in open('%s.bowtie' % out_prefix):
        cols = line[:-1].split('\t')
        if not '_' in cols[2]: continue
        junction_by_name[cols[2]].reads += 1

    # Calculate junction power relative to all outgoing links from the donor.
    for donor in junctions:
        total = sum(j.reads for j in junctions[donor])
        if total <= 0: continue
        for j in junctions[donor]:
            j.ratio = float(j.reads) / total
            if j.reads > 0:
                print('%s: %.1f%% (%d)' % (j.name, j.ratio * 100, j.reads))

    # Check which exons actually participate in the mature transcripts.
    active_edges = []
    for donor in junctions:
        for j in junctions[donor]:
            if j.ratio < 0.05: continue
            active_edges += [int(x[:-3]) for x in j.name.split('_')]
    exons = [[ex[0], ex[1], False] for ex in exons]
    ties = []
    for edge in set(active_edges):
        matches = [ex for ex in exons if edge in ex]
        if len(matches) == 1:
            matches[0][2] = True    # Unique match, mark active.
        if len(matches) > 1:
            ties.append(matches)
    for tie in ties:
        if not any(ex[2] for ex in tie):
            for ex in tie:
                ex[2] = True    # If still tied, mark all tied exons active.

    # Print the exon map as an SVG image.
    from svgfig import Rect, Frame, Poly
    rects = [Rect(ex[0], 1, ex[1], 2, stroke='none', fill='whitesmoke',
        stroke_linejoin='miter') for ex in exons if not ex[2]]
    rects += [Rect(ex[0], 1, ex[1], 2, stroke='none', fill='black',
        stroke_linejoin='miter') for ex in exons if ex[2]]
    lines = []
    for donor in junctions:
        for j in junctions[donor]:
            start, end = [int(x[:-3]) for x in j.name.split('_')]
            lines.append(Poly([(start, 2), ((start+end)/2, 3), (end, 2)],
                stroke_opacity=j.ratio))
    Frame(genome_window[0], genome_window[1], 0, 10, *(rects + lines),
        width=500).SVG().save('%s.svg' % out_prefix)
    shell('rm %s_index.* %s_ref.fa' % (out_prefix, out_prefix))

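# read_flat_seq() and revcomplement() come from elsewhere in the codebase;
# the latter is simple enough to sketch here under that assumption:
def revcomplement(seq):
    # Assumed helper: reverse complement of a DNA sequence. Unrecognized
    # characters become 'N'.
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
    return ''.join(complement.get(base, 'N')
        for base in reversed(seq.upper()))
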
def detect_specific(bam_path, donors_path, acceptors_path, genome_path,
        out_prefix, all_reads):
    read_len = sam.read_length(bam_path)
    info('Using read length %d bp...' % read_len)
    flank_len = read_len - 10
    chromosomes = read_fasta(genome_path)

    donor_exons = regions_from_bed(donors_path)
    donors = []
    for ex in donor_exons:
        chr = ex[0] if ex[0].startswith('chr') else 'chr' + ex[0]
        chr_seq = chromosomes[chr]
        if ex[1] == '+':
            donors.append((chr, '+', ex[3], chr_seq[ex[3]-flank_len:ex[3]]))
        elif ex[1] == '-':
            donors.append((chr, '-', ex[2],
                revcomplement(chr_seq[ex[2]-1:ex[2]-1+flank_len])))

    acceptor_exons = regions_from_bed(acceptors_path)
    acceptors = []
    for ex in acceptor_exons:
        chr = ex[0] if ex[0].startswith('chr') else 'chr' + ex[0]
        chr_seq = chromosomes[chr]
        if ex[1] == '+':
            acceptors.append((chr, '+', ex[2],
                chr_seq[ex[2]-1:ex[2]-1+flank_len]))
        elif ex[1] == '-':
            acceptors.append((chr, '-', ex[3],
                revcomplement(chr_seq[ex[3]-flank_len:ex[3]])))

    del chromosomes    # Release 3 GB of memory.
    gc.collect()

    # Remove duplicate acceptors and donors.
    acceptors = list(set(acceptors))
    donors = list(set(donors))

    # Calculate junction sequences.
    junctions = {}
    for left in donors:
        for right in acceptors:
            name = '%s:%s:%d_%s:%s:%d' % (left[:3] + right[:3])
            junctions[name] = Object(sequence=left[3] + right[3], reads=[])
    info('Generated %d junctions.' % len(junctions))

    # Build the Bowtie index.
    info('Constructing junction FASTA file...')
    index_fasta_path = out_prefix + '_ref.fa'
    index = open(index_fasta_path, 'w')
    for name, junction in junctions.items():
        index.write('>%s\n%s\n' % (name, junction.sequence))
    index.close()
    info('Constructing Bowtie index...')
    shell('bowtie-build -q %s %s_index' % (index_fasta_path, out_prefix))

    # Align reads against junctions and tally junction read counts.
    if all_reads:
        info('Aligning all reads against index...')
        reads_command = 'sam reads %s' % bam_path
    else:
        info('Aligning unaligned reads against index...')
        reads_command = 'sam unaligned reads %s' % bam_path
    for line in shell_stdout('bowtie -f -v1 -B1 %s_index <(%s)' %
            (out_prefix, reads_command)):
        cols = line.rstrip().split('\t')
        junctions[cols[2]].reads.append(cols[4])
    shell('rm %s_index.* %s_ref.fa' % (out_prefix, out_prefix))

    out_file = open(out_prefix + '.tsv', 'w')
    out_file.write("5' breakpoint\t3' breakpoint\tNum reads\tSequences\n")
    for name, j in junctions.items():
        if not j.reads: continue
        flanks = name.split('_')
        out_file.write('%s\t%s\t%d\t' % (flanks[0], flanks[1], len(j.reads)))
        #out_file.write(';'.join(j.reads))
        out_file.write('\n')
    out_file.close()

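# Object(...) above acts as a plain attribute bag. It is assumed to be
# defined elsewhere in the codebase, roughly as:
class Object(object):
    # Assumed helper: a generic attribute container, so that
    # Object(sequence=..., reads=[]) works as used above.
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)
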
def detect_rearrangements(sam_path, genome_path, out_prefix, anchor_len,
        min_mapq, orientation, max_frag_len, discard_duplicates='both-ends'):
    if not os.path.exists(sam_path):
        error('File %s does not exist.' % sam_path)
    if discard_duplicates not in ('no', 'both-ends', 'one-end'):
        error('Invalid duplicate discard method: %s' % discard_duplicates)

    detect_discordant_pairs(sam_path, out_prefix, max_frag_len=max_frag_len,
        min_mapq=min_mapq, orientation=orientation)

    # Execute split read analysis if the user has specified an anchor length.
    if anchor_len > 0:
        detect_discordant_reads(sam_path, genome_path, out_prefix, anchor_len)

    info('Sorting discordant pairs by chromosomal position...')
    sort_inputs = '<(gunzip -c %s.discordant_pairs.tsv.gz)' % out_prefix
    if anchor_len > 0:
        sort_inputs += ' <(gunzip -c %s.discordant_reads.tsv.gz)' % out_prefix
    sort_tmp_dir = os.path.dirname(out_prefix)
    if not sort_tmp_dir: sort_tmp_dir = './'
    shell('sort -k1,1 -k3,3n -T %s %s | gzip -c > %s.sorted_pairs.tsv.gz' %
        (sort_tmp_dir, sort_inputs, out_prefix))

    def report_rearrangement(out, r):
        if discard_duplicates == 'both-ends':
            discard_duplicates_both_ends(r)
        elif discard_duplicates == 'one-end':
            discard_duplicates_one_end(r)
        if len(r.reads) < 2: return 0
        out.write('%s\t%s\t%d\t\t\t%s\t%s\t%d\t\t\t%d\t%d\t%s\n' % (
            r.chr, r.strand, r.pos, r.mchr, r.mstrand, r.mpos,
            sum(read[2] is None for read in r.reads),
            sum(read[2] is not None for read in r.reads),
            ';'.join(read[2] for read in r.reads if read[2] is not None)))
        return 1

    info('Identifying rearrangements based on clusters of discordant reads...')
    out = open('%s.sv' % out_prefix, 'w')
    out.write(sv_file_header + '\n')
    N = 0
    rearrangements = []
    for line in zopen('%s.sorted_pairs.tsv.gz' % out_prefix):
        al = line[:-1].split('\t')
        chr = al[0]
        strand = al[1]
        pos = int(al[2])
        mchr = al[3]
        mstrand = al[4]
        mpos = int(al[5])
        seq = None if al[6] == '-' else al[6]

        # Rearrangements that are too far away need not be considered anymore.
        reachable = []
        for r in rearrangements:
            if pos - r.pos > max_frag_len:
                N += report_rearrangement(out, r)
            else:
                reachable.append(r)
        rearrangements = reachable

        # Check if we already have a rearrangement that matches the new pair.
        # We don't check the distance for the first mate, because we already
        # know from above that the remaining rearrangements are near it.
        matches = [r for r in rearrangements if
            abs(mpos - r.mpos) <= max_frag_len and chr == r.chr and
            mchr == r.mchr and strand == r.strand and mstrand == r.mstrand]

        read = (pos, mpos, seq)
        if matches:
            for match in matches:
                match.reads.append(read)
        else:
            # No suitable rearrangements, create a new one.
            rearrangements.append(
                Rearrangement(chr, strand, pos, mchr, mstrand, mpos, read))

    for r in rearrangements:
        N += report_rearrangement(out, r)
    info('Found %d rearrangements with at least 2 reads of evidence.' % N)

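# The Rearrangement class is not defined in this section. From its use
# above, a cluster holds the coordinates of both breakpoint sides plus a
# growing list of supporting reads. A minimal sketch consistent with that
# usage:
class Rearrangement(object):
    # Assumed shape: a cluster of discordant evidence anchored at
    # (chr, strand, pos) and (mchr, mstrand, mpos), seeded with one read.
    def __init__(self, chr, strand, pos, mchr, mstrand, mpos, read):
        self.chr = chr
        self.strand = strand
        self.pos = pos
        self.mchr = mchr
        self.mstrand = mstrand
        self.mpos = mpos
        self.reads = [read]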