def sort(file_name, sorted_prefix=None):
    """Sort the BAM file `file_name` with ``samtools sort``.

    The output is written to ``<sorted_prefix>.bam``.  When `sorted_prefix`
    is not given it is derived from `file_name` by dropping every ``.bam``
    occurrence and appending ``_sorted``.

    Note: only the sort is performed here; no index is built by this
    function.
    """
    prefix = sorted_prefix
    if prefix is None:
        prefix = file_name.replace('.bam', '') + '_sorted'

    command = ['samtools', 'sort', '-o', prefix + ".bam", file_name]
    log_subprocess.check_call(command)
def output_as_tsv(vcf_path, out_path, output_gt_info=False):
    """Dump the contents of a VCF file as one big TSV using ``vcf2tsv``.

    Args:
        vcf_path: path of the VCF file to convert.
        out_path: path of the TSV file to write (overwritten if present).
        output_gt_info: when true, pass ``-g`` to ``vcf2tsv``.

    The output file is closed even if ``vcf2tsv`` fails; any exception from
    ``log_subprocess.check_call`` is propagated.
    """
    args = ['vcf2tsv', vcf_path]
    if output_gt_info:
        args.append('-g')

    # The context manager guarantees the handle is released on failure too.
    with open(out_path, 'w') as out_file:
        log_subprocess.check_call(args, stdout=out_file)
def output_setdiff_vcfs(vcf1_path, vcf2_path, genome, out_path):
    """Write a VCF with the variants in the first VCF but not the second.

    Runs ``vcfintersect <vcf1> -i <vcf2> -v -r <ref>`` and redirects its
    stdout to `out_path`.

    Args:
        vcf1_path: VCF whose variants are kept.
        vcf2_path: VCF whose variants are subtracted.
        genome: genome name; the reference FASTA is looked up at
            ``FASTA_LOCATION + genome + '/' + genome + '.fa'``.
        out_path: path of the VCF to write (overwritten if present).

    The output file is closed even if ``vcfintersect`` fails; the exception
    from ``log_subprocess.check_call`` is propagated.
    """
    ref_path = FASTA_LOCATION + genome + '/' + genome + '.fa'
    args = ['vcfintersect', vcf1_path, '-i', vcf2_path, '-v', '-r', ref_path]

    with open(out_path, 'w') as out_file:
        log_subprocess.check_call(args, stdout=out_file)
def output_intersect_vcfs(vcf1_path, vcf2_path, genome, out_path):
    """Write a VCF that is the intersection of the two given VCFs.

    Runs ``vcfintersect <vcf1> -i <vcf2> -r <ref>`` and redirects its stdout
    to `out_path`.

    Args:
        vcf1_path: first input VCF.
        vcf2_path: second input VCF.
        genome: genome name; the reference FASTA is looked up at
            ``FASTA_LOCATION + genome + '/' + genome + '.fa'``.
        out_path: path of the VCF to write (overwritten if present).

    The output file is closed even if ``vcfintersect`` fails; the exception
    from ``log_subprocess.check_call`` is propagated.
    """
    ref_path = FASTA_LOCATION + genome + '/' + genome + '.fa'
    args = ['vcfintersect', vcf1_path, '-i', vcf2_path, '-r', ref_path]

    with open(out_path, 'w') as out_file:
        log_subprocess.check_call(args, stdout=out_file)
def sort_tabix_gtf(input_gtf, output_gtf):
    """Sort, block-gzip and tabix-index a GTF file.

    Tabix needs its input sorted and bgzipped, so the GTF is first streamed
    through ``sort -k1,1 -k4,5n | bgzip`` into `output_gtf` and then indexed
    with ``tabix -p gff``.  Inputs whose name ends in ``.gz`` are read with
    ``zcat``, everything else with ``cat``.

    NOTE: sorting discards the conventional gene/transcript/exon ordering of
    the records.
    """
    if input_gtf.endswith(".gz"):
        reader = "zcat"
    else:
        reader = "cat"

    sort_command = '{0} {1} | sort -k1,1 -k4,5n | bgzip -c > {2}'.format(
        reader, input_gtf, output_gtf)
    index_command = "tabix -p gff {0}".format(output_gtf)

    # Both commands rely on shell features (pipes / redirection).
    log_subprocess.check_call(sort_command, shell=True)
    log_subprocess.check_call(index_command, shell=True)
def output_restrict_location_vcf(vcf_path, bed_path, genome, out_path):
    """Write a VCF restricted to the locations listed in a BED file.

    Runs ``vcfintersect -b <bed> -r <ref> <vcf>`` and redirects its stdout to
    `out_path`.  The command line is echoed to stdout before it is run.

    Args:
        vcf_path: input VCF.
        bed_path: BED file with the regions to keep.
        genome: genome name; the reference FASTA is looked up at
            ``FASTA_LOCATION + genome + '/' + genome + '.fa'``.
        out_path: path of the VCF to write (overwritten if present).

    The output file is closed even if ``vcfintersect`` fails; the exception
    from ``log_subprocess.check_call`` is propagated.
    """
    ref_path = FASTA_LOCATION + genome + '/' + genome + '.fa'
    # Build the command once so the echoed line can never drift from what
    # is actually executed.
    args = ['vcfintersect', '-b', bed_path, '-r', ref_path, vcf_path]

    with open(out_path, 'w') as out_file:
        # Parenthesised single-argument form prints identically under
        # Python 2 and Python 3.
        print(' '.join(args))
        log_subprocess.check_call(args, stdout=out_file)
def sort_by_name(file_name, sorted_prefix=None):
    """Sort a BAM file by read name (``samtools sort -n``), for paired-end data.

    Args:
        file_name: path of the BAM file to sort.
        sorted_prefix: output path without the ``.bam`` extension.  Defaults
            to `file_name` with every ``.bam`` occurrence removed and
            ``_namesorted`` appended.

    Returns:
        A ``pysam.Samfile`` opened for binary reading on the name-sorted BAM
        (``<sorted_prefix>.bam``).
    """
    if sorted_prefix is None:
        sorted_prefix = file_name.replace('.bam', '') + '_namesorted'

    sorted_name = sorted_prefix + '.bam'

    # NOTE -- need to update our internal samtools in order to use pysam.sort
    # Use the explicit ``-o <out.bam>`` form, matching sort() in this module.
    # The legacy ``samtools sort <in.bam> <out_prefix>`` positional syntax is
    # rejected by current samtools releases.
    log_subprocess.check_call(
        ['samtools', 'sort', '-n', '-o', sorted_name, file_name])

    return pysam.Samfile(sorted_name, 'rb')
def concatenate(out_file_name, all_in_file_names):
    """Concatenate a list of BAM files into `out_file_name`.

    Empty BAMs are skipped because they make ``samtools cat`` emit a BAM
    with a premature end block.  With two or more non-empty inputs
    ``samtools cat`` is used; with exactly one, that file is copied; when
    every input is empty the first input is copied so that an output still
    exists.
    """
    non_empty = [path for path in all_in_file_names if not bam_is_empty(path)]

    if not non_empty:
        # All inputs are empty: any of them is a valid (empty) result.
        shutil.copy(all_in_file_names[0], out_file_name)
        return

    if len(non_empty) == 1:
        shutil.copy(non_empty[0], out_file_name)
        return

    command = ['samtools', 'cat', '-o', out_file_name] + non_empty
    log_subprocess.check_call(command)
def _merge_by_tag(output_bam, input_bams, tag, name=False, threads=1):
    """Merge BAM files with ``samtools merge``, optionally ordered by a tag.

    The input names are handed to samtools through a file-of-filenames
    (``-b``) because passing them on the command line can fail when the
    total command length gets too long.

    Args:
        output_bam: path of the merged BAM to write.
        input_bams: iterable of input BAM paths.
        tag: if not None, passed to samtools as ``-t <tag>``.
        name: when true, pass ``-n`` to samtools.
        threads: total thread count; ``-@ threads-1`` is passed when > 1
            because ``-@`` counts *additional* threads.

    Raises:
        RuntimeError: if ``<output_bam>.fofn`` already exists.
        Whatever ``log_subprocess.check_call`` raises when the merge fails.
    """
    fofn = output_bam + ".fofn"
    if os.path.exists(fofn):
        raise RuntimeError("{} already exists".format(fofn))

    args = ["samtools", "merge", "-c", "-p", "-s", "0"]
    if threads > 1:
        # the -@ specifies additional threads
        args.extend(["-@", str(threads - 1)])
    if name:
        args.append("-n")
    if tag is not None:
        args.extend(["-t", str(tag)])
    args.extend(["-b", fofn, output_bam])

    try:
        with open(fofn, "w") as fh:
            fh.write("\n".join(input_bams))
        log_subprocess.check_call(args)
    finally:
        # Always remove the temporary list; a leftover file would make every
        # retry fail with "already exists" above.
        if os.path.exists(fofn):
            os.remove(fofn)
def sort_unique_tabix_vcf(vcf):
    """Sort, uniqueify, non-destructively bgzip, and tabix a VCF.

    The VCF at `vcf` is rewritten in place with its records passed through
    ``vcfstreamsort | vcfuniq``; a bgzipped copy ``<vcf>.gz`` and its tabix
    index are then created next to it.

    Args:
        vcf: path of the VCF file to process.
    """
    # Strip only a literal trailing '.vcf'.  str.rstrip('.vcf') would strip
    # any run of the characters '.', 'v', 'c', 'f' (e.g. 'calc.vcf' -> 'cal').
    suffix = '.vcf'
    base = vcf[:-len(suffix)] if vcf.endswith(suffix) else vcf
    tmp = base + '.tmp.sorted.unique.vcf'

    try:
        # Needs a shell for the pipeline and the redirection.
        log_subprocess.check_call(
            'cat {0} | vcfstreamsort | vcfuniq > {1}'.format(vcf, tmp),
            shell=True)
        # Argument lists (no shell) so unusual characters in the paths are
        # passed through untouched.
        subprocess.check_call(['cp', tmp, vcf])
    finally:
        # Remove the temp file on failure as well as on success.
        subprocess.check_call(['rm', '-f', tmp])

    log_subprocess.check_call("bgzip -c {0} > {0}.gz".format(vcf), shell=True)
    log_subprocess.check_call("tabix -p vcf {0}.gz".format(vcf), shell=True)
def bwa_index_ref(ref_fasta):
    """Build the bwa index for a reference (``bwa index -a bwtsw``).

    `ref_fasta` is the path to the reference FASTA.  The index only has to
    be built once per reference; bwa writes the index files into the same
    directory as the reference.
    """
    command = ['bwa', 'index']
    command.extend(['-a', 'bwtsw'])
    command.append(ref_fasta)
    log_subprocess.check_call(command)
def bwa_align_unpaired(ref_fasta, read_fastq, out_name, algorithm='ALN',
                       max_hits=None, read_group_header=None, num_threads=24):
    """Run the bwa aligner on reads without using pairing information.

    Args:
        ref_fasta: path to the (bwa-indexed) reference FASTA.
        read_fastq: path to a single FASTQ file; a list is not accepted.
        out_name: path of the final BAM file.  Temporary ``<out_name>.sam``
            (and ``<out_name>.sai`` for ALN) files are created and removed.
        algorithm: ``'MEM'`` (``bwa mem``) or ``'ALN'``
            (``bwa aln`` followed by ``bwa samse``).
        max_hits: ALN only; if truthy, passed to ``bwa samse`` as ``-n``.
        read_group_header: MEM only; passed to ``bwa mem -R``.  Defaults to
            ``tk_bam.make_rg_header()``.
        num_threads: value for bwa's ``-t`` option.

    Raises:
        NotSupportedException: if `algorithm` is neither 'MEM' nor 'ALN'.
    """
    assert not isinstance(read_fastq, list)

    if read_group_header is None:
        read_group_header = tk_bam.make_rg_header()

    # Temp file name shared by both algorithms
    sam_name = out_name + '.sam'

    if algorithm == 'MEM':
        # `with` closes the handle even if bwa fails.
        with open(sam_name, 'w') as sam_out_file:
            log_subprocess.check_call([
                'bwa', 'mem', '-t', str(num_threads), '-M', '-R',
                read_group_header, ref_fasta, read_fastq
            ], stdout=sam_out_file)

        # Create final bam file from the sam file
        tk_bam.convert_to_bam(sam_name, out_name)

        # Remove temp files
        subprocess.check_call(['rm', sam_name])

    elif algorithm == 'ALN':
        index_name = out_name + '.sai'

        with open(index_name, 'w') as index_file:
            log_subprocess.check_call(
                ['bwa', 'aln', '-t', str(num_threads), ref_fasta, read_fastq],
                stdout=index_file)

        # Build the samse command once; -n is only added when requested.
        samse_args = ['bwa', 'samse']
        if max_hits:
            samse_args.extend(['-n', str(max_hits)])
        samse_args.extend([ref_fasta, index_name, read_fastq])

        with open(sam_name, 'w') as sam_out_file:
            log_subprocess.check_call(samse_args, stdout=sam_out_file)

        # Create final bam file from the sam file
        tk_bam.convert_to_bam(sam_name, out_name)

        # Remove temp files
        subprocess.check_call(['rm', index_name])
        subprocess.check_call(['rm', sam_name])

    else:
        raise NotSupportedException('Unsupported bwa algorithm: ' + algorithm)
def _bwa_mem_to_bam(bwa_args, out_name):
    """Run the `bwa_args` command and pipe its SAM output through
    ``samtools view -bSh`` into the BAM file `out_name`.

    samtools' stderr is captured in ``<out_name>_ERRORS``.  Raises
    ``subprocess.CalledProcessError`` if bwa exits with a non-zero status.
    """
    with open(out_name, 'w') as out_file:
        with open(out_name + '_ERRORS', 'w') as errors_file:
            # NOTE(review): assumes log_subprocess.Popen returns a
            # subprocess.Popen-like object (has .stdout and .wait()).
            ps = log_subprocess.Popen(bwa_args, stdout=subprocess.PIPE)
            try:
                log_subprocess.check_call(
                    ['samtools', 'view', '-bSh', '-'],
                    stdin=ps.stdout, stdout=out_file, stderr=errors_file)
            finally:
                # Close our copy of the pipe so bwa cannot block on a dead
                # reader, then reap it so no zombie process is left behind.
                ps.stdout.close()
                bwa_status = ps.wait()

    # A failing bwa must not silently yield a truncated BAM.
    if bwa_status != 0:
        raise subprocess.CalledProcessError(bwa_status, bwa_args)


def bwa_align_paired(ref_fasta, read_fastq, out_name, algorithm='ALN',
                     max_hits=None, read_group_header=None, num_threads=24):
    """Run the bwa paired-end aligner on reads using pairing information.

    Algorithm choices are currently
        MEM: Maximal Exact Matching (better for longer reads)
        ALN: backtrack algorithm (better for shorter reads)
    Haven't yet implemented BWA-SW.

    Args:
        ref_fasta: path to the (bwa-indexed) reference FASTA.
        read_fastq: path to an interleaved FASTQ (the reads of a pair
            alternate).  For MEM only, a list of exactly two FASTQ paths
            (read 1, read 2) is also accepted.
        out_name: path of the final BAM file.  MEM additionally writes
            samtools' stderr to ``<out_name>_ERRORS``; ALN creates and
            removes temporary ``<out_name>{1,2}.fastq``,
            ``<out_name>{1,2}.sai`` and ``<out_name>.sam`` files.
        max_hits: ALN only; if truthy, passed to ``bwa sampe`` as ``-n``.
        read_group_header: MEM only; passed to ``bwa mem -R``.  Defaults to
            ``tk_bam.make_rg_header()``.
        num_threads: value for bwa's ``-t`` option.

    Raises:
        NotSupportedException: if `algorithm` is neither 'MEM' nor 'ALN'.
        subprocess.CalledProcessError: (MEM) if bwa exits non-zero.
    """
    if read_group_header is None:
        read_group_header = tk_bam.make_rg_header()

    if algorithm == 'MEM':
        bwa_args = ['bwa', 'mem']
        if isinstance(read_fastq, list):
            assert (len(read_fastq) == 2)
            fastq_args = [read_fastq[0], read_fastq[1]]
        else:
            # -p tells bwa mem that the single input file is interleaved
            bwa_args.append('-p')
            fastq_args = [read_fastq]

        # -M marks shorter split hits as secondary alignments
        bwa_args.extend(['-t', str(num_threads), '-M', '-R',
                         read_group_header, ref_fasta])
        bwa_args.extend(fastq_args)

        _bwa_mem_to_bam(bwa_args, out_name)

    elif algorithm == 'ALN':
        # Temp file names
        temp_fastq_name1 = out_name + '1.fastq'
        temp_fastq_name2 = out_name + '2.fastq'
        index_name_1 = out_name + '1.sai'
        index_name_2 = out_name + '2.sai'
        sam_name = out_name + '.sam'

        # Create the temp non-interleaved files
        with open(read_fastq, 'r') as in_fastq:
            with open(temp_fastq_name1, 'w') as temp_fastq1:
                with open(temp_fastq_name2, 'w') as temp_fastq2:
                    tk_fasta.uninterleave_fastq(in_fastq, temp_fastq1,
                                                temp_fastq2)

        # Create the bwa index files, one per read of the pair
        for fastq_name, index_name in ((temp_fastq_name1, index_name_1),
                                       (temp_fastq_name2, index_name_2)):
            with open(index_name, 'w') as index_file:
                log_subprocess.check_call([
                    'bwa', 'aln', '-t', str(num_threads), ref_fasta,
                    fastq_name
                ], stdout=index_file)

        # Pair the two alignments into a SAM file
        sampe_args = ['bwa', 'sampe']
        if max_hits:
            sampe_args.extend(['-n', str(max_hits)])
        sampe_args.extend([ref_fasta, index_name_1, index_name_2,
                           temp_fastq_name1, temp_fastq_name2])

        with open(sam_name, 'w') as sam_out_file:
            log_subprocess.check_call(sampe_args, stdout=sam_out_file)

        # Create final bam file from the sam file
        tk_bam.convert_to_bam(sam_name, out_name)

        # Clean up temporary files
        for temp_name in (temp_fastq_name1, temp_fastq_name2,
                          index_name_1, index_name_2, sam_name):
            subprocess.check_call(['rm', temp_name])

    else:
        raise NotSupportedException('Unsupported bwa algorithm: ' + algorithm)
def remove_dups(in_name, out_name):
    """Remove paired-end duplicates with ``samtools rmdup``.

    Reads the BAM at `in_name` and writes the de-duplicated BAM to
    `out_name`.
    """
    command = ['samtools', 'rmdup']
    command.extend([in_name, out_name])
    log_subprocess.check_call(command)
def merge_by_name(out_file_name, in_file_names):
    """Merge name-sorted BAM files into one BAM sorted by name.

    Runs ``samtools merge -n <out_file_name> <inputs...>``.
    """
    command = ['samtools', 'merge', '-n', out_file_name] + list(in_file_names)
    log_subprocess.check_call(command)
def split_alt_alleles_vcf(vcf_path, out_path):
    """Split VCF records that have more than one ALT allele.

    Runs ``vcfbreakmulti <vcf_path>`` and redirects its stdout to
    `out_path`.

    Args:
        vcf_path: input VCF.
        out_path: path of the VCF to write (overwritten if present).

    The output file is closed when the command finishes or fails; the
    exception from ``log_subprocess.check_call`` is propagated.
    """
    # The handle must be closed explicitly so the output is complete on disk
    # and the descriptor is not leaked.
    with open(out_path, 'w') as out_file:
        log_subprocess.check_call(['vcfbreakmulti', vcf_path], stdout=out_file)
def output_primitives_vcf(vcf_path, out_path):
    """Decompose all complex variants into SNP and indel primitives.

    Runs ``vcfallelicprimitives <vcf_path>`` and redirects its stdout to
    `out_path`.

    Args:
        vcf_path: input VCF.
        out_path: path of the VCF to write (overwritten if present).

    The output file is closed when the command finishes or fails; the
    exception from ``log_subprocess.check_call`` is propagated.
    """
    # The handle must be closed explicitly so the output is complete on disk
    # and the descriptor is not leaked.
    with open(out_path, 'w') as out_file:
        log_subprocess.check_call(
            ['vcfallelicprimitives', vcf_path], stdout=out_file)