def run_mutect_paired(
        normal_bam_file,
        tumour_bam_file,
        ref_genome_fasta_file,
        region,
        out_file,
        normal_name='normal',
        tumour_name='tumour'):
    """ Run ``gatk Mutect2`` on a tumour/normal BAM pair for one region.

    :param normal_bam_file: Path of the normal BAM file (second ``-I``).
    :param tumour_bam_file: Path of the tumour BAM file (first ``-I``).
    :param ref_genome_fasta_file: Path passed to Mutect2 as ``-R``.
    :param region: Value passed to Mutect2 as ``-L``.
    :param out_file: Path passed to Mutect2 as ``-O``.
    :param normal_name: Value passed as ``-normal``.
    :param tumour_name: Value passed as ``-tumor``.
    """
    reference_args = ['-R', ref_genome_fasta_file]
    tumour_args = ['-I', tumour_bam_file, '-tumor', tumour_name]
    normal_args = ['-I', normal_bam_file, '-normal', normal_name]
    target_args = ['-L', region, '-O', out_file]

    mutect_args = reference_args + tumour_args + normal_args + target_args
    cli.execute('gatk', 'Mutect2', *mutect_args)

    # Remove the '.idx' companion of the output if one exists after the run.
    companion_index = out_file + '.idx'
    if os.path.exists(companion_index):
        os.unlink(companion_index)
def rename_chroms(chrom_map_file, in_file, out_file):
    """ Rename the chromosomes of a VCF/BCF file with ``bcftools annotate``.

    The renamed records are piped through ``bcftools view`` and written to
    ``out_file`` as BCF (``-O b``) when ``_detect_file_type(out_file)``
    returns ``'bcf'``, otherwise as compressed VCF (``-O z``).

    :param chrom_map_file: Path passed to ``--rename-chrs``.
    :param in_file: Path of the input file. If ``in_file + '.csi'`` does not
        exist, an index is built with ``bcftools index`` and removed again
        before returning.
    :param out_file: Path the renamed output is redirected to.
    """
    index_file = in_file + '.csi'

    tmp_index = not os.path.exists(index_file)
    if tmp_index:
        cli.execute('bcftools', 'index', in_file)

    try:
        cmd = [
            'bcftools', 'annotate', '--rename-chrs', chrom_map_file, in_file,
            '|',
            'bcftools', 'view',
        ]

        if _detect_file_type(out_file) == 'bcf':
            cmd.extend(['-O', 'b'])
        else:
            cmd.extend(['-O', 'z'])

        cmd.extend(['>', out_file])

        cli.execute(*cmd)
    finally:
        # Clean up the index we created even when the pipeline fails, so a
        # failed run does not leave a stray '.csi' next to the caller's input.
        if tmp_index and os.path.exists(index_file):
            os.unlink(index_file)
def split_vcf(in_file, out_file_callback, tmp_dir, split_size=int(1e5)):
    """ Split a VCF file into chunks with ``SnpSift split -l``.

    The input is hard-linked into ``tmp_dir``, split there, and each
    resulting ``*.vcf`` chunk is moved to ``out_file_callback[idx]`` where
    ``idx`` is the chunk's position in sorted file-name order.

    :param in_file: Path of the VCF file to split. Must be hard-linkable into
        ``tmp_dir`` (``os.link`` is used).
    :param out_file_callback: Object indexed by chunk number to obtain the
        destination path of each chunk.
    :param tmp_dir: Scratch directory. Deleted if it already exists, recreated,
        and deleted again at the end.
    :param split_size: Value passed to SnpSift's ``-l`` option.
    """
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)

    os.makedirs(tmp_dir)

    tmp_file = os.path.join(tmp_dir, os.path.basename(in_file))

    os.link(in_file, tmp_file)

    cli.execute('SnpSift', 'split', '-l', split_size, tmp_file)

    # Cut only the file name at its first '.', never the directory part:
    # splitting the full path breaks as soon as tmp_dir itself contains a dot
    # (e.g. '/scratch/run.1/'), because the glob below then looks in the
    # wrong place and silently finds no chunks.
    base_prefix = os.path.basename(tmp_file).split('.')[0]
    prefix = os.path.join(tmp_dir, base_prefix)

    # Remove the linked input first so it cannot match the chunk glob.
    os.unlink(tmp_file)

    tmp_split_files = sorted(glob.glob(prefix + '*.vcf'))

    for idx, tmp_out_file in enumerate(tmp_split_files):
        shutil.move(tmp_out_file, out_file_callback[idx])

    shutil.rmtree(tmp_dir)
def markdups(in_files, out_file, tmp_dir, threads=1):
    """ Merge BAM files and mark duplicate reads with ``sambamba markdup``.

    :param in_files: The path of a BAM file or dictionary with values being
        paths of BAM files. If a dictionary is passed the keys are ignored.
        The BAM files must be coordinate sorted.
    :param out_file: Path where merged and duplicate marked BAM file will be
        written.
    :param tmp_dir: Path handed to sambamba as ``--tmpdir``. Any existing
        directory at this path is deleted before the run, and the path is
        deleted again afterwards.
    :param threads: Number of threads to use.
    """
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)

    base_args = ['sambamba', 'markdup', '-t', threads, '--tmpdir', tmp_dir]

    # Fixed buffer and table sizes passed to sambamba on every run.
    tuning_args = [
        '--sort-buffer-size', 2048,
        '--io-buffer-size', 640,
        '--overflow-list-size', 1000000,
        '--hash-table-size', 262144,
    ]

    full_cmd = base_args + tuning_args
    full_cmd += list(flatten_input(in_files))
    full_cmd += [out_file]

    cli.execute(*full_cmd)

    shutil.rmtree(tmp_dir)
def build_coverage_file(normal_wig, tumour_wig, gc_wig, mappability_wig, out_file, target_file=None):
    """ Run the packaged ``build_titan_coverage.py`` script.

    The script is located inside the ``soil`` package with ``pkg_resources``
    and invoked through ``python``.

    :param normal_wig: Path passed to the script as ``-n``.
    :param tumour_wig: Path passed to the script as ``-t``.
    :param gc_wig: Path passed to the script as ``-g``.
    :param mappability_wig: Path passed to the script as ``-m``.
    :param out_file: Path passed to the script as ``-o``.
    :param target_file: Optional path passed as ``--target-bed-file``. The
        option is omitted when this is ``None``.
    """
    script_path = pkg_resources.resource_filename(
        'soil', 'wrappers/titan/scripts/build_titan_coverage.py')

    if target_file is None:
        target_args = []
    else:
        target_args = ['--target-bed-file', target_file]

    required_args = [
        '-n', normal_wig,
        '-t', tumour_wig,
        '-g', gc_wig,
        '-m', mappability_wig,
        '-o', out_file,
    ]

    cli.execute('python', script_path, *(required_args + target_args))
def bam_readcount(config, bam_type, bam, result, tmp_file):
    """ Run ``bam-readcount`` at one position and store the counts in ``result``.

    The position queried is ``result['chr']:result['pos']-result['pos']``.
    For every row of the tool's output the following keys are written into
    ``result`` (each prefixed with ``bam_type + '_'``):

    * ``coverage`` - fourth whitespace-separated field, as a float;
    * one key per entry in fields 6-10, named by the text before the first
      ``:`` of the entry and holding the value after it, as a float;
    * ``vaf`` - the count stored for ``result['alt']`` divided by coverage.

    :param config: Mapping providing ``map_q``, ``base_q`` and
        ``reference_genome``.
    :param bam_type: Prefix used for the keys added to ``result``.
    :param bam: Path of the BAM file to query.
    :param result: Mapping with ``chr``, ``pos`` and ``alt``; updated in place.
    :param tmp_file: Path the ``bam-readcount`` output is redirected to.
    :raises KeyError: If no count was reported for ``result['alt']``.
    :raises ZeroDivisionError: If the reported coverage is zero.
    """
    # format() also accepts a non-string position, which plain '+'
    # concatenation rejected with a TypeError.
    region = '{0}:{1}-{1}'.format(result['chr'], result['pos'])

    execute(
        'bam-readcount',
        '-d', 50000,
        '-w', 0,
        '--min-mapping-quality', config['map_q'],
        '--min-base-quality', config['base_q'],
        '-f', config['reference_genome'],
        bam,
        region,
        '>', tmp_file
    )

    coverage_key = bam_type + '_coverage'

    # Text mode: in binary mode Python 3 yields bytes rows, and splitting
    # those on the str ':' raises TypeError.
    with open(tmp_file, 'r') as temp:
        for row in temp:
            read_counts = row.split()

            result[coverage_key] = float(read_counts[3])

            for nucleotide_count in read_counts[5:10]:
                fields = nucleotide_count.split(':')
                result[bam_type + '_' + fields[0]] = float(fields[1])

            # Computed per row; with several rows the last one wins.
            alt_count = result[bam_type + '_' + result['alt']]
            result[bam_type + '_vaf'] = alt_count / result[coverage_key]
def logd(log_density_csv, trace_log_density_pdf, docker_image=None):
    """ Run the ``logd.R`` script.

    :param log_density_csv: Path passed to the script as ``-f``.
    :param trace_log_density_pdf: Path passed to the script as ``-o``.
    :param docker_image: Forwarded unchanged to ``execute`` as the
        ``docker_image`` keyword argument.
    """
    script_cmd = ['logd.R', '-f', log_density_csv, '-o', trace_log_density_pdf]

    execute(*script_cmd, docker_image=docker_image)
def run_feature_counts(in_file, gene_gtf_file, out_file, tmp_dir):
    """ Run featureCounts command to get the number of reads covering genes.

    :param in_file: Path to WTS BAM file to count reads from.
    :param gene_gtf_file: Path to file with gene annotations in GTF format.
    :param out_file: Path where output will be written in tsv format.
    :param tmp_dir: Temporary directory for featureCounts. Deleted first if it
        already exists, and removed again once the counts have been moved to
        ``out_file``.
    """
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    counts_path = os.path.join(tmp_dir, 'counts.tsv')

    cli.execute('featureCounts', '-a', gene_gtf_file, '-o', counts_path, in_file)

    # Only the counts table is kept; anything else left in tmp_dir is
    # discarded together with the directory.
    shutil.move(counts_path, out_file)
    shutil.rmtree(tmp_dir)
def run_transdecoder(in_file, out_cds_fasta_file, out_protein_fasta_file, out_gff_file, tmp_dir):
    """ Run ``TransDecoder.LongOrfs`` and ``TransDecoder.Predict`` on a FASTA file.

    The input is copied to ``tmp_dir/cdna.fasta`` and both tools are run with
    ``tmp_dir`` as the working directory. The ``.transdecoder.cds``,
    ``.transdecoder.pep`` and ``.transdecoder.gff3`` files found next to the
    copied input are then moved to the requested output paths.

    :param in_file: Path of the input FASTA file.
    :param out_cds_fasta_file: Destination for the ``.transdecoder.cds`` file.
    :param out_protein_fasta_file: Destination for the ``.transdecoder.pep``
        file.
    :param out_gff_file: Destination for the ``.transdecoder.gff3`` file.
    :param tmp_dir: Scratch directory. Deleted if it already exists, recreated,
        and deleted again at the end.
    """
    # Resolve once, up front: after os.chdir() below a relative tmp_dir (and
    # every path built from it) would no longer point at the same location.
    tmp_dir = os.path.abspath(tmp_dir)

    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)

    os.makedirs(tmp_dir)

    tmp_in_file = os.path.join(tmp_dir, 'cdna.fasta')

    shutil.copyfile(in_file, tmp_in_file)

    old_wd = os.getcwd()

    os.chdir(tmp_dir)

    try:
        cli.execute('TransDecoder.LongOrfs', '-t', tmp_in_file)

        cli.execute('TransDecoder.Predict', '-t', tmp_in_file)
    finally:
        # Always restore the caller's working directory, including when one
        # of the TransDecoder steps fails.
        os.chdir(old_wd)

    shutil.move(tmp_in_file + '.transdecoder.cds', out_cds_fasta_file)

    shutil.move(tmp_in_file + '.transdecoder.pep', out_protein_fasta_file)

    shutil.move(tmp_in_file + '.transdecoder.gff3', out_gff_file)

    shutil.rmtree(tmp_dir)
def LoLoPicker_somatic(config, tumour_bam, normal_bam, region_bed, temp_dir, somatic_file):
    """ Run ``LoLoPicker_somatic.py`` and copy its raw call table to ``somatic_file``.

    :param config: Mapping providing ``T_vaf_threshold``, ``N_vaf_threshold``,
        ``base_q``, ``map_q`` and ``reference_genome``.
    :param tumour_bam: Path passed to the script as ``-t``.
    :param normal_bam: Path passed to the script as ``-n``.
    :param region_bed: Path passed to the script as ``-b``.
    :param temp_dir: Directory created with ``os.makedirs`` and passed as
        ``-o``; it must not already exist.
    :param somatic_file: Destination for ``raw_somatic_varants.txt`` from
        ``temp_dir``.
    """
    os.makedirs(temp_dir)

    filter_args = [
        '--tumoralteredratio', config['T_vaf_threshold'],
        '--normalalteredratio', config['N_vaf_threshold'],
        '--basequality', config['base_q'],
        '--mappingquality', config['map_q'],
    ]
    io_args = [
        '-t', tumour_bam,
        '-n', normal_bam,
        '-r', config['reference_genome'],
        '-b', region_bed,
        '-o', temp_dir,
    ]

    execute('LoLoPicker_somatic.py', *(filter_args + io_args))

    # The file name (including its spelling) is the one read from temp_dir.
    raw_calls = os.path.join(temp_dir, 'raw_somatic_varants.txt')
    copyfile(raw_calls, somatic_file)
def annotate_outputs(config, temp_space, input_file, output_txt):
    """ Annotate a VCF with ANNOVAR and copy the resulting table to ``output_txt``.

    Two scripts from the ``config['annovar']`` directory are run in turn:
    ``convert2annovar.pl`` (output redirected to ``temp_space/anno_in``) and
    ``table_annovar.pl`` on that file with build ``hg19``.

    :param config: Mapping providing ``annovar`` (script directory) and
        ``annovar_humandb`` (passed to ``table_annovar.pl``).
    :param temp_space: Directory created with ``os.makedirs``; it must not
        already exist.
    :param input_file: Path of the input passed to ``convert2annovar.pl`` with
        ``-format vcf4``.
    :param output_txt: Destination for ``ANNO.hg19_multianno.txt``.
    """
    os.makedirs(temp_space)

    annovar_input = os.path.join(temp_space, 'anno_in')
    convert_script = os.path.join(config['annovar'], 'convert2annovar.pl')

    execute(
        convert_script,
        '-format', 'vcf4',
        '-allsample',
        '-withfreq',
        input_file,
        '>', annovar_input
    )

    table_script = os.path.join(config['annovar'], 'table_annovar.pl')
    out_prefix = os.path.join(temp_space, 'ANNO')

    execute(
        table_script,
        annovar_input,
        config['annovar_humandb'],
        '-buildver', 'hg19',
        '-out', out_prefix,
        '-remove',
        '-protocol', 'refGene,cytoBand',
        '-operation', 'g,r',
        '-nastring', '.',
        '-polish'
    )

    annotated_table = os.path.join(temp_space, 'ANNO.hg19_multianno.txt')
    copyfile(annotated_table, output_txt)
def data_exploration(straighten_jitter, output, docker_image=None):
    """ Run the ``data_exploration.R`` script.

    :param straighten_jitter: Path passed to the script as ``-f``.
    :param output: Path passed to the script as ``-o``.
    :param docker_image: Forwarded unchanged to ``execute`` as the
        ``docker_image`` keyword argument.
    """
    script_cmd = ['data_exploration.R', '-f', straighten_jitter, '-o', output]

    execute(*script_cmd, docker_image=docker_image)
def convert_msgf_to_pin(in_decoy_files, in_target_files, out_file, tmp_space):
    """ Convert MS-GF+ result files to a single pin file with ``msgf2pin``.

    Each input file is hard-linked into ``tmp_space`` as
    ``<decoy|target>_<split_id>.mzid``, and the linked paths are listed one
    per line in ``decoy.txt`` / ``target.txt``, which are then handed to
    ``msgf2pin``.

    :param in_decoy_files: Dictionary mapping split id to decoy file path.
    :param in_target_files: Dictionary mapping split id to target file path.
    :param out_file: Path passed to ``msgf2pin`` as ``-o``.
    :param tmp_space: Scratch directory. Deleted if it already exists,
        recreated, and deleted again at the end.
    """
    def link_tmp_files(file_list, meta_file, prefix):
        # Link every input under a uniform name and record the new paths.
        linked_paths = []

        for split_id, source_path in file_list.items():
            link_name = '{0}_{1}.mzid'.format(prefix, split_id)
            link_path = os.path.join(tmp_space, link_name)

            os.link(source_path, link_path)

            linked_paths.append(link_path)

        with open(meta_file, 'w') as meta_fh:
            meta_fh.write('\n'.join(linked_paths))

    if os.path.exists(tmp_space):
        shutil.rmtree(tmp_space)
    os.makedirs(tmp_space)

    decoy_meta_file = os.path.join(tmp_space, 'decoy.txt')
    target_meta_file = os.path.join(tmp_space, 'target.txt')

    link_tmp_files(in_decoy_files, decoy_meta_file, 'decoy')
    link_tmp_files(in_target_files, target_meta_file, 'target')

    cli.execute('msgf2pin', target_meta_file, decoy_meta_file, '-o', out_file)

    shutil.rmtree(tmp_space)
def trim_fastq(fastq_1, fastq_2, temp_dir, output_fastq_1, output_fastq_2):
    """ Trim a FASTQ pair with ``trim_galore --paired`` and copy out the results.

    13 bases are clipped from both ends of both reads. The trimmed files are
    looked up in ``temp_dir`` under the input base names with ``.fastq.gz``
    replaced by ``_val_1.fq.gz`` / ``_val_2.fq.gz``.

    :param fastq_1: Path of the first-read FASTQ file.
    :param fastq_2: Path of the second-read FASTQ file.
    :param temp_dir: Directory created with ``os.makedirs`` and passed as
        ``--output_dir``; it must not already exist.
    :param output_fastq_1: Destination for the trimmed first-read file.
    :param output_fastq_2: Destination for the trimmed second-read file.
    """
    os.makedirs(temp_dir)

    clip_length = 13
    clip_args = []
    for clip_option in ('--clip_R1', '--three_prime_clip_R1',
                        '--clip_R2', '--three_prime_clip_R2'):
        clip_args.extend([clip_option, clip_length])

    trim_cmd = ['trim_galore', '--paired', '--output_dir', temp_dir]
    trim_cmd += clip_args
    trim_cmd += [fastq_1, fastq_2]

    execute(*trim_cmd)

    trimmed_name_1 = os.path.basename(fastq_1).replace(".fastq.gz", "_val_1.fq.gz")
    trimmed_name_2 = os.path.basename(fastq_2).replace(".fastq.gz", "_val_2.fq.gz")

    for trimmed_name, destination in ((trimmed_name_1, output_fastq_1),
                                      (trimmed_name_2, output_fastq_2)):
        copyfile(os.path.join(temp_dir, trimmed_name), destination)
def index_bam(input_bam, output_bai):
    """ Run ``samtools index`` on a BAM file.

    :param input_bam: Path of the BAM file to index.
    :param output_bai: Path given to ``samtools index`` as the index output.
    """
    index_cmd = ['samtools', 'index', input_bam, output_bai]

    execute(*index_cmd)
def count_fasta_bases(ref_genome_fasta_file, out_file):
    """ Run ``countFastaBases`` on a FASTA file and redirect its output.

    The executable is looked up with ``soil.utils.file_system.find`` under
    ``$CONDA_PREFIX/share``.

    :param ref_genome_fasta_file: Path passed to ``countFastaBases``.
    :param out_file: Path the command's output is redirected to.
    :raises KeyError: If the ``CONDA_PREFIX`` environment variable is unset.
    """
    conda_prefix = os.environ['CONDA_PREFIX']

    executable = soil.utils.file_system.find(
        'countFastaBases', os.path.join(conda_prefix, 'share'))

    cli.execute(executable, ref_genome_fasta_file, '>', out_file)
def configure_iedb_module(in_sentinel, out_sentinel):
    """ Run ``tcsh configure`` in the directory containing ``in_sentinel``.

    On success an empty file is created at ``out_sentinel``. The process
    working directory is restored before returning, whether or not the
    configure step succeeds.

    :param in_sentinel: Path of a file inside the IEDB module directory; only
        its directory part is used.
    :param out_sentinel: Path of the empty marker file to create. A relative
        path is resolved against the IEDB directory.
    """
    # dirname() is '' for a bare file name; fall back to the current directory
    # instead of letting os.chdir('') raise.
    iedb_dir = os.path.dirname(in_sentinel) or os.curdir

    old_wd = os.getcwd()

    os.chdir(iedb_dir)

    try:
        cli.execute('tcsh', 'configure')

        # Created while still inside iedb_dir so that a relative out_sentinel
        # lands in the same place as it always has.
        with open(out_sentinel, 'w'):
            pass
    finally:
        # Do not leak the directory change into the rest of the process.
        os.chdir(old_wd)
def sort_bam(input_bam, output_sorted_bam):
    """ Run ``samtools sort`` on a BAM file.

    :param input_bam: Path of the BAM file to sort.
    :param output_sorted_bam: Path passed to ``samtools sort`` as ``-o``.
    """
    sort_cmd = ['samtools', 'sort', input_bam, '-o', output_sorted_bam]

    execute(*sort_cmd)
def error_rates_viz(fnr_csv, trace_fnr_pdf, box_fnr_pdf, docker_image=None):
    """ Run the ``error_rates_viz.R`` script.

    :param fnr_csv: Path passed to the script as ``-f``.
    :param trace_fnr_pdf: Path passed to the script as ``-t``.
    :param box_fnr_pdf: Path passed to the script as ``-b``.
    :param docker_image: Forwarded unchanged to ``execute`` as the
        ``docker_image`` keyword argument.
    """
    script_cmd = [
        'error_rates_viz.R',
        '-f', fnr_csv,
        '-t', trace_fnr_pdf,
        '-b', box_fnr_pdf,
    ]

    execute(*script_cmd, docker_image=docker_image)
def convert_gtf_to_gff_file(in_file, out_file):
    """ Run ``cufflinks_gtf_to_alignment_gff3.pl`` on a GTF file.

    The Perl script is looked up with ``soil.utils.file_system.find`` under
    ``$CONDA_PREFIX/opt``.

    :param in_file: Path passed to the script as its only argument.
    :param out_file: Path the script's output is redirected to.
    :raises KeyError: If the ``CONDA_PREFIX`` environment variable is unset.
    """
    conda_prefix = os.environ['CONDA_PREFIX']

    script_path = soil.utils.file_system.find(
        'cufflinks_gtf_to_alignment_gff3.pl', os.path.join(conda_prefix, 'opt'))

    cli.execute('perl', script_path, in_file, '>', out_file)
def sam_to_bam(input_sam, output_bam):
    """ Convert a SAM file to BAM with ``samtools view -bS``.

    :param input_sam: Path of the SAM file to convert.
    :param output_bam: Path passed to ``samtools view`` as ``-o``.
    """
    view_cmd = ['samtools', 'view', '-bS', input_sam, '-o', output_bam]

    execute(*view_cmd)
def vcf_annotate_outputs(config, temp_space, input_file, output_vcf):
    """ Annotate a VCF with ANNOVAR's ``table_annovar.pl`` in ``-vcfinput`` mode.

    :param config: Mapping providing ``annovar`` (script directory) and
        ``annovar_humandb`` (passed to ``table_annovar.pl``).
    :param temp_space: Directory created with ``os.makedirs``; it must not
        already exist.
    :param input_file: Path of the VCF passed to ``table_annovar.pl``.
    :param output_vcf: Destination for ``ANNO.hg19_multianno.vcf``.
    """
    os.makedirs(temp_space)

    table_script = os.path.join(config['annovar'], 'table_annovar.pl')
    out_prefix = os.path.join(temp_space, 'ANNO')

    annotation_args = [
        input_file,
        config['annovar_humandb'],
        '-buildver', 'hg19',
        '-out', out_prefix,
        '-remove',
        '-protocol', 'refGene,cytoBand',
        '-operation', 'g,r',
        '-nastring', '.',
        '-polish',
        '-vcfinput',
    ]

    execute(table_script, *annotation_args)

    annotated_vcf = os.path.join(temp_space, 'ANNO.hg19_multianno.vcf')
    copyfile(annotated_vcf, output_vcf)
def fastq_to_sam(ref_genome, fastq_1, fastq_2, output_sam):
    """ Align a FASTQ pair with ``bwa mem`` and redirect the output to a file.

    :param ref_genome: Reference path passed to ``bwa mem``.
    :param fastq_1: Path of the first-read FASTQ file.
    :param fastq_2: Path of the second-read FASTQ file.
    :param output_sam: Path the ``bwa mem`` output is redirected to.
    """
    align_cmd = ['bwa', 'mem', ref_genome, fastq_1, fastq_2]
    redirect = ['>', output_sam]

    execute(*(align_cmd + redirect))
def decompress(in_file, out_file):
    """ Write an uncompressed copy of ``in_file`` to ``out_file``.

    When ``_guess_file_type(in_file)`` returns ``'gz'`` the file is expanded
    with ``gzip -cd``; any other type is copied unchanged with ``cp``.

    :param in_file: Path of the input file.
    :param out_file: Path where the output is written.
    """
    if _guess_file_type(in_file) == 'gz':
        cli.execute('gzip', '-cd', in_file, '>', out_file)
        return

    cli.execute('cp', in_file, out_file)
def compress_vcf(in_file, out_file, index_file=None):
    """ Compress a VCF file using bgzip.

    :param in_file: Path of uncompressed VCF file.
    :param out_file: Path where compressed VCF file will be written.
    :param index_file: Optional index path. When given, ``index_vcf`` is
        called on ``out_file`` with this value; when ``None`` no index is
        built.
    """
    bgzip_cmd = ['bgzip', '-c', in_file, '>', out_file]
    cli.execute(*bgzip_cmd)

    if index_file is None:
        return

    index_vcf(out_file, index_file=index_file)
def merge_normal(config, input_bams, output_file, output_bai):
    """ Merge BAM files with ``samtools merge -f`` and index the result.

    :param config: Not used by this function.
    :param input_bams: Dictionary whose values are the BAM paths to merge; the
        keys are ignored.
    :param output_file: Path of the merged BAM file.
    :param output_bai: Path given to ``samtools index`` as the index output.
    """
    # values() exists on both Python 2 and Python 3 dictionaries, whereas
    # itervalues() is Python 2 only.
    normal_list = list(input_bams.values())

    execute('samtools', 'merge', '-f', output_file, *normal_list)

    execute('samtools', 'index', output_file, output_bai)
def run_pileup2snp(in_file, out_file):
    """ Run ``varscan pileup2snp`` and write its gzip-compressed output.

    :param in_file: Path passed to ``varscan pileup2snp``.
    :param out_file: Path the gzipped output is redirected to.
    """
    varscan_stage = ['varscan', 'pileup2snp', in_file]
    gzip_stage = ['gzip']

    pipeline = varscan_stage + ['|'] + gzip_stage + ['>', out_file]

    cli.execute(*pipeline)
def mpileup2snp(in_file, out_file):
    """ Pipe a pileup file into ``varscan mpileup2snp`` and write VCF output.

    :param in_file: Path of the pileup file. When the file is empty the text
        ``a`` is piped to varscan in its place.
    :param out_file: Path the varscan output is redirected to.
    """
    if os.stat(in_file).st_size == 0:
        # Empty input: feed a placeholder line instead of the file -
        # presumably so varscan still has something to read; verify.
        cmd = ['echo', 'a']
    else:
        cmd = ['cat', in_file]

    # '|' has to be its own list element. Without the comma after it, Python
    # joins the adjacent literals into the single token '|varscan', and the
    # pipe token is never passed on its own like in the other wrappers.
    cmd.extend(
        ['|', 'varscan', 'mpileup2snp', '--output-vcf', 1, '>', out_file])

    cli.execute(*cmd)
def build_alignment_gff(cdna_fasta_file, gff_file, ref_gff_file, out_file):
    """ Run ``cdna_alignment_orf_to_genome_orf.pl`` and redirect its output.

    The Perl script is looked up with ``soil.utils.file_system.find`` under
    ``$CONDA_PREFIX/opt`` and called with ``gff_file``, ``ref_gff_file`` and
    ``cdna_fasta_file``, in that order.

    :param cdna_fasta_file: Path passed as the script's third argument.
    :param gff_file: Path passed as the script's first argument.
    :param ref_gff_file: Path passed as the script's second argument.
    :param out_file: Path the script's output is redirected to.
    :raises KeyError: If the ``CONDA_PREFIX`` environment variable is unset.
    """
    conda_prefix = os.environ['CONDA_PREFIX']

    script_path = soil.utils.file_system.find(
        'cdna_alignment_orf_to_genome_orf.pl', os.path.join(conda_prefix, 'opt'))

    script_inputs = [gff_file, ref_gff_file, cdna_fasta_file]

    cli.execute('perl', script_path, *(script_inputs + ['>', out_file]))
def convert_gtf_to_cdna_fasta(in_file, ref_genome_fasta_file, out_file):
    """ Run ``cufflinks_gtf_genome_to_cdna_fasta.pl`` and redirect its output.

    The Perl script is looked up with ``soil.utils.file_system.find`` under
    ``$CONDA_PREFIX/opt``.

    :param in_file: Path passed as the script's first argument.
    :param ref_genome_fasta_file: Path passed as the script's second argument.
    :param out_file: Path the script's output is redirected to.
    :raises KeyError: If the ``CONDA_PREFIX`` environment variable is unset.
    """
    conda_prefix = os.environ['CONDA_PREFIX']

    script_path = soil.utils.file_system.find(
        'cufflinks_gtf_genome_to_cdna_fasta.pl', os.path.join(conda_prefix, 'opt'))

    script_inputs = [in_file, ref_genome_fasta_file]

    cli.execute('perl', script_path, *(script_inputs + ['>', out_file]))