def run_bcftools_mpileup(work_dir, ref_fa_file, THREADS, SS_dir, suffix):
    '''
    Runs 'bcftools mpileup' in a Docker container on the duplicate-marked
    BAM file of an isolate.
      -Ou = output type: uncompressed BCF
      -f  = faidx-indexed reference sequence file
      -o  = write output to this file
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    param: str THREADS = number of threads available
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str suffix = distinguishes files if more than one reference was
           used for read mapping
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: 'mpileup' + suffix + '.bcf' file
    '''
    print('\nrunning: BCFtools mpileup')

    # BASE_PATH is mounted at BCFtools_WorkingDir; reference, input and
    # output paths are therefore given without the BASE_PATH prefix
    isolate_dir = TEMP_dir + work_dir
    parts = [
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + BASE_PATH + ':' + BCFtools_WorkingDir + '"',
        '-i', BCFtools_image, 'bcftools mpileup',
        '-Ou',
        '--threads', THREADS,
        '-f', REF_dir + SS_dir + ref_fa_file,
        '-o', isolate_dir + 'mpileup' + suffix + '.bcf',
        isolate_dir + 'marked_duplicates' + suffix + '.bam',
    ]
    docker_cmd = ' '.join(parts)

    return_code, std_out, std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    # append the tool's output to the isolate's log file
    log_path = BASE_PATH + isolate_dir + 'log.txt'
    with open(log_path, 'a') as log:
        print('\nbcftools mpileup:\n', std_out, file=log)

    return return_code, std_out, std_err
def run_mark_duplicates(work_dir, suffix):
    """
    Runs Picard MarkDuplicates in a Docker container to mark duplicate reads
    in the BAM file, so that downstream applications can ignore them.
      I = input BAM file
      O = output BAM file with duplicates marked
      M = file that receives the duplication metrics
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str suffix = distinguishes files if more than one reference was
           used for read mapping
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: 'marked_duplicates' + suffix + '.bam' and
            'marked_dup_metrics' + suffix + '.txt' files
    """
    print('\nrunning: Picard MarkDuplicates')

    # the isolate's temp folder is mounted at Picard_WorkingDir, so plain
    # file names are sufficient
    isolate_path = BASE_PATH + TEMP_dir + work_dir
    parts = [
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + isolate_path + ':' + Picard_WorkingDir + '"',
        '-i', Picard_image, 'MarkDuplicates',
        'I=bwa_mapped' + suffix + '.bam',
        'O=marked_duplicates' + suffix + '.bam',
        'M=marked_dup_metrics' + suffix + '.txt',
    ]
    docker_cmd = ' '.join(parts)

    return_code, std_out, std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    with open(isolate_path + 'log.txt', 'a') as log:
        print('\npicard MarkDuplicates:\n', std_out, file=log)

    return return_code, std_out, std_err
def run_samtools_index(work_dir, THREADS, suffix, bam_file):
    '''
    Runs 'samtools index' in a Docker container on a BAM file.
    Usage: samtools index [-bc] [-m INT] <in.bam> [out.index]
      -@ INT  sets the number of threads [none]
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str THREADS = number of threads available
    param: str suffix = distinguishes files if more than one reference was
           used for read mapping (accepted but not used by this function)
    param: str bam_file = name of the input BAM file
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: index files
    '''
    print('\nrunning: Samtools index')

    isolate_path = BASE_PATH + TEMP_dir + work_dir
    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + isolate_path + ':' + Samtools_WorkingDir + '"',
        '-i', Samtools_image, 'samtools index',
        '-@', THREADS,
        bam_file,
    ])

    return_code, std_out, std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    with open(isolate_path + 'log.txt', 'a') as log:
        print('\nsamtools index:\n', std_out, file=log)

    return return_code, std_out, std_err
def run_samtools_sort(work_dir, THREADS, suffix):
    '''
    Runs 'samtools sort' in a Docker container to sort the SAM alignment
    file and write the result as a BAM file.
    Usage: samtools sort [options...] [in.bam]
      -o FILE        write final output to FILE rather than standard output
      --threads INT  number of additional threads to use [0]
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str THREADS = number of threads available
    param: str suffix = distinguishes files if more than one reference was
           used for read mapping
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: 'bwa_mapped' + suffix + '.bam' file, where suffix = '_1', '_2',
            ...
    '''
    print('\nrunning: Samtools sort')

    isolate_path = BASE_PATH + TEMP_dir + work_dir
    mapped_stem = 'bwa_mapped' + suffix
    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + isolate_path + ':' + Samtools_WorkingDir + '"',
        '-i', Samtools_image, 'samtools sort',
        '--threads', THREADS,
        '-o', mapped_stem + '.bam',
        mapped_stem + '.sam',
    ])

    return_code, std_out, std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    with open(isolate_path + 'log.txt', 'a') as log:
        print('\nsamtools sort:\n', std_out, file=log)

    return return_code, std_out, std_err
def run_bcftools_view(THREADS, work_dir, suffix):
    '''
    Runs 'bcftools view' in a Docker container to convert the BCF file
    produced by mpileup into a VCF file.
    param: str THREADS = number of threads available
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str suffix = distinguishes files if more than one reference was
           used for read mapping
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: 'mpileup' + suffix + '.vcf' file
    '''
    print('\nrunning: BCFtools view')

    isolate_path = BASE_PATH + TEMP_dir + work_dir
    pileup_stem = 'mpileup' + suffix
    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + isolate_path + ':' + BCFtools_WorkingDir + '"',
        '-i', BCFtools_image, 'bcftools view',
        '--threads', THREADS,
        '-o', pileup_stem + '.vcf',
        pileup_stem + '.bcf',
    ])

    return_code, std_out, std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    with open(isolate_path + 'log.txt', 'a') as log:
        print('\nbcftools view:\n', std_out, file=log)

    return return_code, std_out, std_err
def run_bwa_mem(work_dir, THREADS, SS_dir, ref_fa_file, suffix):
    '''
    Runs 'bwa mem' in a Docker container to map the paired reads of an
    isolate to a reference genome.
    Usage: bwa mem [options] <idxbase> <in1.fq> [in2.fq]
      -t INT   number of threads [1]
      -o FILE  sam file to output results to [stdout]
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str THREADS = number of threads available
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    param: str suffix = distinguishes files if more than one reference was
           used for read mapping
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: 'bwa_mapped' + suffix + '.sam' file, where suffix = '_1', '_2',
            ...
    '''
    print('\nrunning: BWA mem')

    # BASE_PATH is mounted at BWA_WorkingDir; reference and read paths are
    # given without the BASE_PATH prefix
    isolate_dir = TEMP_dir + work_dir
    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + BASE_PATH + ':' + BWA_WorkingDir + '"',
        '-i', BWA_image, 'bwa mem',
        '-t', THREADS,
        REF_dir + SS_dir + ref_fa_file,
        isolate_dir + 'paired_reads_1.fq',
        isolate_dir + 'paired_reads_2.fq',
        '-o', isolate_dir + 'bwa_mapped' + suffix + '.sam',
    ])

    return_code, std_out, std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    with open(BASE_PATH + isolate_dir + 'log.txt', 'a') as log:
        print('\nBWA MEM:\n', std_out, file=log)

    return return_code, std_out, std_err
def run_Kraken(work_dir):
    '''
    Runs Kraken with the Minikraken database in a Docker container to
    classify the contigs in 'SPAdes_contigs.fa' by species. The raw output
    holds classification numbers and kmer counts, which still need to be
    translated into human-readable form.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: 'kraken_out.txt' file
    '''
    print('\nrunning: Kraken')

    # the database that ships with the docker image
    kraken_db = '/kraken-database/minikraken_20171013_4GB'

    results_path = BASE_PATH + OUTPUT_dir + work_dir
    log_path = results_path + 'log.txt'

    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + results_path + ':' + Kraken_WorkingDir + '"',
        '-i', Kraken_image, 'kraken',
        '--preload --db', kraken_db,
        'SPAdes_contigs.fa',
        '--output kraken_out.txt',
    ])

    # record the command itself before running it
    with open(log_path, 'a') as log:
        print('\nKraken:\n', docker_cmd, file=log)

    # the first parameter is '' instead of 'work_dir' because the log.txt
    # file has been moved from /temp/ to /output/;
    # see toolshed.run_subprocess()
    return_code, std_out, std_err = toolshed.run_subprocess(
        '', docker_cmd, True)

    with open(log_path, 'a') as log:
        print('\nKraken:\n', std_out, file=log)

    return return_code, std_out, std_err
def run_nw_display(seed, work_dir):
    '''
    Runs the Newick Utilities docker image on 'parsnp.tree' to generate an
    ASCII-based tree for the report.
    param: str seed = combination of sp_abbr and reference, e.g.:
           'Lpn/F4468/'
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: command and StdOut are appended to the log.txt file in the
            isolate's output folder
    '''
    print('\nrunning: NW_display')

    log_path = BASE_PATH + OUTPUT_dir + work_dir + 'log.txt'

    # Note that evolbioinfo/newick_utilities:v1.6 uses "WorkingDir": ""
    # Note: no need to call up 'nw_display'
    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + BASE_PATH + TEMP_dir + 'parsnp/' + seed
        + ':' + NU_WorkingDir + '"',
        '-i', NU_image,
        'parsnp.tree',
    ])

    with open(log_path, 'a') as log:
        print('\nNewick display:\n', docker_cmd, file=log)

    return_code, std_out, std_err = toolshed.run_subprocess(
        '', docker_cmd, True)

    with open(log_path, 'a') as log:
        print('\nNewick display:\n', std_out, file=log)

    return return_code, std_out, std_err
def run_freebayes(work_dir, ref_fa_file, SS_dir, suffix):
    '''
    Runs FreeBayes in a Docker container to call SNPs, INDELs, and complex
    mutations.
      -p INT   ploidy of the organism (fixed to 1 here)
      -f FILE  use FILE as the reference sequence for analysis. An index
               file (FILE.fai) will be created if none exists.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str suffix = distinguishes files if more than one reference was
           used for read mapping
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: a VCF-file, 'freebayes_all.vcf'; note that the file name does
            not include the suffix
    '''
    print('\nrunning: Freebayes')

    isolate_dir = TEMP_dir + work_dir

    # '--rm=True' removes the container once FreeBayes has finished, as in
    # the other wrappers; without it every call leaves a stopped container
    # behind.
    # NOTE(review): the original command did not pass
    # '-u $(id -u):$(id -g)' either; that is left unchanged here. The VCF is
    # written through the '>' redirect to a BASE_PATH-prefixed path, not by
    # a path inside the container - confirm whether '-u' is wanted.
    command = ' '.join([
        'docker run --rm=True',
        '-v "' + BASE_PATH + ':' + Freebayes_WorkingDir + '"',
        '-i', Freebayes_image, 'freebayes',
        '-f', REF_dir + SS_dir + ref_fa_file,
        '-p 1',
        isolate_dir + 'marked_duplicates' + suffix + '.bam',
        '>', BASE_PATH + isolate_dir + 'freebayes_all.vcf',
    ])

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(
        work_dir, command, True)

    with open(BASE_PATH + isolate_dir + 'log.txt', 'a') as log_file:
        print('\nFreebayes:\n', StdOut, file=log_file)

    return ReturnCode, StdOut, StdErr
def run_mash_dist(work_dir, ref_msh_file, query_msh_file, suffix):
    """
    Runs 'mash dist' in a Docker container to get the distances between the
    references and the query; the result is redirected into a '.tab' file.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str ref_msh_file = reference sketch file
    param: str query_msh_file = query sketch file
    param: str suffix = 'FAvNCBI' or 'RvSp'
    return: None
    output: 'distances_' + suffix + '.tab' file, suffix = 'FAvNCBI' or
            'RvSp'
    """
    print('\nrunning: Mash dist')

    isolate_path = BASE_PATH + TEMP_dir + work_dir
    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + BASE_PATH + ':' + Mash_WorkingDir + '"',
        '-i', Mash_image, 'mash dist',
        ref_msh_file, query_msh_file,
        '>', isolate_path + 'distances_' + suffix + '.tab',
    ])

    print('\n## Running:\n', docker_cmd, '\n')

    # execute 'mash dist' and write results to file; only StdOut is needed
    # afterwards, but the call still has to yield exactly three values
    _return_code, std_out, _std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    with open(isolate_path + 'log.txt', 'a') as log:
        print('\nMash distance:\n', std_out, file=log)
def run_quast(work_dir, SS_dir, ref_fa_file, check_seq_file):
    '''
    Runs Quast, a quality assessment tool for assemblies, in a Docker
    container.
      -o DIR   output directory
      -R FILE  reference genome file
      --fast   passed as-is to quast.py
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    param: str check_seq_file = name of a sequence file to be QC'd
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: Quast generates a number of files that will be deposited in the
            'temp/' + work_dir + 'quast/' folder
    '''
    print('\nrunning: Quast')

    command = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + BASE_PATH + ':' + Quast_WorkingDir + '"',
        '-i', Quast_image, 'quast.py',
        # NOTE(review): 'temp/' is hard-coded here while the input path
        # below uses TEMP_dir - confirm that both always agree
        '-o temp/' + work_dir + 'quast/',
        '-R', REF_dir + SS_dir + ref_fa_file,
        '--fast',
        TEMP_dir + work_dir + check_seq_file,
    ])

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(
        work_dir, command, True)

    with open(BASE_PATH + TEMP_dir + work_dir + 'log.txt', 'a') as log_file:
        # the entry used to be labelled 'BWA index' (copy-paste error),
        # which made Quast output indistinguishable from bwa index output
        print('\nQuast:\n', StdOut, file=log_file)

    return ReturnCode, StdOut, StdErr
def run_Kraken_translate(work_dir):
    '''
    Runs 'kraken-translate' in a Docker container to convert the initial
    Kraken output ('kraken_out.txt') into human-readable form.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: 'kraken_res.txt' file
    '''
    print('\nrunning: Kraken-translate')

    # the database that ships with the docker image
    kraken_db = '/kraken-database/minikraken_20171013_4GB'

    results_path = BASE_PATH + OUTPUT_dir + work_dir
    log_path = results_path + 'log.txt'

    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + results_path + ':' + Kraken_WorkingDir + '"',
        '-i', Kraken_image, 'kraken-translate',
        '--db', kraken_db,
        'kraken_out.txt',
        '>', results_path + 'kraken_res.txt',
    ])

    # record the command itself before running it
    with open(log_path, 'a') as log:
        print('\nKraken-translate:\n', docker_cmd, file=log)

    return_code, std_out, std_err = toolshed.run_subprocess(
        '', docker_cmd, True)

    with open(log_path, 'a') as log:
        print('\nKraken-translate:\n', std_out, file=log)

    return return_code, std_out, std_err
def run_samtools_faidx(work_dir, SS_dir, ref_fa_file):
    '''
    Runs 'samtools faidx' in a Docker container to generate a FAI index
    file for the reference, as required by FreeBayes.
    Usage: samtools faidx <file.fa|file.fa.gz> [<reg> [...]]
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: index files
    '''
    print('\nrunning: Samtools faidx')

    # the species-specific reference folder is mounted, hence the bare
    # FASTA file name
    reference_path = BASE_PATH + REF_dir + SS_dir
    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + reference_path + ':' + Samtools_WorkingDir + '"',
        '-i', Samtools_image, 'samtools faidx',
        ref_fa_file,
    ])

    return_code, std_out, std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    log_path = BASE_PATH + TEMP_dir + work_dir + 'log.txt'
    with open(log_path, 'a') as log:
        print('\nsamtools faidx:\n', std_out, file=log)

    return return_code, std_out, std_err
def run_fastqc(work_dir, proc_reads):
    """
    Runs FastQC in a Docker container on a (processed) read file.
      -d DIR  directory for temporary files when generating report images
              (default: '?')
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str proc_reads = name of file with forward or reverse reads
           processed by Trimmomatic
    return: None
    output: FastQC files 'read_file_fastqc.html' and 'read_file_fastqc.zip'
    """
    print('\nrunning: FastQC')

    isolate_path = BASE_PATH + TEMP_dir + work_dir
    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + isolate_path + ':' + FastQC_WorkingDir + '"',
        '-i', FastQC_image, 'fastqc',
        '-d temp/',
        proc_reads,
    ])

    # only StdOut is needed afterwards, but the call still has to yield
    # exactly three values
    _return_code, std_out, _std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    with open(isolate_path + 'log.txt', 'a') as log:
        print('\nFastQC:\n', std_out, file=log)
def run_bwa_index(work_dir, SS_dir, ref_fa_file):
    '''
    Runs 'bwa index' in a Docker container to create an index ('.amb',
    '.ann', '.bwt', '.pac', '.sa') for a FASTA file.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str SS_dir = species-specific directory, e.g.: 'Lpn/'
    param: str ref_fa_file = name of a reference strain's FASTA file
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: index files
    '''
    print('\nrunning: BWA index')

    # the species-specific reference folder is mounted, hence the bare
    # FASTA file name
    reference_path = BASE_PATH + REF_dir + SS_dir
    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + reference_path + ':' + BWA_WorkingDir + '"',
        '-i', BWA_image, 'bwa index',
        ref_fa_file,
    ])

    return_code, std_out, std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    log_path = BASE_PATH + TEMP_dir + work_dir + 'log.txt'
    with open(log_path, 'a') as log:
        print('\nBWA index:\n', std_out, file=log)

    return return_code, std_out, std_err
def run_samtools_depth(work_dir, THREADS, suffix):
    '''
    Runs 'samtools depth' in a Docker container on the duplicate-marked BAM
    file; the result is redirected into a text file.
      -aa  output absolutely all positions
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str THREADS = number of threads available (accepted but not
           used by this function)
    param: str suffix = distinguishes files if more than one reference was
           used for read mapping
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: 'samtools_depth' + suffix + '.txt' file in the 'temp/'
            subfolder of the isolate's folder
    '''
    print('\nrunning: Samtools depth')

    isolate_path = BASE_PATH + TEMP_dir + work_dir
    depth_file = isolate_path + 'temp/samtools_depth' + suffix + '.txt'
    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + isolate_path + ':' + Samtools_WorkingDir + '"',
        '-i', Samtools_image, 'samtools depth',
        '-aa',
        'marked_duplicates' + suffix + '.bam',
        '>', depth_file,
    ])

    return_code, std_out, std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    with open(isolate_path + 'log.txt', 'a') as log:
        print('\nsamtools depth:\n', std_out, file=log)

    return return_code, std_out, std_err
def run_parsnp(THREADS, work_dir, seed, isolate, include_all):
    '''
    Runs Parsnp in a Docker container for core-genome alignment and
    analysis. Parsnp accepts single- and multi-fasta files containing
    'ACGTN' or 'acgtn' or a mix.
      -r REF  = reference genome for Parsnp: either the isolate for new
                references or the mapping reference
      -o DIR  = output directory; default [./P_CURRDATE_CURRTIME]
      -c      = forces inclusion of all genomes in a given directory; leave
                out to exclude strains that are too distant, which can
                cause Parsnp to fail
      -d DIR  = directory containing genomes/contigs/scaffolds;
                Note: no '/' needed after DIR, added automatically
      -v FLAG = verbose output? (default = NO)
      -p INT  = number of threads to use? (default = 1)
    param: str THREADS = number of threads available
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str seed = combination of sp_abbr and reference, e.g.:
           'Lpn/F4468/'
    param: str isolate = isolate name, e.g.: 'IDR001234'
    param: bool include_all = if True, forces the inclusion of all genomes
           in a given directory, might lead to a crash of Parsnp if a
           genome is too distant; if False, uses only similar genomes,
           which might exclude genomes of interest
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: Parsnp results in TEMP_dir + 'parsnp/' + seed
    '''
    print('\nrunning: Parsnp')

    # select a reference genome, e.g. 'ref1.fa' for 'Aba/ref1/', or
    # 'iso1.fa' if 'iso1' is a new reference
    reference_stem = seed.split('/')[-2]
    if 'All_refs' in seed:
        reference_stem = isolate
    parsnp_reference = reference_stem + '.fa'

    # force inclusion of all genomes; the placeholder keeps its own space
    force_all = '-c ' if include_all else ' '

    genome_dir = GENOMES_dir + seed
    leading_args = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + BASE_PATH + ':' + Parsnp_WorkingDir + '"',
        '-i', Parsnp_image, 'parsnp',
        '-d', genome_dir,
        '-r', genome_dir + parsnp_reference,
        '-o', TEMP_dir + 'parsnp/' + seed,
    ])
    docker_cmd = leading_args + ' ' + force_all + '-v -p ' + THREADS

    log_path = BASE_PATH + OUTPUT_dir + work_dir + 'log.txt'

    # record the command itself before running it
    with open(log_path, 'a') as log:
        print('\nParsnp:\n', docker_cmd, file=log)

    return_code, std_out, std_err = toolshed.run_subprocess(
        '', docker_cmd, True)

    with open(log_path, 'a') as log:
        print('\nParsnp:\n', std_out, file=log)

    return return_code, std_out, std_err
def run_spades(work_dir, THREADS, MEMORY, max_read_len):
    '''
    Runs the de novo genome assembler SPAdes in a Docker container on the
    paired reads of an isolate.
    usage: spades.py [options] -o <out_dir>
      -o <out_dir>      directory to store all the resulting files
                        (required)
      -1 <filename>     file with forward paired-end reads
      -2 <filename>     file with reverse paired-end reads
      -t <int>          number of threads. [default: 16]
      -m <int>          RAM limit for SPAdes in Gb (terminates if
                        exceeded). [default: 250]
      -k <int,int,...>  comma-separated list of k-mer sizes
      --careful         tries to reduce number of mismatches and short
                        indels
      --cov-cutoff      read coverage cutoff value. Must be a positive
                        float value, or 'auto', or 'off'. When 'auto':
                        SPAdes automatically computes coverage threshold
                        using conservative strategy
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str THREADS = number of threads available; currently not passed
           to SPAdes, to use it add: + '-t ' + THREADS + ' '
    param: str MEMORY = available memory; currently not passed to SPAdes,
           to use it add: + '-m ' + MEMORY + ' '
    param: int max_read_len = length of largest read, important for
           selecting the size of kmers
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: folder 'SPAdes' with results
    '''
    print('\nrunning: SPAdes\n')

    # reads longer than 175 get the additional, larger kmers; both options
    # carry their own leading space
    k_param = (' -k 21,33,55,77,99,127' if max_read_len > 175
               else ' -k 21,33,55,77')

    isolate_path = BASE_PATH + TEMP_dir + work_dir
    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + isolate_path + ':' + SPAdes_WorkingDir + '"',
        '-i', SPAdes_image, 'spades.py',
        '-1 paired_reads_1.fq',
        '-2 paired_reads_2.fq',
        k_param,
        '--careful --cov-cutoff auto',
        '-o SPAdes',
    ])

    return_code, std_out, std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    # flatten tabs and newlines, then keep only the first 700 characters
    flattened = std_out.replace('\t', ' ').replace('\n', ' ')
    with open(isolate_path + 'log.txt', 'a') as log:
        print('\nSPAdes, all reads (abbreviated):\n', flattened[:700],
              file=log)

    return return_code, std_out, std_err
def run_mash_sketch(active_folder, work_dir, out_file, in_data,
                    in_data_type=''):
    """
    Runs 'mash sketch' in a Docker container on one or more FASTQ or FASTA
    files; '.msh' will be added automatically to out_file.
    param: str active_folder = path to one of three possible folders
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str out_file = name of the mash sketch output file
    param: list in_data = list of one or more FASTA file(s)
    param: str in_data_type = determines which sketch options to use:
           'lo_genomes', 'comb_reads', or anything else for the defaults
    return: None
    output: a MSH sketch file for the input sequences
    """
    print('\nrunning: Mash sketch')

    # parameters for running Mash sketch: few, short kmers for reads;
    # more, long kmers for genomes
    #   -k = kmer size
    #   -s = sketch size, number of min-hashes
    #   -m = minimum copies of ea. kmer required to pass reads noise filter
    # no parameters => default settings: -k 21, -s 1000
    sketch_options = {
        'lo_genomes': '-k 16 -s 400 ',
        'comb_reads': '-m 2 -k 16 -s 400 ',
    }
    param = sketch_options.get(in_data_type, '')

    # one or more files that need to be sketched; multiple files are
    # separated by a ' ', e.g.: 'mash sketch -o outfile Lpn.fa Tmi.fa Eco.fa'
    lo_files = ' '.join(in_data)

    leading_args = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + active_folder + ':' + Mash_WorkingDir + '"',
        '-i', Mash_image, 'mash sketch',
    ])
    # 'param' is either empty or already ends with a space
    docker_cmd = leading_args + ' ' + param + '-o ' + out_file + ' ' + lo_files

    # only StdOut is needed afterwards, but the call still has to yield
    # exactly three values
    _return_code, std_out, _std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    log_path = BASE_PATH + TEMP_dir + work_dir + 'log.txt'
    with open(log_path, 'a') as log:
        print('\nMash sketch:\n', std_out, file=log)
def run_samtools_flagstat(work_dir, THREADS, suffix, MAPPED_THRESHOLD):
    '''
    Runs 'samtools flagstat' in a Docker container on the duplicate-marked
    BAM file and extracts the percentage of mapped reads from its output.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str THREADS = number of threads available
    param: str suffix = distinguishes files if more than one reference was
           used for read mapping
    param: int MAPPED_THRESHOLD = min percentage of mapped reads
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    return: float percent_mapped = percentage of mapped reads
    output: text added to the log.txt and report.txt files
    note: raises IndexError or ValueError if StdOut does not contain a
          'mapped (<number>%' entry
    '''
    print('\nrunning: Samtools flagstat')

    isolate_path = BASE_PATH + TEMP_dir + work_dir
    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + isolate_path + ':' + Samtools_WorkingDir + '"',
        '-i', Samtools_image, 'samtools flagstat',
        '--threads', THREADS,
        'marked_duplicates' + suffix + '.bam',
    ])

    return_code, std_out, std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    # the number sits between the first 'mapped (' and the following '%'
    after_marker = std_out.split('mapped (')[1]
    percent_mapped = float(after_marker.split('%')[0])
    print('percent_mapped:', percent_mapped)

    with open(isolate_path + 'log.txt', 'a') as log:
        print('\nsamtools flagstat:\n', std_out, file=log)

    with open(isolate_path + 'report.txt', 'a') as report:
        print('\nAlignment QC (Samtools flagstat):', file=report)
        print(std_out.replace('stdout:\n', ''), file=report)
        print('\nPercentage of mapped reads:', percent_mapped, file=report)
        # a value equal to the threshold is flagged as well
        if percent_mapped <= MAPPED_THRESHOLD:
            note = ('\nNOTE:\nPercentage of mapped reads below threshold.\n'
                    'Adding the isolate to the list of candidate reference '
                    'genomes.')
            print(note, file=report)

    return return_code, std_out, std_err, percent_mapped
def run_samtools_idxstats(work_dir, THREADS, suffix):
    '''
    Runs 'samtools idxstats' in a Docker container on the duplicate-marked
    BAM file and adds the statistics to the report.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str THREADS = number of threads available
    param: str suffix = distinguishes files if more than one reference was
           used for read mapping
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: text added to the log.txt and report.txt files
    '''
    print('\nrunning: Samtools idxstats')

    isolate_path = BASE_PATH + TEMP_dir + work_dir
    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + isolate_path + ':' + Samtools_WorkingDir + '"',
        '-i', Samtools_image, 'samtools idxstats',
        '--threads', THREADS,
        'marked_duplicates' + suffix + '.bam',
    ])

    return_code, std_out, std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    # example of the tab-separated output, one line per reference sequence
    # (name, length, mapped, unmapped), with '*' as the last entry:
    #   NZ_CP006644.1   6205897   188455   13709
    #   NZ_CP011450.1   374401    6147     317
    #   *               0         0        2900154
    with open(isolate_path + 'log.txt', 'a') as log:
        print('\nsamtools idxstats:\n', std_out, file=log)

    with open(isolate_path + 'report.txt', 'a') as report:
        print('\n\nAlignment QC (Samtools idxstats):', file=report)
        print('ref_fa_file\tlen\tmapped\tunmapped', file=report)
        print(std_out.replace('stdout:\n', ''), file=report)

    return return_code, std_out, std_err
def run_vcffilter(work_dir, DP_max):
    '''
    Runs vcffilter in a Docker container to remove low quality SNPs from
    'freebayes_all.vcf'.
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: int DP_max = maximum total read depth at that SNP locus
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: a VCF-file, 'freebayes.vcf'
    '''
    print('\nrunning: vcffilter')

    # filtering thresholds to sort out low probability SNPs
    QUAL_threshold = 20    # min SNP quality (phred scale)
    DP_min = 10            # min total read depth at the locus
    QA_threshold = 20      # min alternate allele quality sum (phred scale)
    AO_DP_ratio = 0.899    # min percentage of reads supporting the SNP,
                           # where AO is the count of full observations of
                           # this alternate haplotype

    # all clauses must hold for a record to be kept
    filter_clauses = [
        'QUAL > ' + str(QUAL_threshold),
        'DP > ' + str(DP_min),
        'DP < ' + str(DP_max),
        'QA > ' + str(QA_threshold),
        # hard filter implemented as per Erik Garrison:
        # remove alleles that are only seen on one strand
        'SAF > 0',
        'SAR > 0',
        'AO > ' + str(AO_DP_ratio) + ' * DP',
    ]

    isolate_dir = TEMP_dir + work_dir

    # '--rm=True' removes the container once vcffilter has finished, as in
    # the other wrappers; without it every call leaves a stopped container
    # behind.
    # NOTE(review): the original command did not pass
    # '-u $(id -u):$(id -g)' either; that is left unchanged here.
    command = ' '.join([
        'docker run --rm=True',
        '-v "' + BASE_PATH + ':' + VCFlib_WorkingDir + '"',
        '-i', VCFlib_image, 'vcffilter',
        '-f "' + ' & '.join(filter_clauses) + '"',
        isolate_dir + 'freebayes_all.vcf',
        '>', BASE_PATH + isolate_dir + 'freebayes.vcf',
    ])

    ReturnCode, StdOut, StdErr = toolshed.run_subprocess(
        work_dir, command, True)

    with open(BASE_PATH + isolate_dir + 'log.txt', 'a') as log_file:
        # the entry used to be labelled 'Freebayes' (copy-paste error),
        # which made it indistinguishable from the FreeBayes entry
        print('\nvcffilter:\n', StdOut, file=log_file)

    return ReturnCode, StdOut, StdErr
def run_nw_display_svg(seed, work_dir):
    '''
    Runs the Newick Utilities docker image on 'parsnp.tree' to generate a
    prettier tree in SVG format; 'parsnp_ornament.map' is passed along to
    change the looks of the tree.
      -s     = produces a pretty Scalable Vector Graphic (.svg) file for
               viewing in a web browser
      -w INT = width of the figure in pixels (when in -s mode, columns
               else)
    param: str seed = combination of sp_abbr and reference, e.g.:
           'Lpn/F4468/'
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: tree in SVG format, 'parsnp_tree.svg'
    '''
    print('\nrunning: NW_display')

    tree_path = BASE_PATH + TEMP_dir + 'parsnp/' + seed
    log_path = BASE_PATH + OUTPUT_dir + work_dir + 'log.txt'

    # Note that evolbioinfo/newick_utilities:v1.6 uses "WorkingDir": ""
    # Note: no need to call up 'nw_display'
    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + tree_path + ':' + NU_WorkingDir + '"',
        '-i', NU_image,
        '-s -w 700 -b opacity:0',
        '-o parsnp_ornament.map',
        'parsnp.tree',
    ])

    with open(log_path, 'a') as log:
        print('\nNewick display:\n', docker_cmd, file=log)

    return_code, std_out, std_err = toolshed.run_subprocess(
        '', docker_cmd, True)

    # StdOut carries the SVG data; drop the 'stdout:\n' label before
    # writing it to file
    svg_text = std_out.replace('stdout:\n', '')
    with open(tree_path + 'parsnp_tree.svg', 'w') as svg_file:
        print(svg_text, file=svg_file)

    # StdErr, not StdOut, goes to the log here
    with open(log_path, 'a') as log:
        print('\nNewick display:\n', std_err, file=log)

    return return_code, std_out, std_err
def run_trimmomatic(work_dir, F_READS, R_READS, THREADS, MinLen='100'):
    '''
    Runs Trimmomatic in a Docker container to trim Illumina reads.
    PE: paired ends = two input, four output files
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str F_READS = file with forward paired reads (accepted but not
           used; the input file names are fixed, see below)
    param: str R_READS = file with reverse paired reads (accepted but not
           used; the input file names are fixed, see below)
    param: str THREADS = number of threads available
    param: str MinLen = minimum read length, default here '100'
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    output: four read files: paired/unpaired and forward/reverse
    '''
    print('\nrunning: Trimmomatic\n')

    input_files = [
        'raw_reads_noG_1.fq',
        'raw_reads_noG_2.fq',
    ]
    # paired reads stay in the isolate folder, unpaired reads go to 'temp/'
    output_files = [
        'paired_reads_1.fq',
        'temp/unpaired_reads_1.fq',
        'paired_reads_2.fq',
        'temp/unpaired_reads_2.fq',
    ]

    leading_args = [
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + BASE_PATH + TEMP_dir + work_dir
        + ':' + Trimmomatic_WorkingDir + '"',
        '-i', Trimmomatic_image, 'trimmomatic PE',
    ]
    trailing_args = [
        '-threads', THREADS,
        '-trimlog temp/trimmomatic_log.txt',
        'ILLUMINACLIP:NexteraPE-PE.fa:2:30:10',
        'LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20',
        'MINLEN:' + MinLen,
    ]
    docker_cmd = ' '.join(
        leading_args + input_files + output_files + trailing_args)

    return_code, std_out, std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    return return_code, std_out, std_err
def run_mash_info(active_folder, work_dir, out_file):
    """
    Runs 'mash info' in a Docker container and writes the data present in
    the '.msh' file to the log.txt file.
    param: str active_folder = path to one of three possible folders
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str out_file = name of the sketch file to be inspected
    return: None
    output: text added to log.txt file
    """
    print('\nrunning: Mash info')

    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + active_folder + ':' + Mash_WorkingDir + '"',
        '-i', Mash_image, 'mash info',
        out_file,
    ])

    # look up the genomes present in the .msh file and print to the log
    # file; only StdOut is needed, but the call still has to yield exactly
    # three values
    _return_code, std_out, _std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    log_path = BASE_PATH + TEMP_dir + work_dir + 'log.txt'
    with open(log_path, 'a') as log:
        print('\nMash sketch results:\n', std_out, file=log)
def run_qualimap(work_dir, suffix):
    '''
    Runs 'qualimap bamqc' in a Docker container on the duplicate-marked BAM
    file of an isolate.
      -bam FILE  input BAM file
    param: str work_dir = isolate-specific folder, e.g.: 'WH200812_001259/'
    param: str suffix = distinguishes files if more than one reference was
           used for read mapping
    return: ReturnCode, StdOut, StdErr (from toolshed.run_subprocess())
    '''
    print('\nrunning: Qualimap')

    isolate_path = BASE_PATH + TEMP_dir + work_dir
    docker_cmd = ' '.join([
        'docker run --rm=True -u $(id -u):$(id -g)',
        '-v "' + isolate_path + ':' + Qualimap_WorkingDir + '"',
        '-i', Qualimap_image, 'qualimap bamqc',
        '-bam marked_duplicates' + suffix + '.bam',
    ])

    return_code, std_out, std_err = toolshed.run_subprocess(
        work_dir, docker_cmd, True)

    with open(isolate_path + 'log.txt', 'a') as log:
        print('\nqualimap:\n', std_out, file=log)

    return return_code, std_out, std_err