def run_quast(contigs, reads, sample, threads, quast_opts):
    # The current QUAST Bioconda recipe has issues with bedtools,
    # so the read functionality is not working.
    # The command is preserved here for future use:
    # cmd = f"quast --threads {threads} -o {sample}/quast --pe1 {reads[0]} --pe2 {reads[1]} {quast_opts} {contigs}"
    cmd = f"quast --threads {threads} -o {sample}/quast --glimmer {quast_opts} {contigs}"
    shell(cmd)
def mergeBenchmarks(samples, rules):
    ## Keep merged and unmerged benchmarks in separate dirs
    snakemake.shell('mkdir -p benchmark/merged benchmark/unmerged 2> /dev/null')
    ## If there's a single sample, convert it to a list to avoid looping by char
    if isinstance(samples, str):
        samples = [samples]
    for rule in rules:
        f = 'benchmark/merged/' + rule + '.tab'
        ## Don't add the header if we don't have to
        if not os.path.isfile(f):
            snakemake.shell(
                'echo -e "s\th:m:s\tmax_rss\tmax_vms\tmax_uss\tmax_pss\tio_in\tio_out\tmean_load" >> {f}'
            )
        for sample in samples:
            snakemake.shell(
                'tail -n1 benchmark/{sample}_*_{rule}.tab >> {f} 2> /dev/null || true'
            )
            snakemake.shell(
                'mv benchmark/{sample}_*_{rule}.tab benchmark/unmerged 2> /dev/null || true'
            )
    ## Move non-sample-dependent rule benchmarks to merged
    snakemake.shell('mv benchmark/*tab benchmark/merged 2> /dev/null || true')
def main(args):
    final_output_ext = args.output.split(".")[-1]
    if os.path.getsize(args.draft) == 0:
        # Create empty polished file
        shell("touch {}".format(args.output))
    else:
        final_output = args.output.replace('{}x'.format(args.iterations), '{nrepeat}x')
        for repeat in range(1, args.iterations + 1):
            polished_contigs = final_output.format(nrepeat=repeat)
            previous_polished_draft = final_output.format(nrepeat=repeat - 1)
            # final_output_ext carries no leading dot, so the replacement
            # must not add one (otherwise the name gets a double dot).
            alignment = polished_contigs.replace(final_output_ext, 'aln.sam')
            if repeat == 1:
                previous_polished_draft = args.draft
            shell('minimap2 -t {args.threads} -ax map-ont '
                  '{previous_polished_draft} {args.raw_reads} > {alignment}; '
                  'racon '
                  '--include-unpolished '
                  '--quality-threshold={args.min_q} '
                  '-t {args.threads} '
                  '{args.raw_reads} '
                  '{alignment} '
                  '{previous_polished_draft} > {polished_contigs}')
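# A minimal sketch of the filename templating used above (values are
# hypothetical, not from the source): with args.output = "asm.3x.fasta" and
# args.iterations = 3, the literal "3x" is rewritten to the "{nrepeat}x"
# placeholder, so each round writes its own file and the final iteration
# lands exactly on the requested output name.
final_output = "asm.3x.fasta".replace("3x", "{nrepeat}x")
assert [final_output.format(nrepeat=r) for r in (1, 2, 3)] == \
    ["asm.1x.fasta", "asm.2x.fasta", "asm.3x.fasta"]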
def align_to_genome_fasta_pe(fasta1, fasta2, genome_path, out_sam, threads=1, verbose=False, additional_flags=''):
    if verbose:
        command = "bwa mem -t {threads} {genome_path} {fastq1} {fastq2} {additional_flags} > {out_sam}; ".format(
            genome_path=genome_path, fastq1=fasta1, fastq2=fasta2,
            out_sam=out_sam, threads=threads, additional_flags=additional_flags)
    else:
        command = "bwa mem -t {threads} {genome_path} {fastq1} {fastq2} {additional_flags} 2> /dev/null 1> {out_sam}; ".format(
            genome_path=genome_path, fastq1=fasta1, fastq2=fasta2,
            out_sam=out_sam, threads=threads, additional_flags=additional_flags)
    logger.debug("Executing command: %s" % command)
    shell(command)
    return isfile(out_sam)
def render_rmarkdown(input_file, output_file, root_dir, params=None):
    """
    Snakemake wrapper function to render an Rmarkdown document the way I want it to.

    In particular, this function uses bookdown instead of rmarkdown to enable
    figure/table enumeration and allows passing parameters to a parametrized report.

    Args:
        input_file: path to input (Rmd) file
        output_file: path to output (html) file
        root_dir: knitr working directory (python/R code will be executed in this directory)
        params: dictionary that will be passed to the `params` arg of `rmarkdown::render`.
    """
    param_str = ""
    if params is not None:
        param_str = ", ".join(
            "{}={}".format(key, _literal_to_r_str(value))
            for key, value in params.items()
        )
    cmd = (
        "MKL_THREADING_LAYER=GNU "  # necessary to circumvent incompatibilities of Intel MKL with libgomp
        "Rscript -e \"rmarkdown::render('{input_file}', "
        " output_file='{output_file}', "
        " output_format=bookdown::html_document2(), "
        " knit_root_dir='{root_dir}', "
        " params = list({params}))\"").format(
            input_file=os.path.abspath(input_file),
            output_file=os.path.abspath(output_file),
            root_dir=os.path.abspath(root_dir),
            params=param_str)
    shell(cmd)
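# Hypothetical usage sketch (file names and params are placeholders, not from
# the source): the params dict is converted via _literal_to_r_str() and passed
# through to rmarkdown::render() as params=list(sample='S1', alpha=0.05).
render_rmarkdown(
    input_file="report.Rmd",
    output_file="report.html",
    root_dir=".",
    params={"sample": "S1", "alpha": 0.05},
)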
def install(cls): fn1=cls.download_file("http://www.pegase-biosciences.com/wp-content/uploads/2013/04/CuReSim1.2.zip","curesim.zip") fn2=cls.download_file("http://www.pegase-biosciences.com/wp-content/uploads/2013/04/CuReSimEval1.1.zip","curesim_eval.zip") dir=os.path.dirname(fn1) snakemake.shell('(cd "{dir}" && unzip -j -o curesim.zip && unzip -j -o curesim_eval.zip) > /dev/null'.format(dir=dir)) cls.install_file("CuReSim.jar",CURESIM) cls.install_file("CuReSimEval.jar",CURESIM_EVAL)
def run_einverted(fasta, gap=12, threshold=15, match=3, mismatch=-4,
                  outfile='einverted.tmp.out', outseq='einverted.tmp.outseq'):
    command = 'einverted -sequence {fasta} -gap {gap} -threshold {threshold} -match {match} ' \
              '-mismatch {mismatch} -outfile {outfile} -outseq {outseq} -auto Y -warning N'.format(
                  fasta=fasta, gap=gap, threshold=threshold, match=match,
                  mismatch=mismatch, outfile=outfile, outseq=outseq)
    # print(command)
    shell(command)
def compute_syrah_to_s3(self, sra_id):
    from boto.s3.connection import S3Connection
    from boto.s3.key import Key
    from snakemake import shell

    conn = S3Connection()
    bucket = conn.get_bucket("soursigs-done")

    # Check if file is already on S3
    key = bucket.get_key(os.path.join("sigs", sra_id))
    if key is None:
        # result not available yet, compute it
        with NamedTemporaryFile("w+t") as f:
            try:
                shell("fastq-dump -A {sra_id} -Z | syrah | "
                      "sourmash compute -k 21 --dna - -o {output} --name {sra_id}"
                      .format(sra_id=sra_id, output=f.name))
            except CalledProcessError as e:
                # We ignore SIGPIPE, since it is informational (and makes sense,
                # it happens because `head` is closed and `fastq-dump` can't pipe
                # its output anymore. More details:
                # http://www.pixelbeat.org/programming/sigpipe_handling.html
                if e.returncode != 141:
                    # TODO: save error to bucket, on 'errors/{sra_id}'?
                    raise e

            # save to S3
            k = Key(bucket)
            k.key = os.path.join("sigs", sra_id)
            f.seek(0)
            k.set_contents_from_string(f.read())

    raise Ignore()
def map_reads(self,number_of_threads=1): if self._fq2_fn==None: reads_string='"{}"'.format(self._fq1_fn) else: reads_string='"{}" "{}"'.format(self._fq1_fn,self._fq2_fn) if self._sort_by_name: snakemake.shell('"{yara_mapper}" -t {threads} "{genome_pref}" {reads_string} | "{samtools}" sort -n - "{bamprefix}"'.format( yara_mapper=YARA_MAPPER, genome_pref=self.index_prefix, reads_string=reads_string, bamprefix=self._bam_fn[:-4], threads=number_of_threads, samtools=smbl.prog.SAMTOOLS, ) ) else: snakemake.shell('"{yara_mapper}" -o "{bam}" -t {threads} "{genome_pref}" {reads_string}'.format( yara_mapper=YARA_MAPPER, genome_pref=self.index_prefix, reads_string=reads_string, bam=self._bam_fn, threads=number_of_threads, ) )
def test_samtools_sort_and_index(sample1_se_tiny_bam, sample1_se_tiny_bam_bai):
    """
    This test is primarily a trigger for the fixtures.
    """
    # Viewing a region requires an index; the plain BAM fixture has none.
    with pytest.raises(sp.CalledProcessError):
        shell('samtools view {sample1_se_tiny_bam} 2L:1-100')
    # The indexed fixture exposes the sorted BAM under the 'bam' key.
    shell('samtools view {sample1_se_tiny_bam_bai[bam]} 2L:1-100')
def index(in_bam):
    shell('samtools index {in_bam}'.format(in_bam=in_bam))
    return isfile(in_bam + '.bai')
def create_fq(self): if self.number_of_read_tuples == 0: genome_size=os.stat(self._fa_fn).st_size self.number_of_read_tuples=int(self.coverage*genome_size/(self.read_length_1+self.read_length_2)) snakemake.shell(""" cd "{dir}" java -Xmx8g -jar \ {curesim} \ -f "{fa}" \ -n {nb} \ -m {rlen1} \ -r 0 \ -sd 0 \ -y 0 \ {other_params} \ > /dev/null """.format( dir=self.get_dir(), curesim=smbl.prog.CURESIM, fa=self._fa_fn, nb=self.number_of_read_tuples, rlen1=self.read_length_1, other_params=self.other_params, rng_seed=self._rng_seed, ) ) self.recode_curesim_reads( os.path.join( self.get_dir(), "output.fastq", ) )
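# A minimal sketch of the read-count fallback above (numbers are hypothetical;
# the genome size is approximated by the FASTA file size in the original): at
# 10x coverage over ~1 Mbp with 2 x 100 bp reads, the default number of read
# tuples is int(10 * 1_000_000 / (100 + 100)) == 50_000.
coverage, genome_size, rlen1, rlen2 = 10, 1_000_000, 100, 100
assert int(coverage * genome_size / (rlen1 + rlen2)) == 50_000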
def run_asm(infiles, assembler, asm_opts, sample, threads, memory):
    if assembler == 'shovill':
        cmd = f"shovill --R1 {infiles[0]} --R2 {infiles[1]} --outdir {sample}/shovill --cpus {threads} --ram {memory} {asm_opts}"
    elif assembler == 'spades':
        cmd = f"spades.py -1 {infiles[0]} -2 {infiles[1]} -o {sample}/spades --threads {threads} --memory {memory} {asm_opts}"
    else:
        cmd = f"skesa --fastq {','.join(infiles)} --contigs_out {sample}/skesa.fasta --cores {threads} --memory {memory} {asm_opts}"
    shell(cmd)
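# Hypothetical usage sketch (paths are placeholders, not from the source):
# selecting the 'spades' branch builds and runs a command equivalent to
#   spades.py -1 r1.fq.gz -2 r2.fq.gz -o S1/spades --threads 8 --memory 16 --careful
run_asm(["r1.fq.gz", "r2.fq.gz"], "spades", "--careful", "S1", 8, 16)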
def relative_symlink(input, output):
    """
    Helper function to easily symlink two files
    """
    import os
    from snakemake import shell
    odir = os.path.dirname(output)
    oname = os.path.basename(output)
    relative_path = './' + os.path.relpath(str(input), odir)
    shell("cd {odir}; ln -s {rp} {output}".format(odir=odir, rp=relative_path, output=oname))
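# Hypothetical usage sketch (paths are placeholders, not from the source):
# creates results/latest.bam -> ../data/sample.bam, a link expressed relative
# to its own directory so it stays valid if the project root moves.
relative_symlink("data/sample.bam", "results/latest.bam")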
def cluster_reciprocal_identity(infile, outfile, threads=1, memory=800,
                                alignment_coverage=0.99, perc_identity=0.99):
    shell('cd-hit-est -g 1 -aL {cov} -aS {cov} -d 0 -c {perc_identity} -T {threads} '
          '-M {memory} -i {infile} -o {outfile}'.format(
              cov=alignment_coverage, perc_identity=perc_identity,
              threads=threads, memory=memory, infile=infile, outfile=outfile))
def archiveLog(log):
    t = datetime.datetime.now().strftime('%Y-%m-%d.h%H-m%M-s%S')
    new_log = log + '.' + t
    try:
        shutil.copy2(log, new_log)
        snakemake.shell('gzip -9 ' + new_log)
    except (FileNotFoundError, PermissionError):
        pass
def index_genome(genome_path, silence=True):
    if silence:
        shell('bwa index {genome_path} &> /dev/null;'.format(genome_path=genome_path))
    else:
        shell('bwa index {genome_path}'.format(genome_path=genome_path))
    return genome_is_indexed(genome_path)
def _control_script(igvcommands, igv_fp, igv_prefs):
    igvscript = tempfile.NamedTemporaryFile()
    igvscript.writelines(bytes(cmd + '\n', 'ascii') for cmd in igvcommands)
    igvscript.flush()
    igvprefsfile = _write_prefs(igv_prefs)
    igvcommandstring = "xvfb-run -a -s '-screen 1 1920x1080x24' %s -o %s -b %s" % (
        igv_fp, igvprefsfile.name, igvscript.name)
    print(igvcommandstring)
    shell(igvcommandstring)
def install(cls): last_version="last-548" fn=cls.download_file("http://last.cbrc.jp/{}.zip".format(last_version),"last.zip") dir1=os.path.dirname(fn) snakemake.shell('(cd "{dir1}" && unzip last.zip)') dir2=os.path.join(dir1,last_version) cls.run_make(dir2) cls.install_file("{}/src/lastal".format(dir2),LASTAL) cls.install_file("{}/src/lastdb".format(dir2),LASTDB)
def map_reads(self,number_of_threads=1): snakemake.shell('"{storm}" -M 4 -A -g "{genome}" -r "{reads}" -N "{threads}" | "{samtools}" view -bS - > "{bam}"'.format( storm=STORM_NUCLEOTIDE, samtools=smbl.prog.SAMTOOLS, genome=self._fa_fn, reads=self._fq1_fn, bam=self._bam_fn, threads=number_of_threads, ) )
def sort_coordinate(in_bam, out_bam, delete_in_bam=False):
    shell('samtools sort {in_bam} > {out_bam}'.format(in_bam=in_bam, out_bam=out_bam))
    if delete_in_bam:
        shell('rm {in_bam}'.format(in_bam=in_bam))
    return isfile(out_bam)
def index_genome(genome_path, silence=True):
    if silence:
        shell('makeblastdb -dbtype nucl -in {genome_path} &> /dev/null;'.format(
            genome_path=genome_path))
    else:
        shell('makeblastdb -dbtype nucl -in {genome_path}'.format(
            genome_path=genome_path))
    return genome_is_indexed(genome_path)
def create_fq(self): if self.coverage == 0: genome_size=os.stat(self._fa_fn).st_size self.coverage = 1.0 * self.number_of_read_tuples * (self.read_length_1+self.read_length_2) / (0.8 * genome_size) if self._reads_in_tuple==2: paired_params="-p -m {dist} -s {dist_dev}".format( dist=self.distance, dist_dev=self.distance_deviation, ) else: paired_params="" command_1 =""" {art_il} -sam -na \ -i "{fasta}" \ -l {rlen} \ -rs {rng_seed} \ -f {coverage} \ -o "{o_pref}" \ {paired_params} \ {other_params} \ > /dev/null """.format( art_il=smbl.prog.ART_ILLUMINA, paired_params=paired_params, fasta=self._fa_fn, rlen=self.read_length_1, other_params=self.other_params, coverage=self.coverage, o_pref=self.art_prefix, rng_seed=self._rng_seed, ) # correction of header (bug in ART) command_2 =""" cat "{sam_1}" | \ grep -v ^@ | \ "{samtools}" view -h -T "{fa}" - \ > "{sam_2}" """.format( samtools=smbl.prog.SAMTOOLS, sam_1=self._sam1_fn, sam_2=self._sam2_fn, fa=self._fa_fn, ) snakemake.shell(command_1) snakemake.shell(command_2) self.recode_sam_reads( sam=self._sam2_fn, simulator_name="art-illumina", )
def remove_secondary_alignments(in_bam, out_bam, delete_in_bam=False):
    # -F 0x900 drops secondary (0x100) and supplementary (0x800) alignments
    shell('samtools view -b -h -F 0x900 {in_bam} > {out_bam}'.format(
        in_bam=in_bam, out_bam=out_bam))
    if delete_in_bam:
        shell('rm {in_bam}'.format(in_bam=in_bam))
    return isfile(out_bam)
def run(self, output_filename=None, output_filename_classified=None, output_filename_unclassified=None, only_classified_output=False): """Performs the kraken analysis :param str output_filename: if not provided, a temporary file is used and stored in :attr:`kraken_output`. :param str output_filename_classified: not compressed :param str output_filename_unclassified: not compressed """ if output_filename is None: self.kraken_output = TempFile().name else: self.kraken_output = output_filename params = { "database": self.database, "thread": self.threads, "file1": self.fastq[0], "kraken_output": self.kraken_output, "output_filename_unclassified": output_filename_unclassified, "output_filename_classified": output_filename_classified, } if self.paired: params["file2"] = self.fastq[1] command = "kraken %(file1)s " if self.paired: command += " %(file2)s --paired" command += " -db %(database)s " command += " --threads %(thread)s --output %(kraken_output)s --out-fmt legacy" # for kraken <=1.0 --out-fmt did not exist #command += " --out-fmt legacy" if output_filename_unclassified: command += " --unclassified-out %(output_filename_unclassified)s " if only_classified_output is True: command += " --only-classified-output" if output_filename_classified: command += " --classified-out %(output_filename_classified)s " command = command % params # Somehow there is an error using easydev.execute with pigz from snakemake import shell shell(command)
def compute(sra_id):
    import boto3
    import botocore
    from snakemake import shell

    conn = boto3.client("s3")
    s3 = boto3.resource("s3")

    key_path = os.path.join("sigs", sra_id + ".sig")
    try:
        s3.Object("wort-sra", key_path).load()
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "404":
            pass  # Object does not exist, let's compute it later
        else:
            # Something else has gone wrong
            raise
    else:
        # The key already exists
        return

    with NamedTemporaryFile("w+b") as f:
        try:
            shell("fastq-dump --disable-multithreading --fasta 0 --skip-technical --readids --read-filter pass --dumpbase --split-spot --clip -Z {sra_id} | "
                  "sourmash compute -k 21,31,51 "
                  " --scaled 1000 "
                  " --track-abundance "
                  " --name {sra_id} "
                  " -o {output} "
                  " - ".format(sra_id=sra_id, output=f.name))
        except CalledProcessError as e:
            # We ignore SIGPIPE, since it is informational (and makes sense,
            # it happens because `head` is closed and `fastq-dump` can't pipe
            # its output anymore. More details:
            # http://www.pixelbeat.org/programming/sigpipe_handling.html
            if e.returncode != 141:
                raise e

        f.seek(0)
        compressed_fp = BytesIO()
        with gzip.GzipFile(fileobj=compressed_fp, mode="wb") as gz:
            shutil.copyfileobj(f, gz)

        conn.put_object(
            Body=compressed_fp.getvalue(),
            Bucket="wort-sra",
            Key=key_path,
            ContentType="application/json",
            ContentEncoding="gzip",
        )
def get_contigs(sample, assembler):
    if assembler == 'shovill':
        shell(f"mv {sample}/shovill/contigs.fa {sample}/shovill.fasta && rm -rf {sample}/shovill")
        return f"{sample}/shovill.fasta"
    elif assembler == 'spades':
        shell(f"mv {sample}/spades/contigs.fasta {sample}/spades.fasta && rm -rf {sample}/spades")
        return f"{sample}/spades.fasta"
    else:
        return f"{sample}/skesa.fasta"
def _inferseq_database(pairsfile, inferseq_database, min_perc_identity,
                       max_internal_softclip_prop, max_edge_distance,
                       output_file, keep_intermediate):
    index_database(inferseq_database)
    database_dict = {rec.id: rec.seq for rec in SeqIO.parse(inferseq_database, 'fasta')}

    tmp_dir = dirname(output_file)

    pairs = pd.read_csv(pairsfile, sep='\t', keep_default_na=False,
                        na_values=['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
                                   '#N/A', 'N/A', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan'])
    handle_empty_pairsfile(pairs, output_file)

    logger.info("Aligning pairs to database...")
    assembly_flanks_fasta_prefix = write_flanks_to_align_to_database(pairs, tmp_dir)
    # randint requires integer bounds; the original passed the float 1e20
    assembly_outbam = join(tmp_dir, 'mustache.inferseq_database.' + str(randint(0, 10**20)) + '.bam')
    bowtie2tools.align_fasta_to_genome(
        assembly_flanks_fasta_prefix + '.fasta',
        inferseq_database, assembly_outbam, silence=True,
        additional_flags='--all --score-min G,1,5'
    )

    logger.info("Inferring sequences from pairs aligned to database...")
    sequences_inferred_database = infer_sequences_database(
        assembly_outbam, database_dict, min_perc_identity,
        max_internal_softclip_prop, max_edge_distance)

    if not keep_intermediate:
        shell('rm {fasta_prefix}* {outbam}*'.format(
            fasta_prefix=assembly_flanks_fasta_prefix, outbam=assembly_outbam))

    method1 = make_dataframe(sequences_inferred_database, method='inferred_database')

    all_inferred_results = method1.sort_values(by=['pair_id', 'method'])
    all_inferred_results.loc[:, 'pair_id'] = list(map(str, map(int, list(all_inferred_results['pair_id']))))
    all_inferred_results = all_inferred_results.query("inferred_seq_length > 0")

    logger.info("Writing results to file %s..." % output_file)
    if not output_file:
        output_file = 'mustache.inferseq_database.tsv'

    if pairs.shape[0] > 0:
        sample_id = list(pairs['sample'])[0]
        all_inferred_results.insert(0, 'sample', sample_id)
    else:
        all_inferred_results.insert(0, 'sample', None)

    all_inferred_results.to_csv(output_file, sep='\t', index=False)
def bootstrap_read_counts(peaks, bam_path, total_reads):
    # Draw total_reads read indices with replacement; sorting them lets us
    # stream through the BAM once, emitting each read as many times as it
    # was drawn.
    indices = sorted(np.random.choice(range(total_reads), total_reads, replace=True))
    i = 0
    bam = pysam.AlignmentFile(bam_path, 'rb')
    outfile = pysam.AlignmentFile("boot.tmp.sam", "w", template=bam)
    for read in bam:
        try:
            while i == indices[0]:
                outfile.write(read)
                indices = indices[1:]
        except IndexError:
            # All sampled indices have been consumed
            break
        i += 1
    outfile.close()

    shell('samtools view -b boot.tmp.sam -o boot.tmp.bam')
    shell('samtools sort -o boot.tmp.sorted.bam boot.tmp.bam')
    shell('samtools index boot.tmp.sorted.bam')

    boot_read_counts = empirical_read_counts(
        peaks, pysam.AlignmentFile('boot.tmp.sorted.bam', 'r'))
    shell('rm boot.tmp*')
    return boot_read_counts
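# A minimal sketch of the resampling trick above (toy numbers, not from the
# source): drawing indices with replacement and sorting them means read i is
# written once per occurrence of i, yielding a bootstrap sample in a single
# pass over the BAM.
import numpy as np
idx = sorted(np.random.choice(range(5), 5, replace=True))
# e.g. idx == [0, 3, 3, 3, 4]: read 0 is emitted once, read 3 three times,
# read 4 once, and reads 1 and 2 are skipped.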
def map_reads(self,number_of_threads=1): if self._fq2_fn==None: reads_string='"{}"'.format(self._fq1_fn) else: reads_string='"{}" "{}"'.format(self._fq1_fn,self._fq2_fn) snakemake.shell('("{razer}" -o "{bam}" -tc {threads} "{genome}" {reads_string}) > /dev/null'.format( razer=RAZERS3, genome=self._fa_fn, reads_string=reads_string, bam=self._bam_fn, threads=number_of_threads, ) )
def map_reads(self,number_of_threads=1): if self._fq2_fn==None: reads_string='"{}"'.format(self._fq1_fn) else: reads_string='-1 "{}" -2 "{}"'.format(self._fq1_fn,self._fq2_fn) snakemake.shell('"{bt2}" -p {threads} -x "{idx}" {reads_string} | "{samtools}" view -bS - > "{bam}"'.format( bt2=BOWTIE2, samtools=smbl.prog.SAMTOOLS, idx=self._fa_fn, reads_string=reads_string, bam=self._bam_fn, threads=number_of_threads, ) )
def map_reads(self,number_of_threads=1): if self._fq2_fn==None: reads_string='"{}"'.format(self._fq1_fn) else: reads_string='"{}" "{}"'.format(self._fq1_fn,self._fq2_fn) snakemake.shell('"{bwa}" bwasw -t {threads} "{idx}" {reads_string} | "{samtools}" view -bS - > "{bam}"'.format( bwa=BWA, samtools=smbl.prog.SAMTOOLS, idx=self._fa_fn, reads_string=reads_string, bam=self._bam_fn, threads=number_of_threads, ) )
def map_reads(self,number_of_threads=1): if self._fq2_fn==None: reads_string='"{}"'.format(self._fq1_fn) else: reads_string='"{}" "{}"'.format(self._fq1_fn,self._fq2_fn) snakemake.shell('"{gsnap}" -A sam -d {idx} -t {threads} {reads_string} | "{samtools}" view -bS - > "{bam}"'.format( gsnap=GSNAP, samtools=smbl.prog.SAMTOOLS, idx=self._fa_fn, reads_string=reads_string, bam=self._bam_fn, threads=number_of_threads, ) )
def _workflow(workdir, snakefile, configfile, cores, memory, unlock,
              rerun_incomplete, keep_going):
    cmd = 'snakemake -s {snakefile} --config wd={workdir} memory={memory} ' \
          '--cores {cores} --configfile {configfile} '
    if rerun_incomplete:
        cmd += '--rerun-incomplete '
    if keep_going:
        cmd += '--keep-going '
    if unlock:
        cmd += '--unlock '
    cmd = cmd.format(snakefile=snakefile, configfile=configfile,
                     workdir=workdir, memory=memory, cores=cores)
    print('COMMAND:', cmd)
    shell(cmd)
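# Hypothetical usage sketch (paths are placeholders, not from the source):
# the call below assembles and runs roughly
#   snakemake -s Snakefile --config wd=run1 memory=16 --cores 8 \
#       --configfile config.yaml --keep-going
_workflow(workdir="run1", snakefile="Snakefile", configfile="config.yaml",
          cores=8, memory=16, unlock=False, rerun_incomplete=False,
          keep_going=True)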
def _wholegenome(reference, query, read_length, read_depth, min_alignment_quality,
                 max_direct_repeat_length, large_insertion_cutoff, query_id, output_prefix):
    min_alignment_inner_length = max_direct_repeat_length + 1

    if not bwatools.genome_is_indexed(reference):
        click.echo("Indexing reference...")
        bwatools.index_genome(reference)
    else:
        click.echo("Reference already indexed...")

    if not bowtie2tools.genome_is_indexed(query):
        click.echo("Indexing query...")
        bowtie2tools.index_genome(query)
    else:
        click.echo("Query already indexed...")

    click.echo("Making query reads...")
    query_reads_path = output_prefix + '.query.tmp.fq'
    make_reads(query, query_reads_path, read_length, read_depth)

    click.echo("Aligning query reads to reference...")
    out_sam = output_prefix + '.query.reference.tmp.sam'
    bwatools.align_to_genome_se(query_reads_path, reference, out_sam, threads=1, verbose=False)

    click.echo("Sorting and indexing alignment file...")
    out_bam = output_prefix + '.query.reference.tmp.bam'
    samtools.sort_coordinate(out_sam, out_bam, delete_in_bam=True)
    samtools.index(out_bam)

    find_file = output_prefix + '.find.tsv'
    _find(out_bam, min_softclip_length=8, min_softclip_count=1,
          min_alignment_quality=min_alignment_quality,
          min_alignment_inner_length=min_alignment_inner_length,
          min_distance_to_mate=max_direct_repeat_length + 2,
          min_softclip_ratio=0.01, max_indel_ratio=0.0,
          large_insertion_cutoff=large_insertion_cutoff,
          min_count_consensus=1, sample_id=query_id, output_file=find_file)

    pair_file = output_prefix + '.pair.tsv'
    _pair(find_file, out_bam, reference,
          max_direct_repeat_length=max_direct_repeat_length,
          min_alignment_quality=min_alignment_quality,
          min_alignment_inner_length=min_alignment_inner_length,
          max_junction_spanning_prop=0.01,
          large_insertion_cutoff=large_insertion_cutoff,
          output_file=pair_file)

    inferseq_file = output_prefix + '.inferseq.tsv'
    _inferseq_assembly(pair_file, out_bam, query, reference,
                       min_perc_identity=0.95, max_internal_softclip_prop=0.01,
                       max_inferseq_size=500000, min_inferseq_size=30,
                       keep_intermediate=False, output_file=inferseq_file)

    shell('rm %s %s %s' % (query_reads_path, out_bam, out_bam + '.bai'))
def run(self, output_filename=None, output_filename_classified=None, output_filename_unclassified=None, only_classified_output=False): """Performs the kraken analysis :param str output_filename: if not provided, a temporary file is used and stored in :attr:`kraken_output`. :param str output_filename_classified: not compressed :param str output_filename_unclassified: not compressed """ if output_filename is None: self.kraken_output = TempFile().name else: self.kraken_output = output_filename params = { "database": self.database, "thread": self.threads, "file1": self.fastq[0], "kraken_output": self.kraken_output, "output_filename_unclassified": output_filename_unclassified, "output_filename_classified": output_filename_classified, } if self.paired: params["file2"] = self.fastq[1] command = "kraken -db %(database)s %(file1)s " if self.paired: command += " %(file2)s --paired" command += " --threads %(thread)s --output %(kraken_output)s " command += " --out-fmt legacy" if output_filename_unclassified: command += " --unclassified-out %(output_filename_unclassified)s " if only_classified_output is True: command += " --only-classified-output" if output_filename_classified: command += " --classified-out %(output_filename_classified)s " command = command % params # Somehow there is an error using easydev.execute with pigz from snakemake import shell shell(command)
def compute_syrah(sra_id):
    from snakemake import shell

    with NamedTemporaryFile('w+t') as f:
        try:
            shell('fastq-dump -A {sra_id} -Z | syrah | '
                  'sourmash compute -k 21 --dna - -o {output} --name {sra_id}'
                  .format(sra_id=sra_id, output=f.name))
        except CalledProcessError as e:
            # We ignore SIGPIPE, since it is informational (and makes sense,
            # it happens because `head` is closed and `fastq-dump` can't pipe
            # its output anymore. More details:
            # http://www.pixelbeat.org/programming/sigpipe_handling.html
            if e.returncode != 141:
                raise e
        f.seek(0)
        return f.read()
def _to_fastX(self, mode, output_filename, threads=2):
    """
    :param mode: fastq or fasta
    """
    # For now, we use samtools. bamtools could be used as well, but it takes
    # just as long and its output is ~10% larger (sequences are wrapped at
    # 80 characters per line).
    from snakemake import shell
    cmd = "samtools %s -@ %s %s > %s" % (mode, threads, self.filename, output_filename)
    logger.info("Please be patient")
    logger.info("This may be long depending on your input data file: ")
    logger.info("typically, a minute per 500,000 reads")
    shell(cmd)
    logger.info("done")
def _control_socket(igvcommands, igv_fp, igv_prefs):
    igvprefsfile = _write_prefs(igv_prefs)
    # Start up IGV. Use a port between 10000 and the max available, based
    # on the PID of this process. (TODO: is using this PID safe?)
    port = 10000 + os.getpid() % (2**16 - 10000)
    xauth = "/tmp/xauth-%d" % os.getpid()
    xvfb_cmdline = ["xvfb-run", "-a", "-l", "-f", xauth,
                    "-s", "-screen 1 1920x1080x24"]
    igv_cmdline = [str(igv_fp), "-p", str(port), "-o", igvprefsfile.name]
    igvproc = subprocess.Popen(xvfb_cmdline + igv_cmdline)

    # Connect to the running IGV
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    while True:
        try:
            s.connect(('localhost', port))
            break
        except ConnectionRefusedError:
            time.sleep(0.5)

    # Figure out which X11 display the IGV process is using. It should be
    # the second child of the initial xvfb-run process (the first child
    # being Xvfb, I think).
    with open("/proc/%s/task/%s/children" % (igvproc.pid, igvproc.pid)) as f:
        child_pid = f.read().split()[1]
    with open("/proc/%s/environ" % child_pid) as f:
        env_vars = [env_var.split('=', 1) for env_var in f.read().split('\x00')]
    env_vars = {env_var[0]: env_var[1] for env_var in env_vars if len(env_var) == 2}
    display = env_vars['DISPLAY']

    # Based on http://unix.stackexchange.com/questions/5999/ :
    # This should make the window as large as the virtual X display, but in
    # practice my screenshots aren't going over 1280 x 1296.
    shell("DISPLAY=" + display + " XAUTHORITY=" + xauth +
          " xdotool search --onlyvisible --name IGV windowsize --sync 100% 100%")

    # Generate the screenshot
    s.sendall(bytes('\n'.join(igvcommands), 'ascii'))
    s.close()
    igvproc.wait()
def main(peak_files):
    merged_peaks = shell('cat %s | bedtools sort -i stdin | bedtools merge -i stdin'
                         % ' '.join(peak_files), iterable=True)
    for peak in merged_peaks:
        print(peak.split())
def align_fasta_to_genome(fasta, genome_path, outfile, threads=1, silence=True, additional_flags=''):
    shell('blastn -query {fasta} -db {genome_path} -outfmt 5 -max_target_seqs 100000 -out {outfile} -parse_deflines '
          '{additional_flags}'.format(fasta=fasta, genome_path=genome_path,
                                      outfile=outfile, additional_flags=additional_flags))
    return isfile(outfile)
def create_fq(self):
    if self.coverage == 0:
        genome_size = os.stat(self._fa_fn).st_size
        self.coverage = 1.0 * self.number_of_read_tuples * (self.read_length_1 + self.read_length_2) / (0.8 * genome_size)

    if self._reads_in_tuple == 2:
        paired_params = '--fragment-mean-size {dist} --fragment-size-std-dev {dist_dev} -or "{fq2}"'.format(
            dist=self.distance,
            dist_dev=self.distance_deviation,
            fq2=self.mason_prefix + "2.fq",
        )
    else:
        paired_params = ""

    command = """
            {mason} \
            -n {number_of_read_tuples} \
            -ir "{fasta}" \
            --illumina-read-length {rlen} \
            --seed {rng_seed} \
            -o "{fq1}" \
            -oa "{sam}" \
            {paired_params} \
            {other_params} \
            > /dev/null
        """.format(
            mason=smbl.prog.MASON_SIMULATOR,
            paired_params=paired_params,
            fasta=self._fa_fn,
            rlen=self.read_length_1,
            other_params=self.other_params,
            number_of_read_tuples=self.number_of_read_tuples,
            fq1=self.mason_prefix + "1.fq",
            rng_seed=self._rng_seed,
            sam=self._sam_fn,
        )

    snakemake.shell(command)
    self.recode_sam_reads(
        sam=self._sam_fn,
        simulator_name="mason",
    )
def run(self, output_filename_classified=None, output_filename_unclassified=None,
        only_classified_output=False):
    """Run the analysis using Kraken and create the Krona output

    .. todo:: reuse the KrakenResults code to simplify this method.
    """
    # Run Kraken (KrakenAnalysis)
    kraken_results = self.output_directory + os.sep + "kraken.out"
    self.ka.run(
        output_filename=kraken_results,
        output_filename_unclassified=output_filename_unclassified,
        output_filename_classified=output_filename_classified,
        only_classified_output=only_classified_output
    )

    # Translate kraken output to a format understood by Krona and save
    # the png image
    self.kr = KrakenResults(kraken_results)
    df = self.kr.plot(kind="pie")
    pylab.savefig(self.output_directory + os.sep + "kraken.png")

    prefix = self.output_directory + os.sep
    self.kr.kraken_to_json(prefix + "kraken.json", self.dbname)
    self.kr.kraken_to_csv(prefix + "kraken.csv", self.dbname)

    # Transform to Krona HTML
    from snakemake import shell
    kraken_html = self.output_directory + os.sep + "kraken.html"
    status = self.kr.kraken_to_krona(output_filename=prefix + "kraken.out.summary")
    if status is True:
        shell("ktImportText %s -o %s" % (prefix + "kraken.out.summary", kraken_html))
    else:
        shell("touch {}".format(kraken_html))
def install(cls):
    # renamed from the original (copy-pasted) gitdir_bcftools: this clones bfast
    gitdir_bfast = cls.git_clone("git://github.com/nh13/bfast", "")
    snakemake.shell('(cd "{}" && sh autogen.sh) > /dev/null'.format(cls.src_dir))
    cls.run_configure("")
    cls.run_make("")
    cls.install_file("bfast/bfast", BFAST)
def shell(cls, command):
    if cls.verbosity:
        snakemake.shell(command)
    else:
        snakemake.shell("({}) > /dev/null".format(command))
def shell(self, command):
    if self.verbosity:
        snakemake.shell(command)
    else:
        snakemake.shell("({}) > /dev/null".format(command))
def make_index(self): snakemake.shell('("{yara_indexer}" -o "{prefix}" "{fa}") > /dev/null'.format( yara_indexer=YARA_INDEXER, prefix=self.index_prefix, fa=self._fa_fn, ))
def clean(self): """Remove all temporary files.""" snakemake.shell('rm -fR "{}" "{}"'.format(self.report_dir,self._html_fn))
def install(cls): gitdir=cls.git_clone("git://github.com/lh3/wgsim","wgsim") snakemake.shell('cd "{dir}" && gcc -g -O2 -Wall -o wgsim wgsim.c -lz -lm'.format(dir=gitdir)) cls.install_file("wgsim/wgsim",WGSIM) cls.install_file("wgsim/wgsim_eval.pl",WGSIM_EVAL)
def clean(self): """Clean working directory. """ snakemake.shell('rm -fR "{}"'.format(self.get_dir()))
def make_index(self): snakemake.shell('"{bwa}" index {fa}'.format( bwa=BWA, fa=self._fa_fn, ))
def create_fq(self):
    if self.number_of_read_tuples == 0:
        genome_size = os.stat(self._fa_fn).st_size
        self.number_of_read_tuples = int(self.coverage * genome_size / (self.read_length_1 + self.read_length_2))

    if self._reads_in_tuple == 2:
        paired_params = "-d {dist} -s {dist_dev}".format(
            dist=self.distance,
            dist_dev=self.distance_deviation,
        )
    else:
        paired_params = ""

    # wgsim expects a second read length even in single-end mode,
    # so a dummy value is passed when read_length_2 is unset
    if self.read_length_2 == 0:
        fake_read_length_2 = 42
    else:
        fake_read_length_2 = self.read_length_2

    snakemake.shell("""
            {wgsim} \
            -1 {rlen1} \
            -2 {rlen2} \
            -S {rng_seed} \
            -N {nb} \
            -e {error_rate} \
            -r {mutation_rate} \
            -R {indels} \
            -X {prob_indel_ext} \
            {haploid} \
            {paired_params} \
            {other_params} \
            "{fa}" \
            "{fq1}" \
            "{fq2}" \
            > /dev/null
        """.format(
            wgsim=smbl.prog.WGSIM,
            fa=self._fa_fn,
            fq1=self._tmp_fq1_fn,
            fq2=self._tmp_fq2_fn,
            nb=self.number_of_read_tuples,
            rlen1=self.read_length_1,
            rlen2=fake_read_length_2,
            other_params=self.other_params,
            paired_params=paired_params,
            rng_seed=self._rng_seed,
            haploid="-h" if self.haploid_mode else "",
            error_rate=self.error_rate,
            mutation_rate=self.mutation_rate,
            indels=self.indels,
            prob_indel_ext=self.prob_indel_ext,
        ))

    if self._reads_in_tuple == 1:
        self.recode_wgsim_reads(
            old_fq1=self._tmp_fq1_fn,
        )
    else:
        self.recode_wgsim_reads(
            old_fq1=self._tmp_fq1_fn,
            old_fq2=self._tmp_fq2_fn,
        )
def install(cls): ver="1.130" fn=cls.download_file("https://github.com/broadinstitute/picard/releases/download/{ver}/picard-tools-{ver}.zip".format(ver=ver),"picard.zip") dir=os.path.dirname(fn) snakemake.shell('(cd "{dir}" && unzip -j picard.zip) > /dev/null'.format(dir=dir)) cls.install_file("picard.jar",PICARD)
def make_index(self): snakemake.shell('"{bt2b}" "{fa}" "{fa}"'.format( bt2b=BOWTIE2_BUILD, fa=self._fa_fn, ))
def all_programs():
    return [
        plugin.get_installation_files()
        for plugin in smbl.prog.plugins.get_registered_plugins()
    ]

def all_compatible_programs():
    return [
        plugin.get_installation_files()
        for plugin in smbl.prog.plugins.get_registered_plugins()
        if plugin.is_platform_supported()
    ]

snakemake.shell(
    """
        mkdir -p "{}" "{}" "{}"
    """.format(bin_dir, fa_dir, src_dir)
)

def is_linux():
    return sys.platform.startswith('linux')

def is_cygwin():
    return sys.platform.startswith('cygwin')

def is_windows():
    return sys.platform.startswith('win')

def is_osx():
    return sys.platform.startswith('darwin')
import snakemake
import re

def shell(
    cmd,
    remove_spaces=True,
    async_=False,
    iterable=False,
    read=False,
):
    # Collapse runs of spaces/tabs within each line so that indented,
    # padded commands read cleanly when executed.
    if remove_spaces:
        # print("removing spaces from command")
        cmd = re.sub(r'[ \t\f\v]+', ' ', cmd).strip()

    # NOTE: the parameter was originally named `async`, which became a
    # reserved keyword in Python 3.7; it is renamed to `async_` here and
    # forwarded under its old name for the older snakemake versions that
    # accept it.
    return snakemake.shell(
        cmd=cmd,
        iterable=iterable,
        read=read,
        **{"async": async_},
    )
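# Hypothetical usage sketch: runs of spaces and tabs are collapsed within each
# line before execution (newlines are preserved), so a padded command string
# is reduced to its canonical single-spaced form. The snippet below shows the
# transformation the wrapper applies before calling snakemake.shell().
import re
cmd = re.sub(r'[ \t\f\v]+', ' ', "samtools   sort \t -o sorted.bam   input.bam").strip()
assert cmd == "samtools sort -o sorted.bam input.bam"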