def combine_alignments(file_list, output):
    """Merge mapped SAM/BAM files into a single file with Picard.

    file_list -- paths of the mapped alignment files to merge
    output    -- path of the merged output file

    All inputs must already be mapped (checked via __is_mapped).
    """
    assert all(__is_mapped(x) for x in file_list)
    # BUG FIX: picard.jar is invoked as "java -jar picard.jar <Tool> ...";
    # the GATK-style "-T <Tool>" flag is not recognized by Picard 2.x.
    command = ("java -Xmx{xmx} -Xms{xms} -Djava.io.tmpdir={tmp} -jar {picard} "
               "MergeSAMFiles {input_files} O={output_files}").format(
        xms="10G",
        xmx="12G",
        tmp="~/tmp",
        picard="~/.prog/picard-tools-2.5.0/picard.jar",
        # Each input gets its own I=<file> argument (trailing space joins them)
        input_files="".join("I={} ".format(x) for x in file_list),
        output_files=output)
    # Use picard merge SAM to merge the files
    bash(command)
def __merge_bam(self, files, combined_name):
    """Merge *files* into *combined_name* using ``samtools merge``.

    Does nothing on dry runs; reuses an existing output file rather than
    re-merging. Re-raises any IOError/OSError after reporting it.
    """
    if self.dry_run:
        return
    try:
        if not isfile(combined_name):
            bash("samtools merge {} {}".format(combined_name,
                                               " ".join(files)))
        else:
            if self.verbose:
                print("{} already exists, using it...".format(
                    combined_name), file=stderr)
    except (IOError, OSError) as err:
        print("Error while combining files: {}".format(err), file=stderr)
        # BUG FIX: "raise err" resets the traceback to this line; a bare
        # raise re-raises with the original traceback intact.
        raise
def __merge_bam_files(self):
    """Merge all *.bam files in self.files into one BAM under input_root.

    Returns the path of the merged BAM; if it already exists it is reused
    without re-merging. Re-raises any error from the merge after logging.
    """
    bam_files = [f for f in self.files if f.endswith(".bam")]
    output_bam = join(self.input_root, self.all_reads_name)

    def __samtools_threads():
        # Cap thread count: samtools merge gains little past ~8 threads
        return str(min(self.get_threads(), 8))

    if isfile(output_bam):
        print("Output BAM already exists, using it...", file=stderr)
        return output_bam
    try:
        # BUG FIX: .format() was previously called on the tuple returned by
        # bash(), not on the command string (AttributeError at runtime), and
        # the "merge" subcommand was missing. The -m (memory) flag is a
        # "samtools sort" option, not a merge option, so it is dropped.
        (out, err) = bash("samtools merge -@ {t} {o_bam} {i_bams}".format(
            t=__samtools_threads(),
            o_bam=output_bam,
            i_bams=" ".join(bam_files)))
        if err:
            print(err, file=stderr)
    except Exception as error:
        print("Error while merging bam files: {}".format(error),
              file=stderr)
        raise
    return output_bam
def sbatch(command, *args):
    """Wrap *command* in a minimal bash script and pipe it into sbatch.

    Extra positional arguments are appended verbatim as sbatch options.
    Returns whatever bash() returns for the submission.
    """
    pieces = ["echo '#!/usr/bin/env bash\n{}' | sbatch".format(command)]
    pieces.extend("{}".format(a) for a in args)
    return bash(" ".join(pieces))
def qsub(command, *args):
    """Wrap *command* in a minimal bash script and pipe it into qsub.

    Extra positional arguments are passed through as qsub options.
    Returns whatever bash() returns for the submission.
    """
    submission = " ".join(
        ["echo '#!/usr/bin/env bash\n{}' | qsub".format(command)] +
        ["{}".format(a) for a in args])
    return bash(submission)
def get_barcode(filename):
    """Return the most frequent barcode in the first reads of a fastq file.

    Transparently decompresses .gz inputs. The barcode is taken as the
    10th ':'-separated field of the @ header lines within the first 10000
    lines, tallied with sort|uniq -c, most common first.
    """
    if filename.endswith(".gz"):
        reader = "gunzip -c {}".format(filename)
    else:
        reader = "cat {}".format(filename)
    pipeline = ("{} | head -n 10000 | grep ^@ | cut -d':' -f10 | tr -d ' ' "
                "| sort | uniq -c | sort -nr | head -1 | sed -e "
                "'s/^[[:space:]]*//' | cut -d ' ' -f2").format(reader)
    return bash(pipeline)[0]
def main(reference="reference.fa", job_prefix="Map_Tests"):
    """Launch a grid of map.py benchmark jobs over speed/modulo/stats/reads.

    reference  -- path to the reference fasta handed to map.py
    job_prefix -- prefix used for job names and output directories
    """
    speeds = ["vfast", "fast", "normal", "slow", "vslow"]
    modulos = ["--usemodulo"]
    stats = ["--stats", ""]
    reads = ["100"]
    command = ("map.py --verbose --partition=bigmemm --memory={mem} --cpus=14 "
               "[email protected] --extension=.fastq.gz$ "
               "--email_options=FAIL,END --input_root=ErrCorrect_Repair.1 "
               "--output_root={outroot} --job_name={jobname} {modulo} --pigz "
               "--speed={speed} {stats} --num_reads={reads} "
               "--reference={reference} --read_groups")
    for speed in speeds:
        for j, modulo in enumerate(modulos):
            for k, stat in enumerate(stats):
                for read in reads:
                    mem = "300G"
                    # Index 0 of each option list is the "enabled" variant;
                    # these tags name the job/output for that combination.
                    opts = {'s': speed,
                            'm': "modulo" if j == 0 else "no_modulo",
                            'st': "stats" if k == 0 else "no_stats",
                            'r': read}
                    jobname = job_prefix + ".{s}.{m}.{st}.{r}.".format(**opts)
                    outroot = "Map_Tests/Map_Test.{s}.{m}.{st}.{r}".format(
                        **opts)
                    # BUG FIX: previously formatted with the undefined name
                    # "ref"; the parameter is called "reference".
                    args = split(command.format(speed=speed,
                                                modulo=modulo,
                                                stats=stat,
                                                reads=read,
                                                mem=mem,
                                                jobname=jobname,
                                                outroot=outroot,
                                                reference=reference))
                    (out, err) = bash(*args)
                    print(out)
                    print(err, file=stderr)
def main(fastq, mapped):
    """Tabulate fastq vs. mapped file sizes and plot them.

    fastq  -- directory containing *q.gz read files (R1/R2 pairs)
    mapped -- directory containing *001.sam mapped files

    Writes a "filesizes" table (name, R1 size, R2 size, mapped size) and
    invokes plot_map_sizes.R on it.
    """
    # BUG FIX: the -iname patterns are now single-quoted so the shell hands
    # them to find instead of glob-expanding them against the current
    # directory (which silently broke the search when any file matched).
    (fastq_sizes, err) = bash("find -L {f}/ -iname '*q.gz' | "
                              "parallel --gnu -j4 \"du -sb --apparent-size "
                              "{{}}\"".format(f=fastq.rstrip('/')))
    (map_sizes, err2) = bash("find -L {f}/ -iname '*001.sam' | parallel "
                             "--gnu -j4 \"du -sb --apparent-size {{}}\"".format(
                                 f=mapped.rstrip('/')))
    read1 = {}
    read2 = {}
    # BUG FIX: renamed from "mapped", which shadowed the parameter above.
    mapped_sizes = {}
    for line in fastq_sizes.splitlines():
        chunks = line.split()  # du output: <size>\t<path>
        name = basename(chunks[1]).split("_R")[0]
        size = chunks[0]
        if "_R1" in chunks[1]:
            read1[name] = size
        if "_R2" in chunks[1]:
            read2[name] = size
    for line in map_sizes.splitlines():
        chunks = line.split()
        name = basename(chunks[1]).split("_pe")[0]
        mapped_sizes[name] = chunks[0]
    table = {}
    for key, val in mapped_sizes.items():
        # NOTE(review): raises KeyError if a mapped file has no matching
        # R1/R2 fastq — presumably pairs always exist; verify upstream.
        table[key] = [read1[key], read2[key], val]
    with open("filesizes", "w") as fh:
        for key, val in table.items():
            fh.write("{}\t{}\t{}\t{}\n".format(key, *val))
    (out, err) = bash("plot_map_sizes.R")
    print(out)
def rgpu(self, filename):
    """Build a read-group platform-unit string "<barcode>.<lane>" for a fastq.

    The barcode is the most common 10th ':'-field of the @ header lines;
    the lane comes from the filename (_L<number>) or, failing that, from
    the 4th ':'-field of the headers. Falls back to barcode "" and lane
    "1" when detection fails.
    """
    # BUG FIX: barcode was previously left undefined (NameError at return)
    # if the barcode pipeline failed; default to empty string instead.
    barcode = ""
    lane = "1"
    d_filename = "cat {}".format(filename)
    if filename.endswith(".gz"):
        d_filename = "gunzip -c {}".format(filename)
    command1 = ("{} | head -n 10000 | grep ^@ | cut -d':' -f10 | tr -d ' ' "
                "| sort | uniq -c | sort -nr | head -1 | sed -e "
                "'s/^[[:space:]]*//' | cut -d ' ' -f2").format(d_filename)
    try:
        barcode = bash(command1)[0].strip()
        if self.verbose:
            print("Barcode: {bar}".format(bar=barcode), file=stderr)
    except Exception:  # narrowed from bare except (which traps SystemExit too)
        print("Could not determine barcode", file=stderr)
    try:
        # BUG FIX: the class [1|2] also matched a literal '|'; [12] is the
        # intended "R1 or R2" alternation.
        lane = int(search("(?<=_L)[0-9]{1,3}(?=.*_R[12])",
                          filename).group(0))
    except AttributeError:
        # Filename carries no lane number; fall back to the fastq headers
        # (4th ':'-field). strip lane number
        command2 = ("{} | head -n 10000 | grep ^@ | cut -d':' -f4 | tr -d "
                    "' ' | sort | uniq -c | sort -nr | head -1 | sed -e "
                    "'s/^[[:space:]]*//' | cut -d ' ' -f2").format(
                        d_filename)
        try:
            lane = bash(command2)[0].strip()
            if self.verbose:
                print("Lane: {lane}".format(lane=lane))
        except Exception:
            print("Could not determine lane number", file=stderr)
    return "{}.{}".format(barcode, lane)
def scontrol(*args):
    """Run slurm's scontrol with the given arguments via bash()."""
    return bash("scontrol", *args)
def qstat(*args):
    """Run torque's qstat with the given arguments via bash()."""
    return bash("qstat", *args)
def squeue(*args):
    """Run slurm's squeue with the given arguments via bash()."""
    return bash("squeue", *args)
def scancel(*args):
    """Run slurm's scancel with the given arguments via bash()."""
    return bash("scancel", *args)
def qjob(job):
    """Query a single torque job's status (qstat -j <job>) via bash()."""
    return bash("qstat -j", job)
def submit_job(command_str, verbose=False, dry_run=False, **kwargs):
    """Wrap *command_str* in a script and submit it to the active scheduler.

    Returns the job ID parsed from the scheduler's output, or "" on a dry
    run or when no ID could be captured.

    Anticipated positional args:
        command_str - The command to be wrapped for submission to scheduler
    Anticipated keyword args:
        memory - The memory to be allocated to this job
        nodes - The nodes to be allocated
        cpus - The cpus **per node** to request
        partition - The queue name or partition name for the submitted job
        job_name - The name of the job
        depends_on - The dependencies (as comma separated list of job numbers)
        email_address -  The email address to use for notifications
        email_options - Email options: START|BEGIN,END|FINISH,FAIL|ABORT
        time - time to request from the scheduler
        bash - The bash shebang line to use in the script
        input - The input filename for the job
        output - The output filename for the job
        error - The error filename for the job
    """
    shebang_line = kwargs.get("bash", "#!/usr/bin/env bash")
    script = "{shebang_line}\n{command}".format(shebang_line=shebang_line,
                                                command=command_str)
    sub_command = "echo '{}' | {}"
    sub_script = ""  # Will hold entire string that will be sent to bash shell
    if get_backend() == "slurm":
        # Format with slurm options
        sub_script = sub_command.format(script, __submit_slurm(**kwargs))
    elif get_backend() == "torque":
        # Format with torque options
        sub_script = sub_command.format(script, __submit_torque(**kwargs))
    if verbose:
        print(sub_script, file=sys.stderr)
    if dry_run:
        return ""
    # BUG FIX: the captured streams were previously named stdout/stderr,
    # and the error branch did print(stderr, file=stderr) — passing the
    # captured *string* as the file argument (AttributeError) because the
    # local shadowed sys.stderr. Renamed and redirected to sys.stderr.
    (sub_out, sub_err) = bash(sub_script)  # Actually call the script via bash
    try:  # To parse the output based on expected successful submission result
        for chunk in sub_out.split(" "):
            if any(x.isdigit() for x in chunk.strip()):
                return chunk.strip()  # First try to grab IDs from sentences
        if get_backend() == "slurm":  # If still here, try common formats
            # Successfully submitted job <Job ID>
            return sub_out.split(" ")[-1].strip("\n")
        if get_backend() == "torque":
            # <Job ID>.hostname.etc.etc
            return sub_out.split(".")[0]
        if sub_err:
            print(sub_err, file=sys.stderr)
    except (ValueError, IndexError) as err:
        print("Could not capture Job ID! Dependency checks may fail!")
        print("Err: {}".format(err))
        return ""
def qdel(*args):
    """Run torque's qdel with the given arguments via bash()."""
    return bash("qdel", *args)
def qalter(*args):
    """Run torque's qalter with the given arguments via bash()."""
    return bash("qalter", *args)
def qresub(*args):
    """Run torque's qresub with the given arguments via bash()."""
    return bash("qresub", *args)