def picard_CGbias(global_config, sample_config, sorted_alignments_by_insert):
    """Run Picard CollectGcBiasMetrics on every alignment.

    For each ``(library, BAMfile, working_dir)`` entry the function changes
    into ``working_dir``, prints the Picard command and, unless
    ``<bam-prefix>.collectGcBias.pdf`` already exists or the run is a dry
    run, executes it with stdout/stderr captured in
    ``collectGcBias.stdOut`` / ``collectGcBias.stdErr``.

    The Picard directory comes from the ``PICARD_HOME`` environment variable
    and falls back to ``global_config["Tools"]["picard"]["bin"]``.

    Args:
        global_config: dict with a ``"Tools"`` mapping.
        sample_config: dict providing ``"reference"``; also handed to
            ``common.check_dryrun``.
        sorted_alignments_by_insert: iterable of
            ``(library, BAMfile, working_dir)`` triples.

    Returns:
        ``sorted_alignments_by_insert``, unchanged.
    """
    picard = os.environ.get("PICARD_HOME") or ""
    if not picard and "picard" in global_config["Tools"]:
        picard = global_config["Tools"]["picard"]["bin"]
    # The command runs without a shell, so "$TMPDIR" must be expanded here.
    # expandvars leaves the text untouched when TMPDIR is not defined.
    tmp_dir_option = os.path.expandvars("TMP_DIR=$TMPDIR")
    for library, BAMfile, working_dir in sorted_alignments_by_insert:
        os.chdir(working_dir)
        output_header = os.path.basename(BAMfile).split(".bam")[0]
        chart = "{}.collectGcBias.pdf".format(output_header)
        command = [
            "java", "-Xmx16g", "-XX:PermSize=2g", "-jar",
            os.path.join(picard, "CollectGcBiasMetrics.jar"),
            "REFERENCE_SEQUENCE={}".format(sample_config["reference"]),
            "INPUT={}".format(BAMfile),
            "OUTPUT={}.collectGcBias.txt".format(output_header),
            "CHART_OUTPUT={}".format(chart),
            "ASSUME_SORTED=true",
            "VALIDATION_STRINGENCY=LENIENT",
            tmp_dir_option,
        ]
        common.print_command(command)
        # An existing chart marks the step as already done.
        if not os.path.exists(chart) and \
                not common.check_dryrun(sample_config):
            with open("collectGcBias.stdOut", "w") as stdOut, \
                    open("collectGcBias.stdErr", "w") as stdErr:
                returnValue = subprocess.call(command, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                print("problem running collectGCBias")
        # Step up one level as before; this only lands in the starting
        # directory when working_dir is a direct child of it.
        os.chdir("..")
    return sorted_alignments_by_insert
def picard_markDuplicates(global_config, sample_config,
                          sorted_alignments_by_insert):
    """Run Picard MarkDuplicates on every alignment.

    For each ``(library, BAMfile, working_dir)`` entry the function changes
    into ``working_dir``, prints the Picard command and, unless
    ``<bam-prefix>.markDuplicates.txt`` already exists or the run is a dry
    run, executes it with stdout/stderr captured in ``removeDup.stdOut`` /
    ``removeDup.stdErr``. The de-duplicated BAM is written to
    ``<bam-prefix>_noDup.bam``.

    The Picard directory comes from the ``PICARD_HOME`` environment variable
    and falls back to ``global_config["Tools"]["picard"]["bin"]``.

    Args:
        global_config: dict with a ``"Tools"`` mapping.
        sample_config: dict handed to ``common.check_dryrun``.
        sorted_alignments_by_insert: iterable of
            ``(library, BAMfile, working_dir)`` triples.

    Returns:
        ``sorted_alignments_by_insert``, unchanged.
    """
    picard = os.environ.get("PICARD_HOME") or ""
    if not picard and "picard" in global_config["Tools"]:
        picard = global_config["Tools"]["picard"]["bin"]
    # The command runs without a shell, so "$TMPDIR" must be expanded here.
    # expandvars leaves the text untouched when TMPDIR is not defined.
    tmp_dir_option = os.path.expandvars("TMP_DIR=$TMPDIR")
    for library, BAMfile, working_dir in sorted_alignments_by_insert:
        os.chdir(working_dir)
        output_header = os.path.basename(BAMfile).split(".bam")[0]
        metrics = "{}.markDuplicates.txt".format(output_header)
        command = [
            "java", "-Xmx16g", "-XX:PermSize=3g", "-jar",
            os.path.join(picard, "MarkDuplicates.jar"),
            "INPUT={}".format(BAMfile),
            "OUTPUT={}_noDup.bam".format(output_header),
            "METRICS_FILE={}".format(metrics),
            "ASSUME_SORTED=true",
            "VALIDATION_STRINGENCY=LENIENT",
            tmp_dir_option,
        ]
        common.print_command(command)
        # An existing metrics file marks the step as already done.
        if not os.path.exists(metrics) and \
                not common.check_dryrun(sample_config):
            with open("removeDup.stdOut", "w") as stdOut, \
                    open("removeDup.stdErr", "w") as stdErr:
                returnValue = subprocess.call(command, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                print("problem running MarkDuplicates")
        # Step up one level as before; this only lands in the starting
        # directory when working_dir is a direct child of it.
        os.chdir("..")
    return sorted_alignments_by_insert
def _run_BUSCO(global_config, sample_config, sorted_alignments_by_insert):
    """Run BUSCO on the reference inside a ``BUSCO`` sub-directory.

    The ``BUSCO`` folder is created below the current directory if needed
    and BUSCO is executed from there. The run is skipped on dry runs and
    when ``run_<output>/short_summary_<output>`` already exists. The
    caller's working directory is restored on every exit path, including
    errors.

    Args:
        global_config: dict providing ``["Tools"]["BUSCO"]["bin"]`` and
            ``["Tools"]["BUSCO"]["options"]`` (extra command-line items).
        sample_config: dict providing ``"BUSCODataPath"``, ``"reference"``
            and ``"output"``; ``"threads"`` defaults to 16.
        sorted_alignments_by_insert: unused.

    Raises:
        IOError: if ``BUSCODataPath`` does not exist.
        SystemExit: if BUSCO returns a non-zero exit code.
    """
    program = global_config["Tools"]["BUSCO"]["bin"]
    options = global_config["Tools"]["BUSCO"]["options"]
    main_dir = os.getcwd()
    BUSCOfolder = os.path.join(main_dir, "BUSCO")
    if not os.path.exists(BUSCOfolder):
        os.makedirs(BUSCOfolder)
    os.chdir(BUSCOfolder)
    try:
        # Checked from inside the BUSCO folder on purpose: that is where
        # BUSCO itself resolves the path.
        BUSCO_data_path = sample_config["BUSCODataPath"]
        if not os.path.exists(BUSCO_data_path):
            raise IOError("Path to the BUSCO data set does not exist!")
        reference = sample_config["reference"]
        output = sample_config["output"]
        threads = sample_config.get("threads", 16)
        command = [program, "-l", BUSCO_data_path,
                   "-in", "{}".format(reference),
                   "-o", "{}".format(output),
                   "-c", "{}".format(threads)]
        command.extend(options)
        common.print_command(command)
        outfile = os.path.join(BUSCOfolder, "run_{}".format(output),
                               "short_summary_{}".format(output))
        if not common.check_dryrun(sample_config) and \
                not os.path.exists(outfile):
            with open("BUSCO.stdOut", "a") as stdOut, \
                    open("BUSCO.stdErr", "a") as stdErr:
                return_value = subprocess.call(command, stdout=stdOut,
                                               stderr=stdErr)
            if return_value != 0:
                sys.exit("Error running BUSCO")
    finally:
        # Restore the caller's directory even when an error is raised.
        os.chdir(main_dir)
def _run_fastqc(global_config, sample_config, sorted_libraries_by_insert):
    """Run FastQC once per library, logging into a ``fastqc`` folder.

    The ``fastqc`` folder is created below the current directory if needed.
    Every command is printed and appended to ``sample_config["commands"]``.
    It is executed unless the run is a dry run or
    ``fastqc/<read1 name without .fastq.gz>_fastqc.zip`` already exists.

    Args:
        global_config: dict providing ``["Tools"]["fastqc"]["bin"]`` and
            ``["Tools"]["fastqc"]["options"]``.
        sample_config: dict with a ``"commands"`` string; receives the
            ``"fastqc"`` key (path of the folder).
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            where ``libraryInfo`` has ``"pair1"`` and ``"pair2"`` (the
            latter may be ``None``).

    Returns:
        ``sample_config``.
    """
    FastqcFolder = os.path.join(os.getcwd(), "fastqc")
    if not os.path.exists(FastqcFolder):
        os.makedirs(FastqcFolder)
    program = global_config["Tools"]["fastqc"]["bin"]
    program_options = global_config["Tools"]["fastqc"]["options"]
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        command = [program] + list(program_options) + [read1]
        if read2 is not None:
            command.append(read2)
        common.print_command(command)
        sample_config["commands"] += "\n" + common.get_command_str(command)
        folder_output_name = os.path.join(
            FastqcFolder, os.path.basename(read1).split(".fastq.gz")[0])
        if not common.check_dryrun(sample_config) and \
                not os.path.exists(
                    "{}_fastqc.zip".format(folder_output_name)):
            log_prefix = os.path.join(FastqcFolder,
                                      "{}_fastqc".format(library))
            # Context managers make sure the per-library logs are closed.
            with open(log_prefix + ".stdout", "a") as fastq_stdOut, \
                    open(log_prefix + ".stderr", "a") as fastq_stdErr:
                subprocess.call(command, stdout=fastq_stdOut,
                                stderr=fastq_stdErr)
    sample_config["fastqc"] = FastqcFolder
    return sample_config
def picard_collectInsertSizeMetrics(global_config, sample_config,
                                    sorted_alignments_by_insert):
    """Run Picard CollectInsertSizeMetrics on every alignment.

    For each ``(library, BAMfile, working_dir)`` entry the function changes
    into ``working_dir``, prints the Picard command and, unless
    ``<bam-prefix>.collectInsertSize.pdf`` already exists or the run is a
    dry run, executes it with stdout/stderr captured in
    ``collectInsertSize.stdOut`` / ``collectInsertSize.stdErr``.
    ``HISTOGRAM_WIDTH`` is set to ``library * 2``.

    The Picard directory comes from the ``PICARD_HOME`` environment variable
    and falls back to ``global_config["Tools"]["picard"]["bin"]``.

    Args:
        global_config: dict with a ``"Tools"`` mapping.
        sample_config: dict handed to ``common.check_dryrun``.
        sorted_alignments_by_insert: iterable of
            ``(library, BAMfile, working_dir)`` triples; ``library`` is
            presumably the numeric insert size -- confirm with the caller.

    Returns:
        ``sorted_alignments_by_insert``, unchanged.
    """
    picard = os.environ.get("PICARD_HOME") or ""
    if not picard and "picard" in global_config["Tools"]:
        picard = global_config["Tools"]["picard"]["bin"]
    # The command runs without a shell, so "$TMPDIR" must be expanded here.
    # expandvars leaves the text untouched when TMPDIR is not defined.
    tmp_dir_option = os.path.expandvars("TMP_DIR=$TMPDIR")
    for library, BAMfile, working_dir in sorted_alignments_by_insert:
        os.chdir(working_dir)
        output_header = os.path.basename(BAMfile).split(".bam")[0]
        histogram = "{}.collectInsertSize.pdf".format(output_header)
        histWide = library * 2
        command = [
            "java", "-Xmx16g", "-XX:PermSize=2g", "-jar",
            os.path.join(picard, "CollectInsertSizeMetrics.jar"),
            "INPUT={}".format(BAMfile),
            "MINIMUM_PCT=0",
            "HISTOGRAM_FILE={}".format(histogram),
            "OUTPUT={}.collectInsertSize.txt".format(output_header),
            "HISTOGRAM_WIDTH={}".format(histWide),
            "VALIDATION_STRINGENCY=LENIENT",
            tmp_dir_option,
        ]
        common.print_command(command)
        # An existing histogram marks the step as already done.
        if not os.path.exists(histogram) and \
                not common.check_dryrun(sample_config):
            with open("collectInsertSize.stdOut", "w") as stdOut, \
                    open("collectInsertSize.stdErr", "w") as stdErr:
                returnValue = subprocess.call(command, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                print("problem running CollectInsertSizeMetrics")
        # Step up one level as before; this only lands in the starting
        # directory when working_dir is a direct child of it.
        os.chdir("..")
    return sorted_alignments_by_insert
def _run_BUSCO(global_config, sample_config, sorted_alignments_by_insert):
    """Run BUSCO on the reference inside a ``BUSCO`` sub-directory.

    The ``BUSCO`` folder is created below the current directory if needed
    and BUSCO is executed from there. The run is skipped on dry runs and
    when ``run_<output>/short_summary_<output>`` already exists. The
    caller's working directory is restored on every exit path, including
    errors.

    Args:
        global_config: dict providing ``["Tools"]["BUSCO"]["bin"]`` and
            ``["Tools"]["BUSCO"]["options"]`` (extra command-line items).
        sample_config: dict providing ``"BUSCODataPath"``, ``"reference"``
            and ``"output"``; ``"threads"`` defaults to 16.
        sorted_alignments_by_insert: unused.

    Raises:
        IOError: if ``BUSCODataPath`` does not exist.
        SystemExit: if BUSCO returns a non-zero exit code.
    """
    program = global_config["Tools"]["BUSCO"]["bin"]
    options = global_config["Tools"]["BUSCO"]["options"]
    main_dir = os.getcwd()
    BUSCOfolder = os.path.join(main_dir, "BUSCO")
    if not os.path.exists(BUSCOfolder):
        os.makedirs(BUSCOfolder)
    os.chdir(BUSCOfolder)
    try:
        # Checked from inside the BUSCO folder on purpose: that is where
        # BUSCO itself resolves the path.
        BUSCO_data_path = sample_config["BUSCODataPath"]
        if not os.path.exists(BUSCO_data_path):
            raise IOError("Path to the BUSCO data set does not exist!")
        reference = sample_config["reference"]
        output = sample_config["output"]
        threads = sample_config.get("threads", 16)
        command = [program, "-l", BUSCO_data_path,
                   "-in", "{}".format(reference),
                   "-o", "{}".format(output),
                   "-c", "{}".format(threads)]
        command.extend(options)
        common.print_command(command)
        outfile = os.path.join(BUSCOfolder, "run_{}".format(output),
                               "short_summary_{}".format(output))
        if not common.check_dryrun(sample_config) and \
                not os.path.exists(outfile):
            with open("BUSCO.stdOut", "a") as stdOut, \
                    open("BUSCO.stdErr", "a") as stdErr:
                return_value = subprocess.call(command, stdout=stdOut,
                                               stderr=stdErr)
            if return_value != 0:
                sys.exit("Error running BUSCO")
    finally:
        # Restore the caller's directory even when an error is raised.
        os.chdir(main_dir)
def _run_fastqc(global_config, sample_config, sorted_libraries_by_insert):
    """Run FastQC once per library, logging into a ``fastqc`` folder.

    The ``fastqc`` folder is created below the current directory if needed.
    Every command is printed and appended to ``sample_config["commands"]``.
    It is executed unless the run is a dry run or
    ``fastqc/<read1 name without .fastq.gz>_fastqc.zip`` already exists.

    Args:
        global_config: dict providing ``["Tools"]["fastqc"]["bin"]`` and
            ``["Tools"]["fastqc"]["options"]``.
        sample_config: dict with a ``"commands"`` string; receives the
            ``"fastqc"`` key (path of the folder).
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            where ``libraryInfo`` has ``"pair1"`` and ``"pair2"`` (the
            latter may be ``None``).

    Returns:
        ``sample_config``.
    """
    FastqcFolder = os.path.join(os.getcwd(), "fastqc")
    if not os.path.exists(FastqcFolder):
        os.makedirs(FastqcFolder)
    program = global_config["Tools"]["fastqc"]["bin"]
    program_options = global_config["Tools"]["fastqc"]["options"]
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        command = [program] + list(program_options) + [read1]
        if read2 is not None:
            command.append(read2)
        common.print_command(command)
        sample_config["commands"] += "\n" + common.get_command_str(command)
        folder_output_name = os.path.join(
            FastqcFolder, os.path.basename(read1).split(".fastq.gz")[0])
        if not common.check_dryrun(sample_config) and \
                not os.path.exists(
                    "{}_fastqc.zip".format(folder_output_name)):
            log_prefix = os.path.join(FastqcFolder,
                                      "{}_fastqc".format(library))
            # Context managers make sure the per-library logs are closed.
            with open(log_prefix + ".stdout", "a") as fastq_stdOut, \
                    open(log_prefix + ".stderr", "a") as fastq_stdErr:
                subprocess.call(command, stdout=fastq_stdOut,
                                stderr=fastq_stdErr)
    sample_config["fastqc"] = FastqcFolder
    return sample_config
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    """Build a k-mer coverage histogram with ABYSS-P and plot it.

    ABYSS-P (looked up next to the configured abyss binary) is launched
    through ``mpirun`` inside an ``abyss_kmer`` folder created below the
    current directory. The command is printed and appended to
    ``sample_config["commands"]``. It is executed unless the run is a dry
    run or ``histogram.hist`` already exists. On success the temporary
    ``preUnitgs.fa`` is removed and the coverage plots are generated.

    Args:
        global_config: dict providing ``["Tools"]["abyss"]["bin"]``.
        sample_config: dict providing ``"kmer"`` and ``"commands"``;
            ``"threads"`` defaults to 16. Its ``"abyss"`` key is set to the
            ``abyss_kmer`` folder path.
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            with ``"pair1"``, ``"pair2"`` and ``"orientation"``.

    Returns:
        ``sample_config``.

    Raises:
        SystemExit: if ``"kmer"`` is missing from ``sample_config``.
    """
    if "kmer" not in sample_config:
        sys.exit("error in _run_abyss QCcontrol: kmer must be present in "
                 "sample_config.yaml")
    kmer = sample_config["kmer"]
    mainDir = os.getcwd()
    ABySS_Kmer_Folder = os.path.join(mainDir, "abyss_kmer")
    if not os.path.exists(ABySS_Kmer_Folder):
        os.makedirs(ABySS_Kmer_Folder)
    os.chdir(ABySS_Kmer_Folder)
    try:
        program = global_config["Tools"]["abyss"]["bin"]
        program = os.path.join(os.path.dirname(program), "ABYSS-P")
        threads = sample_config.get("threads", 16)  # default for UPPMAX
        command = "mpirun -np {} {} ".format(threads, program)
        command += "-k {} ".format(kmer)
        command += "--coverage-hist=histogram.hist -o preUnitgs.fa"
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            if orientation in ("innie", "outtie"):
                command += " {} ".format(read1)
                if read2 is not None:
                    command += " {} ".format(read2)
            elif orientation == "none":
                command += " {} ".format(read1)
        common.print_command(command)
        sample_config["commands"] += "\n" + common.get_command_str(command)
        if not common.check_dryrun(sample_config) and \
                not os.path.exists("histogram.hist"):
            with open("ABySS_Kmer_Folder.stdOut", "a") as ABySS_Kmer_stdOut, \
                    open("ABySS_Kmer_Folder.stdErr", "a") as ABySS_Kmer_stdErr:
                returnValue = subprocess.call(
                    command, shell=True,
                    stdout=ABySS_Kmer_stdOut, stderr=ABySS_Kmer_stdErr)
            # A negative code means the process was killed by a signal;
            # that is a failure too, so compare against zero.
            if returnValue != 0:
                print("ABySS kmer plotting failed: unkwnown reason")
            else:
                subprocess.call(("rm", "preUnitgs.fa"))
                for low, high in ((1, 200), (1, 500), (15, 200), (15, 500)):
                    _plotKmerFixed(
                        low, high, kmer,
                        "kmer_coverage_{}_{}.png".format(low, high))
                _plotKmer(kmer, "kmer_coverage.png")
    finally:
        os.chdir(mainDir)
    sample_config["abyss"] = ABySS_Kmer_Folder
    return sample_config
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    """Build a k-mer coverage histogram with ABYSS-P and plot it.

    ABYSS-P (looked up next to the configured abyss binary) is launched
    through ``mpirun`` inside an ``abyss_kmer`` folder created below the
    current directory. The command is printed and appended to
    ``sample_config["commands"]``. It is executed unless the run is a dry
    run or ``histogram.hist`` already exists. On success the temporary
    ``preUnitgs.fa`` is removed and the coverage plots are generated.

    Args:
        global_config: dict providing ``["Tools"]["abyss"]["bin"]``.
        sample_config: dict providing ``"kmer"`` and ``"commands"``;
            ``"threads"`` defaults to 16. Its ``"abyss"`` key is set to the
            ``abyss_kmer`` folder path.
        sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
            with ``"pair1"``, ``"pair2"`` and ``"orientation"``.

    Returns:
        ``sample_config``.

    Raises:
        SystemExit: if ``"kmer"`` is missing from ``sample_config``.
    """
    if "kmer" not in sample_config:
        sys.exit("error in _run_abyss QCcontrol: kmer must be present in "
                 "sample_config.yaml")
    kmer = sample_config["kmer"]
    mainDir = os.getcwd()
    ABySS_Kmer_Folder = os.path.join(mainDir, "abyss_kmer")
    if not os.path.exists(ABySS_Kmer_Folder):
        os.makedirs(ABySS_Kmer_Folder)
    os.chdir(ABySS_Kmer_Folder)
    try:
        program = global_config["Tools"]["abyss"]["bin"]
        program = os.path.join(os.path.dirname(program), "ABYSS-P")
        threads = sample_config.get("threads", 16)  # default for UPPMAX
        command = "mpirun -np {} {} ".format(threads, program)
        command += "-k {} ".format(kmer)
        command += "--coverage-hist=histogram.hist -o preUnitgs.fa"
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            if orientation in ("innie", "outtie"):
                command += " {} ".format(read1)
                if read2 is not None:
                    command += " {} ".format(read2)
            elif orientation == "none":
                command += " {} ".format(read1)
        common.print_command(command)
        sample_config["commands"] += "\n" + common.get_command_str(command)
        if not common.check_dryrun(sample_config) and \
                not os.path.exists("histogram.hist"):
            with open("ABySS_Kmer_Folder.stdOut", "a") as ABySS_Kmer_stdOut, \
                    open("ABySS_Kmer_Folder.stdErr", "a") as ABySS_Kmer_stdErr:
                returnValue = subprocess.call(
                    command, shell=True,
                    stdout=ABySS_Kmer_stdOut, stderr=ABySS_Kmer_stdErr)
            # A negative code means the process was killed by a signal;
            # that is a failure too, so compare against zero.
            if returnValue != 0:
                print("ABySS kmer plotting failed: unkwnown reason")
            else:
                subprocess.call(("rm", "preUnitgs.fa"))
                for low, high in ((1, 200), (1, 500), (15, 200), (15, 500)):
                    _plotKmerFixed(
                        low, high, kmer,
                        "kmer_coverage_{}_{}.png".format(low, high))
                _plotKmer(kmer, "kmer_coverage.png")
    finally:
        os.chdir(mainDir)
    sample_config["abyss"] = ABySS_Kmer_Folder
    return sample_config
def build_reference_bwa(global_config, sample_config):
    """Make sure a BWA index exists for the reference and return its path.

    If ``<reference>.bwt`` already sits next to the reference, the absolute
    reference path is returned as is. Otherwise a ``bwa`` folder is created
    beside the reference, the reference is soft-linked into it and
    ``bwa index`` is run on the link (the command is only printed on dry
    runs). The caller's working directory is restored on every exit path.

    Args:
        global_config: dict with a ``"Tools"`` mapping; the bwa binary is
            ``["Tools"]["bwa"]["bin"]`` when present, else ``bwa`` from PATH.
        sample_config: dict providing ``"reference"``; also handed to
            ``common.check_dryrun``.

    Returns:
        Absolute path of the reference to align against: either the
        original file or its soft link inside the ``bwa`` folder.

    Raises:
        SystemExit: if bwa is unavailable, the reference is missing, the
            soft link cannot be created, indexing fails, or no ``.bwt`` file
            exists afterwards (this includes dry runs on an unindexed
            reference, as before).
    """
    reference = sample_config["reference"]
    program = "bwa"
    if "bwa" in global_config["Tools"]:
        program = global_config["Tools"]["bwa"]["bin"]
    elif not common.which("bwa"):
        sys.exit("error while trying to run bwa index: bwa not present in "
                 "the path and not in global config, please make sure to "
                 "install bwa properly")
    reference = os.path.abspath(reference)
    path_name, base_name = os.path.split(reference)
    # The previous os.path.join(base_name, "bwa", "<abs reference>.bwt")
    # always collapsed to "<reference>.bwt" because the last component is
    # absolute; spell that check out directly.
    if os.path.exists("{}.bwt".format(reference)):
        return reference
    # Otherwise the index is built locally in a "bwa" folder.
    if not os.path.exists(reference):
        sys.exit("error, reference file {} does not exists".format(reference))
    current_dir = os.getcwd()
    bwa_index_folder = os.path.join(path_name, "bwa")
    if not os.path.exists(bwa_index_folder):
        os.makedirs(bwa_index_folder)
    os.chdir(bwa_index_folder)
    try:
        if not os.path.exists(base_name):
            # lexists is true for a dangling link: drop it before re-linking.
            if os.path.lexists(base_name):
                os.remove(base_name)
            try:
                os.symlink(reference, base_name)
            except OSError:
                sys.exit("error while trying to soft link reference sequence")
        reference = os.path.join(path_name, "bwa", base_name)
        index_file = "{}.bwt".format(reference)
        if not os.path.exists(index_file):
            command = [program, "index", reference]
            common.print_command(command)
            if not common.check_dryrun(sample_config):
                with open("bwa_index.stdOut", "w") as bwa_stdOut, \
                        open("bwa_index.stdErr", "w") as bwa_stdErr:
                    returnValue = subprocess.call(command, stdout=bwa_stdOut,
                                                  stderr=bwa_stdErr)
                if returnValue != 0:
                    sys.exit("error, while indexing reference file {} "
                             "with bwa index".format(reference))
        # Extra control to avoid problems with unexpected return values.
        if not os.path.exists(index_file):
            sys.exit("bwa index failed")
    finally:
        os.chdir(current_dir)
    return reference
def _run_kmergenie(global_config, sample_config, sorted_libraries_by_insert):
    """Run kmergenie to establish a recommended kmer size for assembly.

    Works inside a ``kmergenie`` folder created below the current
    directory. The command is printed and appended to
    ``sample_config["commands"]``. Unless the run is a dry run, the list of
    input fastq files is written to ``<output>kmerinput.txt``, kmergenie is
    executed with stdout/stderr captured in ``kmergenie.stdOut`` /
    ``kmergenie.stdErr``, and ``histograms.dat`` is plotted on success.

    Args:
        global_config: dict providing ``["Tools"]["kmergenie"]["bin"]`` and
            ``["Tools"]["kmergenie"]["options"]``.
        sample_config: dict with ``"commands"``; optional ``"output"``,
            ``"threads"`` and ``"kmergenie"`` (options override). Its
            ``"kmergenie"`` key is set to the working folder on return.
        sorted_libraries_by_insert: iterable of ``(library, lib_info)``
            with ``"pair1"`` and ``"pair2"`` (the latter may be ``None``).

    Returns:
        ``sample_config``.
    """
    maindir = os.getcwd()
    kmerdir = os.path.join(maindir, "kmergenie")
    if not os.path.exists(kmerdir):
        os.makedirs(kmerdir)
    os.chdir(kmerdir)
    try:
        # kmergenie takes a file that lists the input fastq files.
        kmer_input = os.path.join(
            kmerdir,
            "{}kmerinput.txt".format(sample_config.get("output", "")))
        program = global_config["Tools"]["kmergenie"]["bin"]
        program_options = global_config["Tools"]["kmergenie"]["options"]
        # Could be useful to add --diploid if sample is highly heterozygous
        if "kmergenie" in sample_config:
            program_options = sample_config["kmergenie"]
        # Kmergenie spawns number_of_cores - 1 threads by default.
        threads = sample_config.get("threads", "")
        cmd_list = [program, kmer_input]
        cmd_list.extend(option for option in program_options if option)
        if threads:
            # Flag and value must be separate argv items: the command is
            # executed without a shell, so "-t 16" would be one argument.
            cmd_list.extend(["-t", "{}".format(threads)])
        command = " ".join(cmd_list)
        common.print_command(command)
        sample_config["commands"] += "\n" + common.get_command_str(command)
        if not common.check_dryrun(sample_config):
            with open(kmer_input, "w") as f:
                for lib, lib_info in sorted_libraries_by_insert:
                    f.write(lib_info["pair1"] + "\n")
                    # Single-end libraries have no second read file.
                    if lib_info["pair2"] is not None:
                        f.write(lib_info["pair2"] + "\n")
            with open("kmergenie.stdOut", "w") as stdOut, \
                    open("kmergenie.stdErr", "w") as stdErr:
                returnValue = subprocess.call(cmd_list, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                print("error while running command: {}".format(command))
            else:
                _kmergenie_plot("histograms.dat")
    finally:
        os.chdir(maindir)
    sample_config["kmergenie"] = kmerdir
    return sample_config
def _run_FRC(global_config, sample_config, sorted_libraries_by_insert):
    """Compute the FRCurve of an assembly from its alignments.

    Works inside an ``FRCurve`` folder created below the current directory.
    The first alignment is passed as the paired-end library and the second,
    if present, as the mate-pair library. The command is printed. It is
    executed, and the curve plotted, unless the run is a dry run or
    ``<output>_FRC.png`` already exists.

    Args:
        global_config: dict providing ``["Tools"]["FRC"]["bin"]``.
        sample_config: dict providing ``"genomeSize"``, ``"output"`` and
            ``"alignments"`` (sequence whose items hold the BAM path at
            index 1).
        sorted_libraries_by_insert: unused.

    Returns:
        ``sample_config``.

    Raises:
        SystemExit: if the FRC executable returns a non-zero exit code.
    """
    mainDir = os.getcwd()
    FRCurveFolder = os.path.join(mainDir, "FRCurve")
    if not os.path.exists(FRCurveFolder):
        os.makedirs(FRCurveFolder)
    os.chdir(FRCurveFolder)
    try:
        program = global_config["Tools"]["FRC"]["bin"]
        genomeSize = sample_config["genomeSize"]
        output = sample_config["output"]
        alignments = sample_config["alignments"]
        # The max-insert limits are fixed values; the insert-size ranges
        # that used to be computed here were never passed to the tool.
        command = [program, "--pe-sam", alignments[0][1],
                   "--pe-max-insert", "5000"]
        if len(alignments) > 1:
            command += ["--mp-sam", alignments[1][1],
                        "--mp-max-insert", "25000"]
        command += ["--genome-size", "{}".format(genomeSize),
                    "--output", output]
        common.print_command(command)
        if not common.check_dryrun(sample_config) and \
                not os.path.exists("{}_FRC.png".format(output)):
            with open("FRC.stdOut", "a") as stdOut, \
                    open("FRC.stdErr", "a") as stdErr:
                returnValue = subprocess.call(command, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                sys.exit("error, while running FRCurve: {}".format(command))
            plotFRCurve(output)
    finally:
        os.chdir(mainDir)
    return sample_config
def _run_FRC(global_config, sample_config, sorted_libraries_by_insert):
    """Compute the FRCurve of an assembly from its alignments.

    Works inside an ``FRCurve`` folder created below the current directory.
    The first alignment is passed as the paired-end library and the second,
    if present, as the mate-pair library. The command is printed. It is
    executed, and the curve plotted, unless the run is a dry run or
    ``<output>_FRC.png`` already exists.

    Args:
        global_config: dict providing ``["Tools"]["FRC"]["bin"]``.
        sample_config: dict providing ``"genomeSize"``, ``"output"`` and
            ``"alignments"`` (sequence whose items hold the BAM path at
            index 1).
        sorted_libraries_by_insert: unused.

    Returns:
        ``sample_config``.

    Raises:
        SystemExit: if the FRC executable returns a non-zero exit code.
    """
    mainDir = os.getcwd()
    FRCurveFolder = os.path.join(mainDir, "FRCurve")
    if not os.path.exists(FRCurveFolder):
        os.makedirs(FRCurveFolder)
    os.chdir(FRCurveFolder)
    try:
        program = global_config["Tools"]["FRC"]["bin"]
        genomeSize = sample_config["genomeSize"]
        output = sample_config["output"]
        alignments = sample_config["alignments"]
        # The max-insert limits are fixed values; the insert-size ranges
        # that used to be computed here were never passed to the tool.
        command = [program, "--pe-sam", alignments[0][1],
                   "--pe-max-insert", "5000"]
        if len(alignments) > 1:
            command += ["--mp-sam", alignments[1][1],
                        "--mp-max-insert", "25000"]
        command += ["--genome-size", "{}".format(genomeSize),
                    "--output", output]
        common.print_command(command)
        if not common.check_dryrun(sample_config) and \
                not os.path.exists("{}_FRC.png".format(output)):
            with open("FRC.stdOut", "a") as stdOut, \
                    open("FRC.stdErr", "a") as stdErr:
                returnValue = subprocess.call(command, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                sys.exit("error, while running FRCurve: {}".format(command))
            plotFRCurve(output)
    finally:
        os.chdir(mainDir)
    return sample_config
def _allpaths_wildcard_name(fqfile):
    """Return the read-1 file name with the mate number wildcarded.

    ``None`` is returned when the name follows neither the ``_1.fastq`` nor
    the ``_R1_`` convention.
    """
    if "_1.fastq" in fqfile:
        return fqfile.replace("_1.fastq", "_?.fastq")
    if "_R1_" in fqfile:
        return fqfile.replace("_R1_", "_R?_")
    return None


def _run_allpaths(global_config, sample_config, sorted_libraries_by_insert):
    """Prepare the ALLPATHS-LG inputs and run the assembly.

    Inside the ``allpaths`` folder (set up by ``_prepare_folder_structure``)
    the function writes ``in_groups.csv`` and ``in_libs.csv``, creates
    ``data_dir``, runs ``PrepareAllPathsInputs.pl`` and then
    ``RunAllPathsLG``. On success the final contigs and scaffolds are
    copied to ``<output>.ctg.fasta`` / ``<output>.scf.fasta`` and
    ``data_dir`` is removed unless ``"keep_tmp_files"`` is among
    ``sample_config["flags"]``. On dry runs the two commands are only
    printed. Failures are reported with ``print``. The caller's working
    directory is restored on every exit path.

    Args:
        global_config: dict providing ``["Tools"]["allpaths"]`` (``"bin"``
            directory and ``"options"``; the only accepted option is
            ``PLOIDY=2``) and ``["Tools"]["picard"]["bin"]``.
        sample_config: dict providing ``"output"``; optional ``"flags"``.
        sorted_libraries_by_insert: ignored; the libraries are re-read via
            ``common._sort_libraries_by_insert(sample_config)``.

    Returns:
        ``sample_config``.
    """
    assembler = "allpaths"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure("allpaths", assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)
    try:
        # -- describe the libraries for PrepareAllPathsInputs.pl
        with open("in_groups.csv", "w") as inGroups_file, \
                open("in_libs.csv", "w") as inLibs_file:
            inGroups_file.write("group_name, library_name, file_name\n")
            inLibs_file.write(
                "library_name, project_name, organism_name, type, "
                "paired, frag_size, frag_stddev, insert_size, insert_stddev, "
                "read_orientation,genomic_start, genomic_end\n")
            librariesForInLibs = []
            seen_inserts = set()
            group_name = 1
            for library, libraryInfo in sorted_libraries_by_insert:
                read1 = libraryInfo["pair1"]
                orientation = libraryInfo["orientation"]
                insert = libraryInfo["insert"]
                std = libraryInfo["std"]
                if orientation == "innie":
                    group_prefix = "PE"
                    lib_row = ("lib{}, genome, genome, fragment, 1, "
                               "{}, {}, , , inward, 0, 0\n")
                elif orientation == "outtie":
                    group_prefix = "MP"
                    lib_row = ("lib{}, genome, genome, fragment, 1, "
                               ", , {}, {}, outward, 0, 0\n")
                else:
                    print("all paths support only innies and outties")
                    continue
                path, fqfile = os.path.split(read1)
                wildcard = _allpaths_wildcard_name(fqfile)
                if wildcard is None:
                    print("error file format not supported {}".format(
                        fqfile))
                    return sample_config
                inGroups_file.write("{}{}, lib{}, {}\n".format(
                    group_prefix, group_name, insert,
                    os.path.join(path, wildcard)))
                group_name += 1
                # One in_libs row per distinct insert size.
                if insert not in seen_inserts:
                    seen_inserts.add(insert)
                    librariesForInLibs.append(
                        lib_row.format(insert, insert, std))
            inLibs_file.writelines(librariesForInLibs)

        # -- build both commands
        os.mkdir("data_dir")
        data_dir = os.path.join(assemblyDirectory, "data_dir")
        ploidy = "PLOIDY=1"
        if len(program_options) > 0:
            if len(program_options) > 1 or program_options[0] != "PLOIDY=2":
                print("Running ALlpaths only one parameter accepted as "
                      "option here: PLOIDY=2")
                return sample_config
            ploidy = "PLOIDY=2"
        prepare_command = [
            os.path.join(programBIN, "PrepareAllPathsInputs.pl"),
            "DATA_DIR={}".format(data_dir), ploidy,
            "PICARD_TOOLS_DIR={}".format(
                global_config["Tools"]["picard"]["bin"]),
            "FORCE_PHRED=True", "PHRED_64=False",
            "IN_GROUPS_CSV={}".format(
                os.path.join(assemblyDirectory, "in_groups.csv")),
            "IN_LIBS_CSV={}".format(
                os.path.join(assemblyDirectory, "in_libs.csv")),
        ]
        # Built once so the dry run prints exactly what a real run executes.
        run_command = [
            os.path.join(programBIN, "RunAllPathsLG"),
            "PRE={}".format(assemblyDirectory), "REFERENCE_NAME=.",
            "DATA_SUBDIR=data_dir", "RUN=allpaths", "SUBDIR=run",
            "HAPLOIDIFY=True",
        ]
        if common.check_dryrun(sample_config):
            common.print_command(prepare_command)
            common.print_command(run_command)
            return sample_config

        # -- run ALLPATHS for real
        common.print_command(prepare_command)
        with open("allpaths_PrepareAllPathsInputs.stdOut", "w") as out, \
                open("allpaths_PrepareAllPathsInputs.stdErr", "w") as err:
            returnValue = subprocess.call(prepare_command, stdout=out,
                                          stderr=err)
        if returnValue != 0:
            print("ALLPATHS PrepareAllPathInputs terminated with an error. "
                  "Please check running folder for more informations")
            return sample_config
        common.print_command(run_command)
        with open("allpaths_RunAllPathsLG.stdOut", "w") as out, \
                open("allpaths_RunAllPathsLG.stdErr", "w") as err:
            returnValue = subprocess.call(run_command, stdout=out, stderr=err)
        if returnValue != 0:
            print("ALLPATHS RunAllPathsLG terminated with an error. Please "
                  "check running folder for more informations")
            return sample_config

        # -- save results
        flags = sample_config.get("flags", [])
        assembly_dir = os.path.join("data_dir", "allpaths", "ASSEMBLIES",
                                    "run")
        if not os.path.exists(
                os.path.join(assembly_dir, "final.assembly.fasta")):
            print("something wrong with Allpaths > no contig file generated")
            return sample_config
        copy_codes = [
            subprocess.call([
                "cp", os.path.join(assembly_dir, "final.contigs.fasta"),
                "{}.ctg.fasta".format(outputName)]),
            subprocess.call([
                "cp", os.path.join(assembly_dir, "final.assembly.fasta"),
                "{}.scf.fasta".format(outputName)]),
        ]
        # Judge each copy on its own: summing could cancel a positive and a
        # negative exit code.
        if "keep_tmp_files" not in flags and \
                all(code == 0 for code in copy_codes):
            subprocess.call(["rm", "-r", "data_dir"])
        return sample_config
    finally:
        os.chdir(currentDirectory)
def _qa_append_gc_column(cov_file, reference, gc_file):
    """Copy the qaTools coverage table to ``gc_file`` with a GC column.

    ``cov_file`` is read as a tab-separated table with one header line and
    the contig name in the first column. ``reference`` is the FASTA file
    the GC percentage is computed from, one record at a time, through
    ``computeGC``. Exits when a FASTA record has no row in the table.
    """
    coverage = {}
    with open(cov_file, "r") as QA_csv:
        header = QA_csv.readline().rstrip()
        for line in QA_csv:
            fields = line.strip().split("\t")
            coverage[fields[0]] = [fields[1], fields[2], fields[3]]

    def contig_name(header_line):
        # Keep only the identifier: text up to the first blank, minus ">".
        name = header_line.split(" ")[0].split("\t")[0]
        return name.split(">")[1]

    def write_record(out_fd, name, chunks):
        GC = computeGC("".join(chunks))
        if name not in coverage:
            sys.exit("error while parsing QAcompute output: probably "
                     "some wired contig name is present in your "
                     "assmebly file")
        stats = coverage[name]
        out_fd.write("{}\t{}\t{}\t{}\t{}\n".format(
            name, stats[0], stats[1], stats[2], GC))

    with open(gc_file, "w") as QA_GC_fd:
        QA_GC_fd.write("{}\tGCperc\n".format(header))
        with open(reference, "r") as ref_fd:
            fasta_header = contig_name(ref_fd.readline().strip())
            # Sequence lines are collected and joined once per record
            # rather than concatenated line by line.
            chunks = []
            for line in ref_fd:
                line = line.strip()
                if line.startswith(">"):
                    write_record(QA_GC_fd, fasta_header, chunks)
                    chunks = []
                    fasta_header = contig_name(line)
                else:
                    chunks.append(line)
            write_record(QA_GC_fd, fasta_header, chunks)


def _run_qaTools(global_config, sample_config, sorted_libraries_by_insert):
    """Compute per-contig coverage with qaTools and add the GC content.

    Works inside a ``QAstats`` folder created below the current directory.
    The qaTools command for the first alignment is printed. Unless the run
    is a dry run or ``<bam name>.cov`` already exists, it is executed, the
    coverage table is extended with a GC column into ``<bam name>.cov.gc``
    and that file is plotted.

    Args:
        global_config: dict providing ``["Tools"]["qaTools"]["bin"]``.
        sample_config: dict providing ``"genomeSize"``, ``"reference"``,
            ``"output"`` and ``"alignments"`` (the first item holds the BAM
            path at index 1).
        sorted_libraries_by_insert: unused.

    Returns:
        ``sample_config``.

    Raises:
        SystemExit: if qaTools fails or a contig of the reference is absent
            from its output.
    """
    mainDir = os.getcwd()
    qaToolsFolder = os.path.join(mainDir, "QAstats")
    if not os.path.exists(qaToolsFolder):
        os.makedirs(qaToolsFolder)
    os.chdir(qaToolsFolder)
    try:
        program = global_config["Tools"]["qaTools"]["bin"]
        # Read although unused so that a missing key still fails early.
        genomeSize = sample_config["genomeSize"]
        reference = sample_config["reference"]
        output = sample_config["output"]
        BAMfile = sample_config["alignments"][0][1]
        cov_file = "{}.cov".format(os.path.basename(BAMfile))
        command = ["{}".format(program), "-m", "-q", "0", "-i", BAMfile,
                   cov_file]
        common.print_command(command)
        if not common.check_dryrun(sample_config) and \
                not os.path.exists(cov_file):
            with open("QAtools.stdOut", "a") as stdOut, \
                    open("QAtools.stdErr", "a") as stdErr:
                returnValue = subprocess.call(command, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                sys.exit("error, while running QAtools: {}".format(command))
            # Now add the GC content.
            QA_GC_file = "{}.cov.gc".format(os.path.basename(BAMfile))
            _qa_append_gc_column(cov_file, reference, QA_GC_file)
            plotQA(QA_GC_file)
    finally:
        os.chdir(mainDir)
    return sample_config
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample with ``abyss-pe``.

    Inside the ``abyss`` folder (set up by ``_prepare_folder_structure``)
    the ``abyss-pe`` command line is assembled from the libraries: reads
    without a mate go to ``se``, paired ``innie``/``none`` libraries to
    ``lib`` and every other orientation to ``mp``, grouped by insert size.
    The command is printed. Unless the run is a dry run it is executed in a
    ``runABySS`` sub-folder with logs in ``abyss.stdOut`` /
    ``abyss.stdErr``. On success contigs and scaffolds are copied to
    ``<output>.ctg.fasta`` / ``<output>.scf.fasta`` and ``runABySS`` is
    removed unless ``"keep_tmp_files"`` is among ``sample_config["flags"]``.
    The caller's working directory is restored on every exit path.

    Args:
        global_config: dict providing ``["Tools"]["abyss"]["bin"]``, the
            directory that contains ``abyss-pe``.
        sample_config: dict providing ``"output"``; optional ``"threads"``
            (default 8), ``"kmer"`` (default 54) and ``"flags"``.
        sorted_libraries_by_insert: ignored; the libraries are re-read via
            ``common._sort_libraries_by_insert(sample_config)``.

    Returns:
        ``sample_config``.
    """
    assembler = "abyss"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure("abyss", assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)
    try:
        # -- assemble the abyss-pe command line
        command = "{} ".format(os.path.join(programBIN, "abyss-pe"))
        # 8 threads is the default for UPPMAX
        command += "np={} ".format(sample_config.get("threads", 8))
        command += "k={} ".format(sample_config.get("kmer", 54))
        se_reads = []
        pe_libraries = {}
        mp_libraries = {}
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            insert = libraryInfo["insert"]
            if orientation == "innie" or orientation == "none":
                if read2 is None:
                    se_reads.append(read1)
                else:
                    # lib name is the insert size
                    pe_libraries[insert] = pe_libraries.get(insert, "") + \
                        "{} {} ".format(read1, read2)
            else:
                libName = format(insert)
                mp_libraries[libName] = mp_libraries.get(libName, "") + \
                    "{} {} ".format(read1, read2)
        command += "name={} ".format(outputName)
        # Several single-end files must be blank-separated inside se='...'.
        librariesSE = ""
        if se_reads:
            librariesSE = "se='{}'".format(" ".join(se_reads))
        librariesPE = ""
        if pe_libraries:
            lib = "lib='"
            for libPE, libPEreads in sorted(pe_libraries.items()):
                lib += "lib{} ".format(libPE)
                librariesPE += " lib{}='{}' ".format(libPE, libPEreads)
            lib += "' "
            command += "{} ".format(lib)
        librariesMP = ""
        if mp_libraries:
            mp = "mp='"
            for libMP, libMPreads in sorted(mp_libraries.items()):
                mp += "lib{} ".format(libMP)
                librariesMP += " lib{}='{}' ".format(libMP, libMPreads)
            mp += "' "
            command += "{} ".format(mp)
        command += "{} ".format(librariesSE)
        command += "{} ".format(librariesPE)
        command += "{} ".format(librariesMP)
        common.print_command(command)
        if common.check_dryrun(sample_config):
            return sample_config

        # -- run the assembler; logs stay in the assembly folder
        run_dir = os.path.join(assemblyDirectory, "runABySS")
        os.makedirs(run_dir)
        with open("abyss.stdOut", "a") as assembler_stdOut, \
                open("abyss.stdErr", "a") as assembler_stdErr:
            returnValue = subprocess.call(
                command, stdout=assembler_stdOut, stderr=assembler_stdErr,
                shell=True, cwd=run_dir)
        if returnValue != 0:
            print("ABySS terminated with an error. Please check running "
                  "folder for more informations")
            return sample_config

        # -- save results
        flags = sample_config.get("flags", [])
        contigs = os.path.join("runABySS",
                               "{}-contigs.fa".format(outputName))
        if not os.path.exists(contigs):
            print("something wrong with ABySS -> no contig file generated")
            return sample_config
        subprocess.call(["cp", contigs, "{}.ctg.fasta".format(outputName)])
        subprocess.call([
            "cp",
            os.path.join("runABySS", "{}-scaffolds.fa".format(outputName)),
            "{}.scf.fasta".format(outputName)])
        if "keep_tmp_files" not in flags:
            subprocess.call(["rm", "-r", "runABySS"])
        return sample_config
    finally:
        os.chdir(currentDirectory)
def _run_spades(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample with SPAdes.

    The command line is built from the configured options and from the
    libraries returned by ``common._sort_libraries_by_insert(sample_config)``
    and executed inside ``<cwd>/spades``. When the run succeeds
    ``contigs.fasta`` and ``scaffolds.fasta`` are copied to
    ``<output>.ctg.fasta`` and ``<output>.scf.fasta``; the SPAdes output
    folder is then removed unless the ``keep_tmp_files`` flag is set.

    :param global_config: global configuration; ``["Tools"]["spades"]``
        provides ``bin`` (the executable) and ``options`` (list of extra
        command line arguments).
    :param sample_config: sample configuration; ``output`` is required,
        ``flags`` is optional.
    :param sorted_libraries_by_insert: ignored; the libraries are always
        re-read from ``sample_config``.
    :returns: ``sample_config``. Once the assembly folder has been entered,
        every exit path goes back to the directory the call started from.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "spades"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)

    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    command = "{} ".format(programBIN)
    for option in program_options:
        command += "{} ".format(option)

    # creates the command on-the-fly; paired-end and mate-pair libraries are
    # numbered independently
    peLibrary = 1
    mpLibrary = 1
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        if orientation == "innie" or orientation == "none":
            if read2 is None:
                command += "--pe{}-s {} ".format(peLibrary, read1)
            else:
                command += "--pe{}-1 {} --pe{}-2 {} ".format(
                    peLibrary, read1, peLibrary, read2)
            peLibrary += 1
        elif orientation == "outtie":
            command += "--mp{}-1 {} --mp{}-2 {} ".format(
                mpLibrary, read1, mpLibrary, read2)
            mpLibrary += 1
        else:
            # format the whole message: formatting only the last fragment
            # left the placeholder in the printed text
            print("orientation {} not supported.... why the program did not "
                  "failed earlier?".format(orientation))
    command += "-o {} ".format(outputName)

    common.print_command(command)
    if common.check_dryrun(sample_config):
        os.chdir(currentDirectory)
        return sample_config

    with open("spades.stdOut", "a") as assembler_stdOut, \
            open("spades.stdErr", "a") as assembler_stdErr:
        returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                      stderr=assembler_stdErr, shell=True)

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if os.path.exists(os.path.join(outputName, "contigs.fasta")):
            subprocess.call(["cp",
                             os.path.join(outputName, "contigs.fasta"),
                             "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp",
                             os.path.join(outputName, "scaffolds.fasta"),
                             "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", outputName])
        else:
            print("something wrong with SPADES -> no contig file generated")
    else:
        print("SPADES terminated with an error. Please check running folder",
              "for more informations")
    os.chdir(currentDirectory)
    return sample_config
def align_bwa_mem(global_config, read1, read2, reference, threads, dryrun):
    """Align reads with ``bwa mem`` and return the path of the sorted BAM.

    The work is done in a folder named after the library (base name of
    ``read1`` without ``_1.fastq`` for pairs, without ``.fastq`` for single
    reads) created in the current directory. Steps whose output file already
    exists are skipped.

    :param global_config: global configuration; ``["Tools"]["bwa"]["bin"]``
        and ``["Tools"]["samtools"]["bin"]`` are used when present, otherwise
        the tools must be found in the path.
    :param read1: first (or only) fastq file.
    :param read2: second fastq file, or ``None`` for single-end reads.
    :param reference: fasta file the reads are aligned to.
    :param threads: number of threads given to bwa and samtools.
    :param dryrun: when true the commands are printed but not executed.
    :returns: absolute path of ``<library>_to_<reference>.bam`` (the file
        does not exist after a dry run).
    """
    aligner = "bwa"
    if "bwa" in global_config["Tools"]:
        aligner = global_config["Tools"]["bwa"]["bin"]
    elif not common.which("bwa"):
        sys.exit("error while trying to run bwa mem: bwa not present in the "
                 "path and not in global config, please make sure to install "
                 "bwa properly")
    samtools = "samtools"
    if "samtools" in global_config["Tools"]:
        samtools = global_config["Tools"]["samtools"]["bin"]
    elif not common.which("samtools"):
        sys.exit("error while trying to run samtools: samtools not present "
                 "in the path and not in global config, please make sure to "
                 "install samtools properly")

    # extract base name
    if read2:
        libraryBase = os.path.basename(read1).split("_1.fastq")[0]
    else:
        libraryBase = os.path.basename(read1).split(".fastq")[0]
    startDirectory = os.getcwd()
    if not os.path.exists(libraryBase):
        os.makedirs(libraryBase)
    os.chdir(libraryBase)

    mappingBase = "{}_to_{}".format(
        libraryBase, os.path.basename(reference).split(".fasta")[0])
    BAMsorted = "{}.bam".format(mappingBase)
    BAMunsorted = "{}.unsorted.bam".format(mappingBase)

    if os.path.exists(BAMsorted):
        # alignment already available, nothing to do
        BAMsorted = os.path.abspath(BAMsorted)
        os.chdir(startDirectory)
        return BAMsorted

    if not os.path.exists(BAMunsorted):
        bwa_mem_command = [aligner, "mem", "-M", "-t", "{}".format(threads),
                           reference, read1]
        if read2:
            # single-end libraries have no second file to append
            bwa_mem_command.append(read2)
        samtools_view_command = [samtools, "view", "-b", "-S", "-u", "-"]
        command = "{} | {} > {}".format(" ".join(bwa_mem_command),
                                        " ".join(samtools_view_command),
                                        BAMunsorted)
        common.print_command(command)
        if not dryrun:
            with open("bwa.stdOut", "w") as bwa_stdOut, \
                    open("bwa.stdErr", "w") as bwa_stdErr:
                subprocess.call(command, shell=True, stdout=bwa_stdOut,
                                stderr=bwa_stdErr)

    # NOTE(review): "sort <in.bam> <out.prefix>" looks like the legacy
    # samtools sort syntax -- confirm it matches the installed version.
    samtools_sort_command = [samtools, "sort", "-@", "{}".format(threads),
                             "-m", "1G", BAMunsorted, mappingBase]
    command = " ".join(samtools_sort_command)
    if not os.path.exists(BAMsorted):
        common.print_command(command)
        if not dryrun:
            with open("sam_sort.stdOut", "w") as stdOut, \
                    open("sam_sort.stdErr", "w") as stdErr:
                subprocess.call(command, shell=True, stdout=stdOut,
                                stderr=stdErr)

    # the unsorted file is only an intermediate product
    if os.path.exists(BAMsorted) and os.path.exists(BAMunsorted):
        os.remove(BAMunsorted)
    BAMsorted = os.path.abspath(BAMsorted)
    os.chdir(startDirectory)
    return BAMsorted
def _run_cabog(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample with CABOG (``fastqToCA`` followed by ``runCA``).

    One ``<output>_<n>.frg`` file is produced per library with ``fastqToCA``
    and ``runCA`` is then run on all of them inside ``<cwd>/cabog``. When the
    run succeeds the contigs and scaffolds found in
    ``runCABOGfolder/9-terminator`` are copied to ``<output>.ctg.fasta`` and
    ``<output>.scf.fasta``; the run folder is then removed unless the
    ``keep_tmp_files`` flag is set.

    :param global_config: global configuration; ``["Tools"]["cabog"]["bin"]``
        is the folder that contains ``fastqToCA`` and ``runCA``.
    :param sample_config: sample configuration; ``output`` is required,
        ``flags`` is optional.
    :param sorted_libraries_by_insert: ignored; the libraries are always
        re-read from ``sample_config``.
    :returns: ``sample_config``. Once the assembly folder has been entered,
        every exit path goes back to the directory the call started from.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "cabog"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in cabog case there is no executable, only the folder with the tools
    programBIN = global_config["Tools"][assembler]["bin"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)

    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    # NOTE(review): sys.path only drives Python imports, it does not help the
    # shell to locate fastqToCA/runCA -- kept as it was, confirm it is needed.
    sys.path.insert(0, programBIN)
    dryrun = common.check_dryrun(sample_config)

    for libraryNum, (library, libraryInfo) in enumerate(
            sorted_libraries_by_insert, 1):
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        libraryName = "{}_{}".format(outputName, libraryNum)

        command_fastqToCA = os.path.join(programBIN, "fastqToCA")
        command_fastqToCA += " -libraryname {}".format(libraryName)
        command_fastqToCA += " -insertsize {} {}".format(insert, std)
        command_fastqToCA += " -technology illumina -type illumina"
        if orientation == "innie" or orientation == "none":
            command_fastqToCA += " -innie"
            if read2 is None:
                command_fastqToCA += " -reads {}".format(read1)
            else:
                command_fastqToCA += " -mates {},{}".format(read1, read2)
        elif orientation == "outtie":
            command_fastqToCA += " -outtie"
            command_fastqToCA += " -mates {},{}".format(read1, read2)
        # the shell redirects the standard output to the frg file, hence only
        # the standard error is captured in a log
        command_fastqToCA += " > {}.frg".format(libraryName)

        common.print_command(command_fastqToCA)
        if not dryrun:
            with open("cabogfastqToCA.stdErr", "w") as cabog_stdErr:
                returnValue = subprocess.call(command_fastqToCA,
                                              stderr=cabog_stdErr, shell=True)
            if returnValue != 0:
                print("problem running fastqToCA on library {}".format(
                    libraryName))

    command_runCA = os.path.join(programBIN, "runCA")
    command_runCA += " -d runCABOGfolder -p {} *frg".format(outputName)
    common.print_command(command_runCA)
    if dryrun:
        os.chdir(currentDirectory)
        return sample_config

    with open("cabog_runCA.stdOut", "w") as cabog_stdOut, \
            open("cabog_runCA.stdErr", "w") as cabog_stdErr:
        returnValue = subprocess.call(command_runCA, stdout=cabog_stdOut,
                                      stderr=cabog_stdErr, shell=True)

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        # assembly succeeded, remove files and save assembly
        terminatorDir = os.path.join("runCABOGfolder", "9-terminator")
        contigs = os.path.join(terminatorDir,
                               "{}.ctg.fasta".format(outputName))
        scaffolds = os.path.join(terminatorDir,
                                 "{}.scf.fasta".format(outputName))
        if os.path.exists(contigs):
            subprocess.call(["cp", contigs,
                             "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp", scaffolds,
                             "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runCABOGfolder"])
        else:
            print("something wrong with CABOG -> no contig file generated")
    else:
        print("CABOG terminated with an error. Please check running folder",
              "for more informations")
    os.chdir(currentDirectory)
    return sample_config
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample with ABySS (``abyss-pe``).

    The command line is built from the libraries returned by
    ``common._sort_libraries_by_insert(sample_config)`` and executed inside
    ``<cwd>/abyss/runABySS``. When the run succeeds the contigs and the
    scaffolds are copied to ``<cwd>/abyss/<output>.ctg.fasta`` and
    ``<cwd>/abyss/<output>.scf.fasta``; the run folder is then removed unless
    the ``keep_tmp_files`` flag is set.

    :param global_config: global configuration; ``["Tools"]["abyss"]["bin"]``
        is the folder that contains ``abyss-pe``.
    :param sample_config: sample configuration; ``output`` is required,
        ``threads`` (default 8), ``kmer`` (default 54) and ``flags`` are
        optional.
    :param sorted_libraries_by_insert: ignored; the libraries are always
        re-read from ``sample_config``.
    :returns: ``sample_config``. Once the assembly folder has been entered,
        every exit path goes back to the directory the call started from.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "abyss"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in abyss case there is no executable, only the folder holding abyss-pe
    programBIN = global_config["Tools"][assembler]["bin"]
    # NOTE(review): global_config["Tools"]["abyss"]["options"] is never added
    # to the command line -- confirm whether that is intended.
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)

    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    threads = sample_config.get("threads", 8)  # 8 is the default for UPPMAX
    kmer = sample_config.get("kmer", 54)
    command = "{} np={} k={} ".format(
        os.path.join(programBIN, "abyss-pe"), threads, kmer)

    singleEndReads = []   # files given to se='...'
    pairedEndLibs = {}    # insert size -> "read1 read2 read1 read2 ..."
    matePairLibs = {}     # insert size (as string) -> same layout
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        if orientation == "innie" or orientation == "none":
            if read2 is None:
                singleEndReads.append(read1)
            else:
                # lib name is the insert size
                pairedEndLibs.setdefault(insert, "")
                pairedEndLibs[insert] += "{} {} ".format(read1, read2)
        else:
            libName = format(insert)
            matePairLibs.setdefault(libName, "")
            matePairLibs[libName] += "{} {} ".format(read1, read2)

    # now create the command
    command += "name={} ".format(outputName)
    librariesSE = ""
    librariesPE = ""
    librariesMP = ""
    if singleEndReads:
        # the files must be separated by a blank, otherwise abyss-pe sees a
        # single (non existing) file name
        librariesSE = "se='{}'".format(" ".join(singleEndReads))
    if pairedEndLibs:
        libNames = ""
        for libPE, libPEreads in sorted(pairedEndLibs.items()):
            libNames += "lib{} ".format(libPE)
            librariesPE += " lib{}='{}' ".format(libPE, libPEreads)
        command += "lib='{}'  ".format(libNames)
    if matePairLibs:
        mpNames = ""
        for libMP, libMPreads in sorted(matePairLibs.items()):
            mpNames += "lib{} ".format(libMP)
            librariesMP += " lib{}='{}' ".format(libMP, libMPreads)
        command += "mp='{}'  ".format(mpNames)
    command += "{} ".format(librariesSE)
    command += "{} ".format(librariesPE)
    command += "{} ".format(librariesMP)

    common.print_command(command)
    if common.check_dryrun(sample_config):
        os.chdir(currentDirectory)
        return sample_config

    runDirectory = os.path.join(assemblyDirectory, "runABySS")
    os.makedirs(runDirectory)
    # the logs live in the assembly folder, the assembler runs in runABySS
    with open("abyss.stdOut", "a") as assembler_stdOut, \
            open("abyss.stdErr", "a") as assembler_stdErr:
        returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                      stderr=assembler_stdErr, shell=True,
                                      cwd=runDirectory)

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        contigs = os.path.join("runABySS", "{}-contigs.fa".format(outputName))
        scaffolds = os.path.join("runABySS",
                                 "{}-scaffolds.fa".format(outputName))
        if os.path.exists(contigs):
            subprocess.call(["cp", contigs,
                             "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp", scaffolds,
                             "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runABySS"])
        else:
            print("something wrong with ABySS -> no contig file generated")
    else:
        print("ABySS terminated with an error. Please check running folder",
              "for more informations")
    os.chdir(currentDirectory)
    return sample_config
def _run_cabog(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample with CABOG (``fastqToCA`` followed by ``runCA``).

    One ``<output>_<n>.frg`` file is produced per library with ``fastqToCA``
    and ``runCA`` is then run on all of them inside ``<cwd>/cabog``. When the
    run succeeds the contigs and scaffolds found in
    ``runCABOGfolder/9-terminator`` are copied to ``<output>.ctg.fasta`` and
    ``<output>.scf.fasta``; the run folder is then removed unless the
    ``keep_tmp_files`` flag is set.

    :param global_config: global configuration; ``["Tools"]["cabog"]["bin"]``
        is the folder that contains ``fastqToCA`` and ``runCA``.
    :param sample_config: sample configuration; ``output`` is required,
        ``flags`` is optional.
    :param sorted_libraries_by_insert: ignored; the libraries are always
        re-read from ``sample_config``.
    :returns: ``sample_config``. Once the assembly folder has been entered,
        every exit path goes back to the directory the call started from.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "cabog"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in cabog case there is no executable, only the folder with the tools
    programBIN = global_config["Tools"][assembler]["bin"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)

    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    # NOTE(review): sys.path only drives Python imports, it does not help the
    # shell to locate fastqToCA/runCA -- kept as it was, confirm it is needed.
    sys.path.insert(0, programBIN)
    dryrun = common.check_dryrun(sample_config)

    for libraryNum, (library, libraryInfo) in enumerate(
            sorted_libraries_by_insert, 1):
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        libraryName = "{}_{}".format(outputName, libraryNum)

        command_fastqToCA = os.path.join(programBIN, "fastqToCA")
        command_fastqToCA += " -libraryname {}".format(libraryName)
        command_fastqToCA += " -insertsize {} {}".format(insert, std)
        command_fastqToCA += " -technology illumina -type illumina"
        if orientation == "innie" or orientation == "none":
            command_fastqToCA += " -innie"
            if read2 is None:
                command_fastqToCA += " -reads {}".format(read1)
            else:
                command_fastqToCA += " -mates {},{}".format(read1, read2)
        elif orientation == "outtie":
            command_fastqToCA += " -outtie"
            command_fastqToCA += " -mates {},{}".format(read1, read2)
        # the shell redirects the standard output to the frg file, hence only
        # the standard error is captured in a log
        command_fastqToCA += " > {}.frg".format(libraryName)

        common.print_command(command_fastqToCA)
        if not dryrun:
            with open("cabogfastqToCA.stdErr", "w") as cabog_stdErr:
                returnValue = subprocess.call(command_fastqToCA,
                                              stderr=cabog_stdErr, shell=True)
            if returnValue != 0:
                print("problem running fastqToCA on library {}".format(
                    libraryName))

    command_runCA = os.path.join(programBIN, "runCA")
    command_runCA += " -d runCABOGfolder -p {} *frg".format(outputName)
    common.print_command(command_runCA)
    if dryrun:
        os.chdir(currentDirectory)
        return sample_config

    with open("cabog_runCA.stdOut", "w") as cabog_stdOut, \
            open("cabog_runCA.stdErr", "w") as cabog_stdErr:
        returnValue = subprocess.call(command_runCA, stdout=cabog_stdOut,
                                      stderr=cabog_stdErr, shell=True)

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        # assembly succeeded, remove files and save assembly
        terminatorDir = os.path.join("runCABOGfolder", "9-terminator")
        contigs = os.path.join(terminatorDir,
                               "{}.ctg.fasta".format(outputName))
        scaffolds = os.path.join(terminatorDir,
                                 "{}.scf.fasta".format(outputName))
        if os.path.exists(contigs):
            subprocess.call(["cp", contigs,
                             "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp", scaffolds,
                             "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runCABOGfolder"])
        else:
            print("something wrong with CABOG -> no contig file generated")
    else:
        print("CABOG terminated with an error. Please check running folder",
              "for more informations")
    os.chdir(currentDirectory)
    return sample_config
def _run_qaTools(global_config, sample_config, sorted_libraries_by_insert):
    """Compute per-contig coverage with qaTools and add the GC content.

    Works in ``<cwd>/QAstats``: runs qaTools on the first alignment of the
    sample (skipped when ``<bam>.cov`` already exists), then writes
    ``<bam>.cov.gc`` holding, for every sequence of the reference, the three
    qaTools columns plus the value returned by ``computeGC``; the resulting
    table is finally handed to ``plotQA``.

    :param global_config: global configuration;
        ``["Tools"]["qaTools"]["bin"]`` is the qaTools executable.
    :param sample_config: sample configuration; reads ``reference`` and
        ``alignments`` (the BAM file is ``alignments[0][1]``).
    :param sorted_libraries_by_insert: unused.
    :returns: ``sample_config``.
    :raises SystemExit: when qaTools fails or when a reference sequence is
        missing from the qaTools output.
    """
    mainDir = os.getcwd()
    qaToolsFolder = os.path.join(mainDir, "QAstats")
    if not os.path.exists(qaToolsFolder):
        os.makedirs(qaToolsFolder)
    os.chdir(qaToolsFolder)

    program = global_config["Tools"]["qaTools"]["bin"]
    reference = sample_config["reference"]
    BAMfile = sample_config["alignments"][0][1]
    coverageFile = "{}.cov".format(os.path.basename(BAMfile))
    command = ["{}".format(program), "-m", "-q", "0", "-i", BAMfile,
               coverageFile]
    common.print_command(command)
    dryrun = common.check_dryrun(sample_config)
    if not dryrun and not os.path.exists(coverageFile):
        with open("QAtools.stdOut", "a") as stdOut, \
                open("QAtools.stdErr", "a") as stdErr:
            returnValue = subprocess.call(command, stdout=stdOut,
                                          stderr=stdErr)
        if not returnValue == 0:
            sys.exit("error, while running QAtools: {}".format(command))
    if dryrun and not os.path.exists(coverageFile):
        # dry run and no coverage table from a previous run: nothing to parse
        os.chdir(mainDir)
        return sample_config

    # load the qaTools table: sequence name -> the three following columns
    QAtools_dict = {}
    with open(coverageFile, "r") as QA_csv:
        header = QA_csv.readline().rstrip()
        for line in QA_csv:
            fields = line.strip().split("\t")
            if fields == [""]:
                continue  # blank line
            QAtools_dict[fields[0]] = [fields[1], fields[2], fields[3]]

    def _write_gc_row(out_fd, fasta_header, sequence):
        """Write the qaTools columns of one sequence followed by its GC."""
        GC = computeGC(sequence)
        if fasta_header not in QAtools_dict:
            sys.exit("error while parsing QAcompute output: probably some "
                     "wired contig name is present in your assmebly file")
        out_fd.write("{}\t{}\t{}\t{}\t{}\n".format(
            fasta_header, QAtools_dict[fasta_header][0],
            QAtools_dict[fasta_header][1], QAtools_dict[fasta_header][2],
            GC))

    # now add GC content
    QA_GC_file = "{}.cov.gc".format(os.path.basename(BAMfile))
    with open(QA_GC_file, "w") as QA_GC_fd, open(reference, "r") as ref_fd:
        QA_GC_fd.write("{}\tGCperc\n".format(header))
        fasta_header = None
        sequence_chunks = []
        for line in ref_fd:
            line = line.strip()
            if line.startswith(">"):
                if fasta_header is not None:
                    _write_gc_row(QA_GC_fd, fasta_header,
                                  "".join(sequence_chunks))
                # the sequence name ends at the first blank or tab
                fasta_header = \
                    line.split(" ")[0].split("\t")[0].split(">")[1]
                sequence_chunks = []
            else:
                sequence_chunks.append(line)
        if fasta_header is not None:
            # last sequence of the file
            _write_gc_row(QA_GC_fd, fasta_header, "".join(sequence_chunks))

    plotQA(QA_GC_file)
    os.chdir(mainDir)
    return sample_config
def _run_masurca(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample with MaSuRCA.

    Writes ``<cwd>/masurca/configuration.txt`` from the libraries returned by
    ``common._sort_libraries_by_insert(sample_config)``, lets ``bin/masurca``
    generate ``assemble.sh`` inside ``runMASURCA`` and runs it. When the run
    succeeds the contigs and scaffolds found in ``CA/10-gapclose`` are copied
    to ``<output>.ctg.fasta`` and ``<output>.scf.fasta``; the run folder is
    then removed unless the ``keep_tmp_files`` flag is set.

    :param global_config: global configuration;
        ``["Tools"]["masurca"]["bin"]`` is the MaSuRCA installation folder.
    :param sample_config: sample configuration; ``output`` and ``genomeSize``
        are required, ``threads`` (default 8) and ``flags`` are optional.
    :param sorted_libraries_by_insert: ignored; the libraries are always
        re-read from ``sample_config``.
    :returns: ``sample_config``. Once the assembly folder has been entered,
        every exit path goes back to the directory the call started from.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "masurca"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in masurca case bin is the installation folder, not an executable
    programBIN = global_config["Tools"][assembler]["bin"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)

    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    # library names are a prefix letter followed by a lower case letter;
    # ascii_lowercase is available in both Python 2 and Python 3
    allTheLetters = string.ascii_lowercase
    libraryPE = "p"
    libraryPEnum = 0
    libraryMP = "m"
    libraryMPnum = 0
    threads = sample_config.get("threads", 8)  # 8 is the default for UPPMAX
    with open("configuration.txt", "w") as masurca_config_file:
        masurca_config_file.write("DATA\n")
        # TODO: single ended reads
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            insert = libraryInfo["insert"]
            std = libraryInfo["std"]
            if orientation == "innie":
                if read2 is not None:
                    # TODO: check when more than 26 PE libraries are
                    # specified (allTheLetters is exhausted)
                    configurationLine = "PE = {}{} {} {} {} {}".format(
                        libraryPE, allTheLetters[libraryPEnum], insert, std,
                        read1, read2)
                    masurca_config_file.write(
                        "{}\n".format(configurationLine))
                    libraryPEnum += 1
            elif orientation == "outtie":
                # TODO: check when more than 26 MP libraries are specified
                configurationLine = "JUMP = {}{} {} {} {} {}".format(
                    libraryMP, allTheLetters[libraryMPnum], insert, std,
                    read1, read2)
                masurca_config_file.write("{}\n".format(configurationLine))
                libraryMPnum += 1
        masurca_config_file.write("END\n")
        masurca_config_file.write("\n")
        masurca_config_file.write("PARAMETERS\n")
        # this is k-mer size for deBruijn graph values between 25 and 101 are
        # supported, auto will compute the optimal size based on the read
        # data and GC content
        masurca_config_file.write("GRAPH_KMER_SIZE=auto\n")
        # set this to 1 for Illumina-only assemblies and to 0 if you have 2x
        # or more long (Sanger, 454) reads
        masurca_config_file.write("USE_LINKING_MATES=1\n")
        # this parameter is useful if you have too many jumping library
        # mates. See manual for explanation about settings based on genome
        # length
        if sample_config["genomeSize"] > 10000000:
            masurca_config_file.write("LIMIT_JUMP_COVERAGE = 1000\n")
        else:
            masurca_config_file.write("LIMIT_JUMP_COVERAGE = 60\n")
        # these are the additional parameters to Celera Assembler. do not
        # worry about performance, number or processors or batch sizes --
        # these are computed automatically. for mammals do not set
        # cgwErrorRate above 0.15!!!
        if sample_config["genomeSize"] > 1500000000:
            cgwErrorRate = "0.15"
        else:
            cgwErrorRate = "0.25"
        masurca_config_file.write(
            "CA_PARAMETERS = ovlMerSize=30 cgwErrorRate={} "
            "ovlMemory=4GB\n".format(cgwErrorRate))
        # number of cpus to use
        masurca_config_file.write("NUM_THREADS= {}\n".format(threads))
        # this is mandatory jellyfish hash size, set this to about 10x the
        # genome size.
        JF_SIZE = sample_config["genomeSize"] * 11
        masurca_config_file.write("JF_SIZE={}\n".format(JF_SIZE))
        # this specifies if we do (1) or do not (0) want to trim long runs of
        # homopolymers (e.g. GGGGGGGG) from 3' read ends, use it for high GC
        # genomes
        masurca_config_file.write("DO_HOMOPOLYMER_TRIM=0\n")
        masurca_config_file.write("END\n")
        masurca_config_file.write("\n")

    if common.check_dryrun(sample_config):
        os.chdir(currentDirectory)
        return sample_config

    runDirectory = os.path.join(assemblyDirectory, "runMASURCA")
    os.mkdir(runDirectory)
    # the logs live in the assembly folder, both steps run in runMASURCA
    with open("masurca.stdOut", "w") as masurca_stdOut, \
            open("masurca.stdErr", "w") as masurca_stdErr:
        command = [os.path.join(programBIN, "bin/masurca"),
                   "../configuration.txt"]
        common.print_command(command)
        subprocess.call(command, stdout=masurca_stdOut,
                        stderr=masurca_stdErr, cwd=runDirectory)
        if not os.path.exists(os.path.join(runDirectory, "assemble.sh")):
            print("MaSuRCA: assemble.sh not created. Unknown failure")
            os.chdir(currentDirectory)
            return sample_config
        command = ["./assemble.sh"]
        common.print_command(command)
        returnValue = subprocess.call(command, stdout=masurca_stdOut,
                                      stderr=masurca_stdErr,
                                      cwd=runDirectory)

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if os.path.exists(os.path.join(
                "runMASURCA", "CA/10-gapclose/genome.scf.fasta")):
            subprocess.call(["cp",
                             os.path.join("runMASURCA",
                                          "CA/10-gapclose/genome.ctg.fasta"),
                             "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp",
                             os.path.join("runMASURCA",
                                          "CA/10-gapclose/genome.scf.fasta"),
                             "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runMASURCA"])
        else:
            print("something wrong with MaSuRCA -> no contig file generated")
    else:
        print("MaSuRCA terminated with an error. Please check running folder",
              "for more informations")
    os.chdir(currentDirectory)
    return sample_config
def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
    """Trim the paired libraries with Trimmomatic.

    Works in ``<cwd>/Trimmomatic``. For every library that has both reads the
    ``PE`` command is built, printed and appended to
    ``sample_config["commands"]``; it is executed unless this is a dry run or
    the trimmed first read already exists. The library entry is then updated
    in place: ``pair1``/``pair2`` point to the trimmed files and
    ``trimmomatic`` to the standard error log of the run. Libraries without a
    second read are left untouched.

    :param global_config: global configuration;
        ``["Tools"]["trimmomatic"]["bin"]`` is the Trimmomatic jar.
    :param sample_config: sample configuration; ``adapters`` (existing file)
        is required, ``threads`` defaults to 8.
    :param sorted_libraries_by_insert: iterable of ``(name, info)`` pairs
        where ``info`` holds at least ``pair1`` and ``pair2``.
    :returns: ``sample_config``.
    :raises SystemExit: when the adapters file is not configured or does not
        exist.
    """
    program = global_config["Tools"]["trimmomatic"]["bin"]
    if "adapters" not in sample_config:
        sys.exit("running MP pipeline, adapters file to be used in trimming "
                 "are needed for Trimmomatic. Please specify them "
                 "in the sample configuration file and rerun")
    adapterFile = sample_config["adapters"]
    if not os.path.exists(adapterFile):
        sys.exit("Trimmomatic cannot be run as adapter file is not specified "
                 "or points to unknown position: {}".format(adapterFile))

    mainDirectory = os.getcwd()
    trimmomaticDir = os.path.join(mainDirectory, "Trimmomatic")
    if not os.path.exists(trimmomaticDir):
        os.makedirs(trimmomaticDir)
    os.chdir(trimmomaticDir)
    # now I am in running dir, I need to process one by one the libraries
    threads = sample_config.get("threads", 8)
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        if read2 is None:
            continue  # only paired libraries are trimmed
        # NOTE(review): the base name stops at the first dot, two reads named
        # x.1.fastq and x.2.fastq would share the same output files.
        read1_baseName = os.path.split(read1)[1].split(".")[0]
        read2_baseName = os.path.split(read2)[1].split(".")[0]
        output_read1_pair = os.path.join(
            trimmomaticDir, "{}.fastq.gz".format(read1_baseName))
        output_read1_sing = os.path.join(
            trimmomaticDir, "{}_u.fastq.gz".format(read1_baseName))
        output_read2_pair = os.path.join(
            trimmomaticDir, "{}.fastq.gz".format(read2_baseName))
        output_read2_sing = os.path.join(
            trimmomaticDir, "{}_u.fastq.gz".format(read2_baseName))
        command = ["java", "-jar", program, "PE", "-threads",
                   "{}".format(threads), "-phred33", read1, read2,
                   output_read1_pair, output_read1_sing, output_read2_pair,
                   output_read2_sing,
                   "ILLUMINACLIP:{}:2:30:10".format(adapterFile),
                   "LEADING:3", "TRAILING:3", "SLIDINGWINDOW:4:15",
                   "MINLEN:30"]
        common.print_command(command)
        sample_config["commands"] += "\n" + common.get_command_str(command)
        stdErrName = "{}_trimmomatic.stdErr".format(read1_baseName)
        # do not execute if files have been already generated
        if not common.check_dryrun(sample_config) and \
                not os.path.exists(output_read1_pair):
            with open("{}_trimmomatic.stdOut".format(read1_baseName),
                      "w") as stdOut, open(stdErrName, "w") as stdErr:
                returnValue = subprocess.call(command, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                print("error while running command: {}".format(command))
        libraryInfo["pair1"] = output_read1_pair
        libraryInfo["pair2"] = output_read2_pair
        libraryInfo["trimmomatic"] = os.path.join(trimmomaticDir, stdErrName)
    os.chdir(mainDirectory)
    return sample_config
def _run_masurca(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample with MaSuRCA.

    Writes ``<cwd>/masurca/configuration.txt`` from the libraries returned by
    ``common._sort_libraries_by_insert(sample_config)``, lets ``bin/masurca``
    generate ``assemble.sh`` inside ``runMASURCA`` and runs it. When the run
    succeeds the contigs and scaffolds found in ``CA/10-gapclose`` are copied
    to ``<output>.ctg.fasta`` and ``<output>.scf.fasta``; the run folder is
    then removed unless the ``keep_tmp_files`` flag is set.

    :param global_config: global configuration;
        ``["Tools"]["masurca"]["bin"]`` is the MaSuRCA installation folder.
    :param sample_config: sample configuration; ``output`` and ``genomeSize``
        are required, ``threads`` (default 8) and ``flags`` are optional.
    :param sorted_libraries_by_insert: ignored; the libraries are always
        re-read from ``sample_config``.
    :returns: ``sample_config``. Once the assembly folder has been entered,
        every exit path goes back to the directory the call started from.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "masurca"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in masurca case bin is the installation folder, not an executable
    programBIN = global_config["Tools"][assembler]["bin"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)

    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    # library names are a prefix letter followed by a lower case letter;
    # ascii_lowercase is available in both Python 2 and Python 3
    allTheLetters = string.ascii_lowercase
    libraryPE = "p"
    libraryPEnum = 0
    libraryMP = "m"
    libraryMPnum = 0
    threads = sample_config.get("threads", 8)  # 8 is the default for UPPMAX
    with open("configuration.txt", "w") as masurca_config_file:
        masurca_config_file.write("DATA\n")
        # TODO: single ended reads
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            insert = libraryInfo["insert"]
            std = libraryInfo["std"]
            if orientation == "innie":
                if read2 is not None:
                    # TODO: check when more than 26 PE libraries are
                    # specified (allTheLetters is exhausted)
                    configurationLine = "PE = {}{} {} {} {} {}".format(
                        libraryPE, allTheLetters[libraryPEnum], insert, std,
                        read1, read2)
                    masurca_config_file.write(
                        "{}\n".format(configurationLine))
                    libraryPEnum += 1
            elif orientation == "outtie":
                # TODO: check when more than 26 MP libraries are specified
                configurationLine = "JUMP = {}{} {} {} {} {}".format(
                    libraryMP, allTheLetters[libraryMPnum], insert, std,
                    read1, read2)
                masurca_config_file.write("{}\n".format(configurationLine))
                libraryMPnum += 1
        masurca_config_file.write("END\n")
        masurca_config_file.write("\n")
        masurca_config_file.write("PARAMETERS\n")
        # this is k-mer size for deBruijn graph values between 25 and 101 are
        # supported, auto will compute the optimal size based on the read
        # data and GC content
        masurca_config_file.write("GRAPH_KMER_SIZE=auto\n")
        # set this to 1 for Illumina-only assemblies and to 0 if you have 2x
        # or more long (Sanger, 454) reads
        masurca_config_file.write("USE_LINKING_MATES=1\n")
        # this parameter is useful if you have too many jumping library
        # mates. See manual for explanation about settings based on genome
        # length
        if sample_config["genomeSize"] > 10000000:
            masurca_config_file.write("LIMIT_JUMP_COVERAGE = 1000\n")
        else:
            masurca_config_file.write("LIMIT_JUMP_COVERAGE = 60\n")
        # these are the additional parameters to Celera Assembler. do not
        # worry about performance, number or processors or batch sizes --
        # these are computed automatically. for mammals do not set
        # cgwErrorRate above 0.15!!!
        if sample_config["genomeSize"] > 1500000000:
            cgwErrorRate = "0.15"
        else:
            cgwErrorRate = "0.25"
        masurca_config_file.write(
            "CA_PARAMETERS = ovlMerSize=30 cgwErrorRate={} "
            "ovlMemory=4GB\n".format(cgwErrorRate))
        # number of cpus to use
        masurca_config_file.write("NUM_THREADS= {}\n".format(threads))
        # this is mandatory jellyfish hash size, set this to about 10x the
        # genome size.
        JF_SIZE = sample_config["genomeSize"] * 11
        masurca_config_file.write("JF_SIZE={}\n".format(JF_SIZE))
        # this specifies if we do (1) or do not (0) want to trim long runs of
        # homopolymers (e.g. GGGGGGGG) from 3' read ends, use it for high GC
        # genomes
        masurca_config_file.write("DO_HOMOPOLYMER_TRIM=0\n")
        masurca_config_file.write("END\n")
        masurca_config_file.write("\n")

    if common.check_dryrun(sample_config):
        os.chdir(currentDirectory)
        return sample_config

    runDirectory = os.path.join(assemblyDirectory, "runMASURCA")
    os.mkdir(runDirectory)
    # the logs live in the assembly folder, both steps run in runMASURCA
    with open("masurca.stdOut", "w") as masurca_stdOut, \
            open("masurca.stdErr", "w") as masurca_stdErr:
        command = [os.path.join(programBIN, "bin/masurca"),
                   "../configuration.txt"]
        common.print_command(command)
        subprocess.call(command, stdout=masurca_stdOut,
                        stderr=masurca_stdErr, cwd=runDirectory)
        if not os.path.exists(os.path.join(runDirectory, "assemble.sh")):
            print("MaSuRCA: assemble.sh not created. Unknown failure")
            os.chdir(currentDirectory)
            return sample_config
        command = ["./assemble.sh"]
        common.print_command(command)
        returnValue = subprocess.call(command, stdout=masurca_stdOut,
                                      stderr=masurca_stdErr,
                                      cwd=runDirectory)

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if os.path.exists(os.path.join(
                "runMASURCA", "CA/10-gapclose/genome.scf.fasta")):
            subprocess.call(["cp",
                             os.path.join("runMASURCA",
                                          "CA/10-gapclose/genome.ctg.fasta"),
                             "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp",
                             os.path.join("runMASURCA",
                                          "CA/10-gapclose/genome.scf.fasta"),
                             "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runMASURCA"])
        else:
            print("something wrong with MaSuRCA -> no contig file generated")
    else:
        print("MaSuRCA terminated with an error. Please check running folder",
              "for more informations")
    os.chdir(currentDirectory)
    return sample_config
def _merge_bam_files(global_config, sample_config, sorted_libraries_by_insert):
    """Merge the alignments of libraries that share an insert size.

    Alignments are grouped by ``libraryInfo["insert"]``. Every group gets its
    own folder: ``lib_<insert>``, or ``sample_config["output"]`` when all the
    libraries share one insert size. A group holding a single BAM file is
    soft linked into that folder; larger groups are merged with
    ``samtools merge``. A merged file that is already present is reused.

    :param global_config: dict; ``global_config["Tools"]["samtools"]["bin"]``
        is used as the samtools executable when present, otherwise
        ``samtools`` must be found by ``common.which``
    :param sample_config: dict; ``"output"`` names the folder and the BAM
        file when there is a single insert size. The dict is also handed to
        ``common.check_dryrun`` before merging
    :param sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
        pairs; only ``libraryInfo["insert"]`` and ``libraryInfo["alignment"]``
        are read
    :returns: list of ``[insert, absolute path of the BAM file, folder]``
        entries sorted by insert
    :raises SystemExit: when samtools cannot be located, or when linking or
        merging fails
    """
    samtools = "samtools"
    if "samtools" in global_config["Tools"]:
        samtools = global_config["Tools"]["samtools"]["bin"]
    elif not common.which("samtools"):
        sys.exit("error while trying to run samtools: samtools not present "
                 "in the path and not in global config, please make sure to "
                 "install samtools properly")

    # group the alignment files by insert size
    BAMfiles = {}
    for library, libraryInfo in sorted_libraries_by_insert:
        insert = libraryInfo["insert"]
        alignment = libraryInfo["alignment"]
        BAMfiles.setdefault(insert, []).append(alignment)
    single_insert = len(BAMfiles) == 1

    # remember where we started: the per-insert folder may be nested, so a
    # plain chdir("..") is not guaranteed to bring us back
    start_dir = os.getcwd()
    BAMfilesMerged = {}
    for insert, insertGroup in BAMfiles.items():
        if single_insert:
            dir_insert = sample_config["output"]
            bamMerged = "{}.bam".format(sample_config["output"])
        else:
            dir_insert = "lib_{}".format(insert)
            bamMerged = "lib_{}.bam".format(insert)
        if not os.path.exists(dir_insert):
            os.makedirs(dir_insert)
        os.chdir(dir_insert)
        # nothing to be done when the merged file is already present
        if not os.path.exists(bamMerged):
            if len(insertGroup) == 1:
                # only one sample file for this insert length: link it
                cl = ["ln", "-s", insertGroup[0], bamMerged]
                returnValue = subprocess.call(cl)
                if returnValue != 0:
                    sys.exit("error, while soft linking {}".format(
                        insertGroup[0]))
            else:
                command = [samtools, "merge", bamMerged]
                command.extend(insertGroup)
                common.print_command(command)
                returnValue = 0
                if not common.check_dryrun(sample_config):
                    returnValue = subprocess.call(command)
                if returnValue != 0:
                    sys.exit("error, while merging files {}".format(
                        insertGroup))
        # memorise bam file and folder for this insert length
        BAMfilesMerged[insert] = [os.path.abspath(bamMerged), dir_insert]
        os.chdir(start_dir)

    sorted_alignments_by_insert = []
    for key in sorted(BAMfilesMerged):
        bam_path, folder = BAMfilesMerged[key]
        sorted_alignments_by_insert.append([key, bam_path, folder])
    return sorted_alignments_by_insert
def _run_soapdenovo(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample with SOAPdenovo and collect contigs and scaffolds.

    Works inside ``<cwd>/soapdenovo``: writes ``configuration.txt`` with one
    ``[LIB]`` section per library, runs ``<bin> all`` from a ``runSOAP``
    sub-folder and, when the run succeeds and produced
    ``soapAssembly.scafSeq``, copies the results to ``<output>.scf.fasta``
    and ``<output>.ctg.fasta``. ``runSOAP`` is then removed unless
    ``"keep_tmp_files"`` is listed in ``sample_config["flags"]``.

    NOTE(review): this file defines ``_run_soapdenovo`` a second time further
    down; in Python the later definition is the one that stays bound.

    :param global_config: dict; ``global_config["Tools"]["soapdenovo"]["bin"]``
        is the executable to run
    :param sample_config: dict; reads ``"output"`` and the optional ``"kmer"``
        (default 54), ``"threads"`` (default 8) and ``"flags"`` entries
    :param sorted_libraries_by_insert: ignored; the libraries are re-read
        through ``common._sort_libraries_by_insert(sample_config)``
    :returns: ``sample_config``
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "soapdenovo"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)

    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    kmer = sample_config.get("kmer", 54)
    # 8 threads is the default for UPPMAX
    threads = ["-p", "{}".format(sample_config.get("threads", 8))]

    with open("configuration.txt", "w") as soap_config_file:
        #TODO make this a parameter in the options
        soap_config_file.write("max_rd_len=150\n")
        for rank, (library, libraryInfo) in enumerate(
                sorted_libraries_by_insert, 1):
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            insert = libraryInfo["insert"]
            soap_config_file.write("[LIB]\n")
            soap_config_file.write("avg_ins={}\n".format(insert))
            soap_config_file.write("rank={}\n".format(rank))
            soap_config_file.write("map_len=30\n")
            if orientation == "innie" or orientation == "none":
                soap_config_file.write("asm_flags=3\n")
                soap_config_file.write("pair_num_cutoff=3\n")
                soap_config_file.write("reverse_seq=0\n")
                if read2 is None:
                    soap_config_file.write("q={}\n".format(read1))
                else:
                    soap_config_file.write("q1={}\n".format(read1))
                    soap_config_file.write("q2={}\n".format(read2))
            elif orientation == "outtie":
                soap_config_file.write("asm_flags=2\n")
                soap_config_file.write("pair_num_cutoff=5\n")
                soap_config_file.write("reverse_seq=1\n")
                soap_config_file.write("q1={}\n".format(read1))
                soap_config_file.write("q2={}\n".format(read2))

    # the log files live in the assembly folder, next to configuration.txt
    with open("soap.stdOut", "w") as assembler_stdOut, \
            open("soap.stdErr", "w") as assembler_stdErr:
        os.makedirs(os.path.join(assemblyDirectory, "runSOAP"))
        os.chdir("runSOAP")
        #TODO : lots of missing options
        command = [programBIN, "all", "-s",
                   "{}".format(os.path.join(assemblyDirectory,
                                            "configuration.txt")),
                   "-K", "{}".format(kmer), "-L", "500",
                   "-o", "soapAssembly", threads[0], threads[1]]
        common.print_command(command)
        if common.check_dryrun(sample_config):
            # leave runSOAP and then the assembly folder
            os.chdir("..")
            os.chdir("..")
            return sample_config
        # keep the exit status: without it a failed run is reported as a
        # success and the error branch below can never be reached
        returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                      stderr=assembler_stdErr)
    os.chdir("..")

    flags = sample_config.get("flags", [])
    if returnValue != 0:
        print("SOAPdenovo terminated with an error. Please check running",
              "folder for more informations")
        os.chdir("..")
        return sample_config
    if os.path.exists(os.path.join("runSOAP", "soapAssembly.scafSeq")):
        subprocess.call(["cp", os.path.join("runSOAP", "soapAssembly.scafSeq"),
                         "{}.scf.fasta".format(outputName)])
        subprocess.call(["cp", os.path.join("runSOAP", "soapAssembly.contig"),
                         "{}.ctg.fasta".format(outputName)])
        if "keep_tmp_files" not in flags:
            subprocess.call(["rm", "-r", "runSOAP"])
    else:
        print("something wrong with SOAPdenovo -> no contig file generated")
    os.chdir("..")
    return sample_config
def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
    """Trim the paired libraries with Trimmomatic and point them at the output.

    For every library that has a second read file, a ``java -jar <bin> PE``
    command is built, printed and appended to ``sample_config["commands"]``.
    The command is executed inside ``<cwd>/Trimmomatic`` unless this is a dry
    run or the first paired output file already exists. In every case
    ``libraryInfo["pair1"]`` and ``libraryInfo["pair2"]`` are redirected to
    the trimmed files and ``libraryInfo["trimmomatic"]`` is set to the path
    of the run's stderr log. Libraries without a second read file are left
    untouched.

    :param global_config: dict; ``global_config["Tools"]["trimmomatic"]["bin"]``
        is the jar passed to ``java -jar``
    :param sample_config: dict; ``"adapters"`` (path of the adapter file) is
        mandatory, ``"threads"`` is optional (default 8) and ``"commands"``
        is extended with every command built
    :param sorted_libraries_by_insert: iterable of ``(library, libraryInfo)``
        pairs; ``libraryInfo`` is updated in place
    :returns: ``sample_config``
    :raises SystemExit: when ``"adapters"`` is missing from ``sample_config``
        or does not point to an existing file
    """
    program = global_config["Tools"]["trimmomatic"]["bin"]
    if "adapters" not in sample_config:
        sys.exit("running MP pipeline, adapters file to be used in trimming "
                 "are needed for Trimmomatic. Please specify them "
                 "in the sample configuration file and rerun")
    adapterFile = sample_config["adapters"]
    if not os.path.exists(adapterFile):
        sys.exit("Trimmomatic cannot be run as adapter file is not specified "
                 "or points to unknown position: {}".format(adapterFile))
    # the check above is made from the current folder but Trimmomatic runs
    # from its own sub-folder, so a relative path has to be anchored first
    adapterFile = os.path.abspath(adapterFile)

    mainDirectory = os.getcwd()
    trimmomaticDir = os.path.join(mainDirectory, "Trimmomatic")
    if not os.path.exists(trimmomaticDir):
        os.makedirs(trimmomaticDir)
    os.chdir(trimmomaticDir)

    # now I am in running dir, I need to process one by one the libraries
    threads = sample_config.get("threads", 8)
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        if read2 is None:
            continue
        read1_baseName = os.path.split(read1)[1].split(".")[0]
        read2_baseName = os.path.split(read2)[1].split(".")[0]
        output_read1_pair = os.path.join(
            trimmomaticDir, "{}.fastq.gz".format(read1_baseName))
        output_read1_sing = os.path.join(
            trimmomaticDir, "{}_u.fastq.gz".format(read1_baseName))
        output_read2_pair = os.path.join(
            trimmomaticDir, "{}.fastq.gz".format(read2_baseName))
        output_read2_sing = os.path.join(
            trimmomaticDir, "{}_u.fastq.gz".format(read2_baseName))
        command = ["java", "-jar", program, "PE",
                   "-threads", "{}".format(threads), "-phred33",
                   read1, read2,
                   output_read1_pair, output_read1_sing,
                   output_read2_pair, output_read2_sing,
                   "ILLUMINACLIP:{}:2:30:10".format(adapterFile),
                   "LEADING:3", "TRAILING:3", "SLIDINGWINDOW:4:15",
                   "MINLEN:30"]
        common.print_command(command)
        sample_config["commands"] += "\n" + common.get_command_str(command)

        stdout_name = "{}_trimmomatic.stdOut".format(read1_baseName)
        stderr_name = "{}_trimmomatic.stdErr".format(read1_baseName)
        # do not execute if files have been already generated
        if not common.check_dryrun(sample_config) and \
                not os.path.exists(output_read1_pair):
            with open(stdout_name, "w") as stdOut, \
                    open(stderr_name, "w") as stdErr:
                returnValue = subprocess.call(command, stdout=stdOut,
                                              stderr=stdErr)
            if returnValue != 0:
                print("error while running command: {}".format(command))
        libraryInfo["pair1"] = output_read1_pair
        libraryInfo["pair2"] = output_read2_pair
        libraryInfo["trimmomatic"] = os.path.join(trimmomaticDir, stderr_name)
    os.chdir(mainDirectory)
    return sample_config
def _run_spades(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample with SPAdes and collect contigs and scaffolds.

    Works inside ``<cwd>/spades``. The command line is built as one string
    (executable, configured options, one ``--pe<N>``/``--mp<N>`` group per
    library, ``-o <output>``) and run through the shell, appending to
    ``spades.stdOut`` / ``spades.stdErr``. When the run succeeds and
    ``<output>/contigs.fasta`` exists, the results are copied to
    ``<output>.ctg.fasta`` and ``<output>.scf.fasta``, and the SPAdes output
    folder is removed unless ``"keep_tmp_files"`` is listed in
    ``sample_config["flags"]``.

    :param global_config: dict; ``global_config["Tools"]["spades"]`` provides
        ``"bin"`` (executable) and ``"options"`` (iterable of extra options)
    :param sample_config: dict; reads ``"output"`` and the optional ``"flags"``
    :param sorted_libraries_by_insert: ignored; the libraries are re-read
        through ``common._sort_libraries_by_insert(sample_config)``
    :returns: ``sample_config``
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "spades"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)

    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    # creates the command on-the-fly
    parts = ["{}".format(programBIN)]
    parts.extend("{}".format(option) for option in program_options)
    peLibrary = 1
    mpLibrary = 1
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        if orientation == "innie" or orientation == "none":
            if read2 is None:
                parts.append("--pe{}-s {}".format(peLibrary, read1))
            else:
                parts.append("--pe{}-1 {} --pe{}-2 {}".format(
                    peLibrary, read1, peLibrary, read2))
            peLibrary += 1
        elif orientation == "outtie":
            parts.append("--mp{}-1 {} --mp{}-2 {}".format(
                mpLibrary, read1, mpLibrary, read2))
            mpLibrary += 1
        else:
            # format the whole message; the placeholder sits in its first part
            print("orientation {} not supported.... why the program did not "
                  "fail earlier?".format(orientation))
    parts.append("-o {}".format(outputName))
    # every piece is followed by one blank, as the shell string always was
    command = " ".join(parts) + " "
    common.print_command(command)

    if common.check_dryrun(sample_config):
        # step out of the assembly folder, as the normal path does
        os.chdir("..")
        return sample_config

    # NOTE(review): the command is a shell string built from configuration
    # values (paths, options); they are not quoted before reaching the shell
    with open("spades.stdOut", "a") as assembler_stdOut, \
            open("spades.stdErr", "a") as assembler_stdErr:
        returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                      stderr=assembler_stdErr, shell=True)

    flags = sample_config.get("flags", [])
    if returnValue != 0:
        print("SPADES terminated with an error. Please check running folder",
              "for more informations")
    elif os.path.exists(os.path.join(outputName, "contigs.fasta")):
        subprocess.call(["cp", os.path.join(outputName, "contigs.fasta"),
                         "{}.ctg.fasta".format(outputName)])
        subprocess.call(["cp", os.path.join(outputName, "scaffolds.fasta"),
                         "{}.scf.fasta".format(outputName)])
        if "keep_tmp_files" not in flags:
            subprocess.call(["rm", "-r", outputName])
    else:
        print("something wrong with SPADES -> no contig file generated")
    os.chdir("..")
    return sample_config
def _allpaths_wildcard_path(read1):
    """Return ``read1`` with the mate index replaced by ``?``, else ``None``.

    Only file names containing ``_1.fastq`` or ``_R1_`` are recognised; for
    any other name an error is printed and ``None`` is returned.
    """
    path, fqfile = os.path.split(read1)
    if "_1.fastq" in fqfile:
        fqfile = fqfile.replace("_1.fastq", "_?.fastq")
    elif "_R1_" in fqfile:
        fqfile = fqfile.replace("_R1_", "_R?_")
    else:
        print("error file format not supported {}".format(fqfile))
        return None
    return os.path.join(path, fqfile)


def _run_allpaths(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample with ALLPATHS-LG and collect the results.

    Works inside ``<cwd>/allpaths``: writes ``in_groups.csv`` and
    ``in_libs.csv`` from the libraries, runs ``PrepareAllPathsInputs.pl`` and
    then ``RunAllPathsLG``. When ``final.assembly.fasta`` is produced, the
    contigs and scaffolds are copied to ``<output>.ctg.fasta`` and
    ``<output>.scf.fasta``; ``data_dir`` is removed when both copies succeed
    and ``"keep_tmp_files"`` is not listed in ``sample_config["flags"]``.
    In a dry run the two commands are only printed. Once the assembly folder
    has been entered, every exit path steps back out of it.

    :param global_config: dict; ``global_config["Tools"]["allpaths"]`` provides
        ``"bin"`` (folder holding the ALLPATHS programs) and ``"options"``
        (empty, or exactly ``["PLOIDY=2"]``);
        ``global_config["Tools"]["picard"]["bin"]`` is passed as
        ``PICARD_TOOLS_DIR``
    :param sample_config: dict; reads ``"output"`` and the optional ``"flags"``
    :param sorted_libraries_by_insert: ignored; the libraries are re-read
        through ``common._sort_libraries_by_insert(sample_config)``
    :returns: ``sample_config``
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "allpaths"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
    if _prepare_folder_structure("allpaths", assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)

    with open("in_groups.csv", "w") as inGroups_file, \
            open("in_libs.csv", "w") as inLibs_file:
        inGroups_file.write("group_name, library_name, file_name\n")
        inLibs_file.write("library_name, project_name, organism_name, type, "
                "paired, frag_size, frag_stddev, insert_size, insert_stddev, "
                "read_orientation,genomic_start, genomic_end\n")
        librariesForInLibs = []
        seen_inserts = set()  # each insert size is described only once
        group_name = 1
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            orientation = libraryInfo["orientation"]
            insert = libraryInfo["insert"]
            std = libraryInfo["std"]
            if orientation == "innie":
                prefix = "PE"
                lib_line = ("lib{}, genome, genome, fragment, 1, "
                            "{}, {}, , , inward, 0, 0\n")
            elif orientation == "outtie":
                prefix = "MP"
                lib_line = ("lib{}, genome, genome, fragment, 1, "
                            ", , {}, {}, outward, 0, 0\n")
            else:
                print("all paths support only innies and outties")
                continue
            wildcard = _allpaths_wildcard_path(read1)
            if wildcard is None:
                # unsupported file name: give up, but leave the folder first
                os.chdir("..")
                return sample_config
            inGroups_file.write("{}{}, lib{}, {}\n".format(
                prefix, group_name, insert, wildcard))
            group_name += 1
            if insert not in seen_inserts:
                seen_inserts.add(insert)
                librariesForInLibs.append(lib_line.format(insert, insert, std))
        for lib in librariesForInLibs:
            inLibs_file.write(lib)

    #NOW RUN ALLPATHS FOR REAL
    program = os.path.join(programBIN, "PrepareAllPathsInputs.pl")
    os.mkdir("data_dir")
    data_dir = os.path.join(assemblyDirectory, "data_dir")
    ploidy = "PLOIDY=1"
    if len(program_options) > 0:
        if len(program_options) > 1 or program_options[0] != "PLOIDY=2":
            print("Running ALlpaths only one parameter accepted as option",
                  "here: PLOIDY=2")
            os.chdir("..")
            return sample_config
        ploidy = "PLOIDY=2"
    command = [program, "DATA_DIR={}".format(data_dir), ploidy,
               "PICARD_TOOLS_DIR={}".format(
                   global_config["Tools"]["picard"]["bin"]),
               "FORCE_PHRED=True", "PHRED_64=False",
               "IN_GROUPS_CSV={}".format(
                   os.path.join(assemblyDirectory, "in_groups.csv")),
               "IN_LIBS_CSV={}".format(
                   os.path.join(assemblyDirectory, "in_libs.csv"))]

    if common.check_dryrun(sample_config):
        common.print_command(command)
        program = os.path.join(programBIN, "RunAllPathsLG")
        command = [program, "PRE={}".format(assemblyDirectory),
                   "REFERENCE_NAME=.", "DATA_SUBDIR=data_dir",
                   "RUN=allpaths", "SUBDIR=run"]
        common.print_command(command)
        os.chdir("..")
        return sample_config

    common.print_command(command)
    with open("allpaths_PrepareAllPathsInputs.stdOut", "w") as assembler_stdOut, \
            open("allpaths_PrepareAllPathsInputs.stdErr", "w") as assembler_stdErr:
        returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                      stderr=assembler_stdErr)
    flags = sample_config.get("flags", [])
    if returnValue != 0:
        print("ALLPATHS PrepareAllPathInputs terminated with an error. "
              "Please check running folder for more informations")
        os.chdir("..")
        return sample_config

    program = os.path.join(programBIN, "RunAllPathsLG")
    command = [program, "PRE={}".format(assemblyDirectory),
               "REFERENCE_NAME=.", "DATA_SUBDIR=data_dir",
               "RUN=allpaths", "SUBDIR=run", "HAPLOIDIFY=True"]
    common.print_command(command)
    with open("allpaths_RunAllPathsLG.stdOut", "w") as assembler_stdOut, \
            open("allpaths_RunAllPathsLG.stdErr", "w") as assembler_stdErr:
        returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                      stderr=assembler_stdErr)
    if returnValue != 0:
        print("ALLPATHS RunAllPathsLG terminated with an error. Please",
              "check running folder for more informations")
        os.chdir("..")
        return sample_config

    # save results
    assembly_dir = os.path.join("data_dir", "allpaths", "ASSEMBLIES", "run")
    if os.path.exists(os.path.join(assembly_dir, "final.assembly.fasta")):
        exit_code = subprocess.call(["cp",
            os.path.join(assembly_dir, "final.contigs.fasta"),
            "{}.ctg.fasta".format(outputName)])
        exit_code += subprocess.call(["cp",
            os.path.join(assembly_dir, "final.assembly.fasta"),
            "{}.scf.fasta".format(outputName)])
        # only drop the temporary data once both copies succeeded
        if "keep_tmp_files" not in flags and exit_code == 0:
            subprocess.call(["rm", "-r", "data_dir"])
    else:
        print("something wrong with Allpaths > no contig file generated")
    os.chdir("..")
    return sample_config
def _run_soapdenovo(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample with SOAPdenovo and collect contigs and scaffolds.

    Works inside ``<cwd>/soapdenovo``: writes ``configuration.txt`` with one
    ``[LIB]`` section per library, runs ``<bin> all`` from a ``runSOAP``
    sub-folder and, when the run succeeds and produced
    ``soapAssembly.scafSeq``, copies the results to ``<output>.scf.fasta``
    and ``<output>.ctg.fasta``. ``runSOAP`` is then removed unless
    ``"keep_tmp_files"`` is listed in ``sample_config["flags"]``.

    NOTE(review): an earlier definition of ``_run_soapdenovo`` exists in this
    file; being the later one, this definition is the one that stays bound.

    :param global_config: dict; ``global_config["Tools"]["soapdenovo"]["bin"]``
        is the executable to run
    :param sample_config: dict; reads ``"output"`` and the optional ``"kmer"``
        (default 54), ``"threads"`` (default 8) and ``"flags"`` entries
    :param sorted_libraries_by_insert: ignored; the libraries are re-read
        through ``common._sort_libraries_by_insert(sample_config)``
    :returns: ``sample_config``
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "soapdenovo"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) != 0:
        return sample_config
    os.chdir(assemblyDirectory)

    ########### HERE IT START THE SPECIFIC ASSEMBLER PART
    kmer = sample_config.get("kmer", 54)
    # 8 threads is the default for UPPMAX
    threads = ["-p", "{}".format(sample_config.get("threads", 8))]

    with open("configuration.txt", "w") as soap_config_file:
        #TODO make this a parameter in the options
        soap_config_file.write("max_rd_len=150\n")
        for rank, (library, libraryInfo) in enumerate(
                sorted_libraries_by_insert, 1):
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            insert = libraryInfo["insert"]
            soap_config_file.write("[LIB]\n")
            soap_config_file.write("avg_ins={}\n".format(insert))
            soap_config_file.write("rank={}\n".format(rank))
            soap_config_file.write("map_len=30\n")
            if orientation == "innie" or orientation == "none":
                soap_config_file.write("asm_flags=3\n")
                soap_config_file.write("pair_num_cutoff=3\n")
                soap_config_file.write("reverse_seq=0\n")
                if read2 is None:
                    soap_config_file.write("q={}\n".format(read1))
                else:
                    soap_config_file.write("q1={}\n".format(read1))
                    soap_config_file.write("q2={}\n".format(read2))
            elif orientation == "outtie":
                soap_config_file.write("asm_flags=2\n")
                soap_config_file.write("pair_num_cutoff=5\n")
                soap_config_file.write("reverse_seq=1\n")
                soap_config_file.write("q1={}\n".format(read1))
                soap_config_file.write("q2={}\n".format(read2))

    # the log files live in the assembly folder, next to configuration.txt
    with open("soap.stdOut", "w") as assembler_stdOut, \
            open("soap.stdErr", "w") as assembler_stdErr:
        os.makedirs(os.path.join(assemblyDirectory, "runSOAP"))
        os.chdir("runSOAP")
        #TODO : lots of missing options
        command = [
            programBIN, "all", "-s",
            "{}".format(os.path.join(assemblyDirectory, "configuration.txt")),
            "-K", "{}".format(kmer), "-L", "500", "-o", "soapAssembly",
            threads[0], threads[1]
        ]
        common.print_command(command)
        if common.check_dryrun(sample_config):
            # leave runSOAP and then the assembly folder
            os.chdir("..")
            os.chdir("..")
            return sample_config
        # keep the exit status: without it a failed run is reported as a
        # success and the error branch below can never be reached
        returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                      stderr=assembler_stdErr)
    os.chdir("..")

    flags = sample_config.get("flags", [])
    if returnValue != 0:
        print("SOAPdenovo terminated with an error. Please check running",
              "folder for more informations")
        os.chdir("..")
        return sample_config
    if os.path.exists(os.path.join("runSOAP", "soapAssembly.scafSeq")):
        subprocess.call([
            "cp", os.path.join("runSOAP", "soapAssembly.scafSeq"),
            "{}.scf.fasta".format(outputName)
        ])
        subprocess.call([
            "cp", os.path.join("runSOAP", "soapAssembly.contig"),
            "{}.ctg.fasta".format(outputName)
        ])
        if "keep_tmp_files" not in flags:
            subprocess.call(["rm", "-r", "runSOAP"])
    else:
        print("something wrong with SOAPdenovo -> no contig file generated")
    os.chdir("..")
    return sample_config