def run(global_config, sample_config):
    """Entry point for the QC pipeline.

    Runs either the tools listed in ``sample_config["tools"]`` (in the
    given order) or the default QC sequence, then dumps the final sample
    configuration to a ``.nougat`` YAML file.
    """
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    sample_config["commands"] = ""
    if "tools" in sample_config:
        # The user listed tools explicitly: execute them one after the
        # other.  Each _run_<tool> function is resolved at runtime in this
        # module; it receives and returns (a possibly updated) sample_config.
        for tool_name in sample_config["tools"]:
            tool_fn = getattr(sys.modules[__name__],
                              "_run_{}".format(tool_name))
            sample_config = tool_fn(global_config, sample_config,
                                    sorted_libraries_by_insert)
    else:
        # No tools specified: run the default QC pipeline.
        for default_fn in (_run_trimmomatic, _run_fastqc, _run_abyss):
            sample_config = default_fn(global_config, sample_config,
                                       sorted_libraries_by_insert)
    # Persist the (updated) sample configuration next to the results.
    with open("{}.nougat".format(sample_config.get("output", "sample")),
              "w") as f:
        yaml.dump(sample_config, f)
def run(global_config, sample_config):
    """Entry point for assembly evaluation.

    Sanity-checks the libraries, computes assembly statistics, rebuilds
    the reference (filtering out short contigs), then runs the requested
    evaluation tools or the default align/QA/FRC sequence.
    """
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    _check_libraries(sorted_libraries_by_insert)
    computeAssemblyStats(sample_config)
    # Filter out short contigs by building a new reference.
    sample_config = _build_new_reference(sample_config)
    if "tools" in sample_config:
        # Execute the user-selected tools one after the other, in the
        # specified order; each _run_<tool> is looked up at runtime and
        # may update sample_config.
        for tool_name in sample_config["tools"]:
            tool_fn = getattr(sys.modules[__name__],
                              "_run_{}".format(tool_name))
            sample_config = tool_fn(global_config, sample_config,
                                    sorted_libraries_by_insert)
    else:
        # Default evaluation pipeline: alignment, QA, then FRC.
        for default_fn in (_run_align, _run_qaTools, _run_FRC):
            sample_config = default_fn(global_config, sample_config,
                                       sorted_libraries_by_insert)
def run(global_config, sample_config):
    """Entry point for assembly: run every configured assembler.

    Falls back to SOAPdenovo when the configuration names no tools.
    """
    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
    # No (or empty) tool list from the user -> use the default assembler.
    if not ("tools" in sample_config and len(sample_config["tools"]) > 0):
        sample_config["tools"] = ["soapdenovo"]
    # Dispatch each tool to its _run_<tool> function, resolved at runtime.
    for tool_name in sample_config["tools"]:
        tool_fn = getattr(sys.modules[__name__], "_run_{}".format(tool_name))
        sample_config = tool_fn(global_config, sample_config,
                                sorted_libraries_by_insert)
def run(global_config, sample_config):
    """Run each assembler named in the sample configuration.

    When the configuration lists no tools, SOAPdenovo is used as the
    default assembler.
    """
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    # Default to SOAPdenovo when the tool list is missing or empty.
    configured_tools = sample_config.get("tools", [])
    if len(configured_tools) == 0:
        sample_config["tools"] = ["soapdenovo"]
    # Execute the commands now: each tool maps to a _run_<tool> function
    # in this module, looked up by name at runtime.
    for command in sample_config["tools"]:
        command_fn = getattr(sys.modules[__name__],
                             "_run_{}".format(command))
        sample_config = command_fn(global_config, sample_config,
                                   sorted_libraries_by_insert)
def _run_spades(global_config, sample_config, sorted_libraries_by_insert):
    """Run the SPAdes assembler on the configured libraries.

    Builds the SPAdes command line from the configured binary and options,
    adding paired-end entries for "innie"/"none" libraries (single-end when
    pair2 is missing) and mate-pair entries for "outtie" libraries, then
    copies the resulting contigs/scaffolds next to the assembly folder.
    Returns the (possibly updated) sample_config.
    """
    # ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "spades"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config

    # HERE STARTS THE SPECIFIC ASSEMBLER PART: build the command on-the-fly.
    command = "{} ".format(programBIN)
    for option in program_options:
        command += "{} ".format(option)
    peLibrary = 1
    mpLibrary = 1
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        if orientation == "innie" or orientation == "none":
            if read2 is None:
                # Single-ended library.
                command += "--pe{}-s {} ".format(peLibrary, read1)
            else:
                command += "--pe{}-1 {} --pe{}-2 {} ".format(
                    peLibrary, read1, peLibrary, read2)
            peLibrary += 1
        elif orientation == "outtie":
            command += "--mp{}-1 {} --mp{}-2 {} ".format(
                mpLibrary, read1, mpLibrary, read2)
            mpLibrary += 1
        else:
            # BUGFIX: .format() was applied only to the second literal, so
            # the orientation value was never interpolated into the message.
            print("orientation {} not supported.... why the program did "
                  "not fail earlier?".format(orientation))
    command += "-o {} ".format(outputName)
    common.print_command(command)

    returnValue = 0
    if common.check_dryrun(sample_config):
        # BUGFIX: restore the caller's working directory on dry runs too
        # (the original returned while still inside the assembly folder).
        os.chdir("..")
        return sample_config
    # BUGFIX: use context managers so the log handles are always closed.
    with open("spades.stdOut", "a") as assembler_stdOut, \
            open("spades.stdErr", "a") as assembler_stdErr:
        returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                      stderr=assembler_stdErr, shell=True)

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if os.path.exists(os.path.join(outputName, "contigs.fasta")):
            subprocess.call(["cp", os.path.join(outputName, "contigs.fasta"),
                             "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp", os.path.join(outputName, "scaffolds.fasta"),
                             "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", outputName])
        else:
            print("something wrong with SPADES -> no contig file generated")
    else:
        print("SPADES terminated with an error. Please check running folder",
              "for more informations")
    os.chdir("..")
    return sample_config
def _run_soapdenovo(global_config, sample_config, sorted_libraries_by_insert):
    """Run the SOAPdenovo assembler on the configured libraries.

    Writes a SOAPdenovo library configuration file ([LIB] sections with
    asm_flags/pair_num_cutoff/reverse_seq per orientation), runs the
    assembler inside ``runSOAP`` and copies the scaffold/contig files next
    to the assembly folder.  Returns the (possibly updated) sample_config.
    """
    # ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "soapdenovo"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config

    # HERE STARTS THE SPECIFIC ASSEMBLER PART
    kmer = sample_config.get("kmer", 54)
    threads = ["-p", "{}".format(sample_config.get("threads", 8))]  # UPPMAX default

    # Write the SOAPdenovo library configuration (closed deterministically).
    with open("configuration.txt", "w") as soap_config_file:
        soap_config_file.write("max_rd_len=150\n")  # TODO: make this a parameter in the options
        rank = 1
        for library, libraryInfo in sorted_libraries_by_insert:
            soap_config_file.write("[LIB]\n")
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            insert = libraryInfo["insert"]
            soap_config_file.write("avg_ins={}\n".format(insert))
            soap_config_file.write("rank={}\n".format(rank))
            rank += 1
            soap_config_file.write("map_len=30\n")
            if orientation == "innie" or orientation == "none":
                soap_config_file.write("asm_flags=3\n")
                soap_config_file.write("pair_num_cutoff=3\n")
                soap_config_file.write("reverse_seq=0\n")
                if read2 is None:
                    soap_config_file.write("q={}\n".format(read1))
                else:
                    soap_config_file.write("q1={}\n".format(read1))
                    soap_config_file.write("q2={}\n".format(read2))
            elif orientation == "outtie":
                soap_config_file.write("asm_flags=2\n")
                soap_config_file.write("pair_num_cutoff=5\n")
                soap_config_file.write("reverse_seq=1\n")
                soap_config_file.write("q1={}\n".format(read1))
                soap_config_file.write("q2={}\n".format(read2))

    os.makedirs(os.path.join(assemblyDirectory, "runSOAP"))
    # TODO: lots of missing options
    command = [programBIN, "all",
               "-s", "{}".format(os.path.join(assemblyDirectory,
                                              "configuration.txt")),
               "-K", "{}".format(kmer), "-L", "500", "-o", "soapAssembly",
               threads[0], threads[1]]
    common.print_command(command)
    returnValue = 0
    if common.check_dryrun(sample_config):
        os.chdir("..")
        return sample_config
    # Logs live in the assembly directory (not runSOAP) so they survive the
    # cleanup below.  BUGFIX: the exit status of subprocess.call was being
    # discarded, so failed runs were always reported as successful.
    with open("soap.stdOut", "w") as assembler_stdOut, \
            open("soap.stdErr", "w") as assembler_stdErr:
        os.chdir("runSOAP")
        returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                      stderr=assembler_stdErr)
    os.chdir("..")

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if os.path.exists(os.path.join("runSOAP", "soapAssembly.scafSeq")):
            subprocess.call(["cp", os.path.join("runSOAP",
                "soapAssembly.scafSeq"), "{}.scf.fasta".format(outputName)])
            subprocess.call(["cp", os.path.join("runSOAP",
                "soapAssembly.contig"), "{}.ctg.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runSOAP"])
        else:
            print("something wrong with SOAPdenovo -> no contig file generated")
    else:
        print("SOAPdenovo terminated with an error. Please check running",
              "folder for more informations")
    # (BUGFIX: removed an unreachable duplicated chdir/return after this.)
    os.chdir("..")
    return sample_config
def _run_masurca(global_config, sample_config, sorted_libraries_by_insert):
    """Run the MaSuRCA assembler on the configured libraries.

    Writes a MaSuRCA configuration file (DATA section with PE/JUMP library
    lines, PARAMETERS section sized from ``genomeSize``), generates
    ``assemble.sh`` inside ``runMASURCA`` and executes it, then copies the
    gap-closed contigs/scaffolds next to the assembly folder.  Returns the
    (possibly updated) sample_config.
    """
    # ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "masurca"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config

    # HERE STARTS THE SPECIFIC ASSEMBLER PART
    # BUGFIX: string.lowercase exists only in Python 2; on Python 3 it
    # raises AttributeError.  string.ascii_lowercase is the portable name.
    allTheLetters = string.ascii_lowercase
    with open("configuration.txt", "w") as masurca_config_file:
        masurca_config_file.write("DATA\n")
        libraryPE = "p"
        libraryPEnum = 0
        libraryMP = "m"
        libraryMPnum = 0
        # TODO: single ended reads
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            insert = libraryInfo["insert"]
            std = libraryInfo["std"]
            if orientation == "innie":
                if read2 is not None:
                    configurationLine = "PE = {}{} {} {} {} {}".format(
                        libraryPE, allTheLetters[libraryPEnum],
                        insert, std, read1, read2)
                    masurca_config_file.write(
                        "{}\n".format(configurationLine))
                    # TODO: check when more than 26 PE libraries are specified
                    libraryPEnum += 1
            elif orientation == "outtie":
                configurationLine = "JUMP = {}{} {} {} {} {}".format(
                    libraryMP, allTheLetters[libraryMPnum],
                    insert, std, read1, read2)
                masurca_config_file.write("{}\n".format(configurationLine))
                # TODO: check when more than 26 MP libraries are specified
                libraryMPnum += 1
        masurca_config_file.write("END\n")
        masurca_config_file.write("\n")
        masurca_config_file.write("PARAMETERS\n")
        # k-mer size for the deBruijn graph: values between 25 and 101 are
        # supported; "auto" computes the optimal size from the read data
        # and GC content.
        masurca_config_file.write("GRAPH_KMER_SIZE=auto\n")
        # 1 for Illumina-only assemblies; 0 if you have 2x or more long
        # (Sanger, 454) reads.
        masurca_config_file.write("USE_LINKING_MATES=1\n")
        # Cap jumping-library coverage depending on genome length (see the
        # MaSuRCA manual for the rationale behind these thresholds).
        if sample_config["genomeSize"] > 10000000:
            masurca_config_file.write("LIMIT_JUMP_COVERAGE = 1000\n")
        else:
            masurca_config_file.write("LIMIT_JUMP_COVERAGE = 60\n")
        # Additional Celera Assembler parameters (processors/batch sizes are
        # computed automatically).  For mammals keep cgwErrorRate <= 0.15.
        if sample_config["genomeSize"] > 1500000000:
            masurca_config_file.write("CA_PARAMETERS = ovlMerSize=30 "
                                      "cgwErrorRate=0.15 ovlMemory=4GB\n")
        else:
            masurca_config_file.write("CA_PARAMETERS = ovlMerSize=30 "
                                      "cgwErrorRate=0.25 ovlMemory=4GB\n")
        threads = sample_config.get("threads", 8)  # default for UPPMAX
        masurca_config_file.write("NUM_THREADS= {}\n".format(threads))
        # Mandatory jellyfish hash size: about 10x the genome size.
        JF_SIZE = sample_config["genomeSize"] * 11
        masurca_config_file.write("JF_SIZE={}\n".format(JF_SIZE))
        # 1 trims long runs of 3' homopolymers (e.g. GGGGGGGG) — use it for
        # high-GC genomes; 0 keeps them.
        masurca_config_file.write("DO_HOMOPOLYMER_TRIM=0\n")
        masurca_config_file.write("END\n")
        masurca_config_file.write("\n")

    if common.check_dryrun(sample_config):
        os.chdir("..")
        return sample_config

    os.mkdir("runMASURCA")
    # BUGFIX: log handles were never closed; a with-block guarantees it.
    with open("masurca.stdOut", "w") as masurca_stdOut, \
            open("masurca.stdErr", "w") as masurca_stdErr:
        os.chdir("runMASURCA")
        command = [os.path.join(programBIN, "bin/masurca"),
                   "../configuration.txt"]
        common.print_command(command)
        subprocess.call(command, stdout=masurca_stdOut, stderr=masurca_stdErr)
        if not os.path.exists("assemble.sh"):
            print("MaSuRCA: assemble.sh not created. Unknown failure")
            # BUGFIX: previously returned from inside runMASURCA, leaving
            # the process in the wrong working directory.
            os.chdir(currentDirectory)
            return sample_config
        command = ["./assemble.sh"]
        common.print_command(command)
        returnValue = subprocess.call(command, stdout=masurca_stdOut,
                                      stderr=masurca_stdErr)
    os.chdir("..")

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if os.path.exists(os.path.join(
                "runMASURCA", "CA/10-gapclose/genome.scf.fasta")):
            subprocess.call(["cp", os.path.join(
                "runMASURCA", "CA/10-gapclose/genome.ctg.fasta"),
                "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp", os.path.join(
                "runMASURCA", "CA/10-gapclose/genome.scf.fasta"),
                "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runMASURCA"])
        else:
            print("something wrong with MaSuRCA -> no contig file generated")
    else:
        # BUGFIX: the error branch used to return before restoring the
        # caller's working directory.
        print("MaSuRCA terminated with an error. Please check running folder",
              "for more informations")
    os.chdir("..")
    return sample_config
def _run_cabog(global_config, sample_config, sorted_libraries_by_insert):
    """Run the CABOG (Celera) assembler on the configured libraries.

    Converts each library to a ``.frg`` file with ``fastqToCA`` (stdout is
    redirected by the shell), runs ``runCA`` over all fragment files, then
    copies the 9-terminator contigs/scaffolds next to the assembly folder.
    Returns the (possibly updated) sample_config.
    """
    # ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "cabog"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in cabog case there is no executable: helpers are taken from the bin dir
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config

    # HERE STARTS THE SPECIFIC ASSEMBLER PART
    sys.path.insert(0, programBIN)
    libraries = 1
    for library, libraryInfo in sorted_libraries_by_insert:
        command_fastqToCA = os.path.join(programBIN, "fastqToCA")
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        command_fastqToCA += " -libraryname "
        command_fastqToCA += " {}_{}".format(outputName, libraries)
        command_fastqToCA += " -insertsize "
        command_fastqToCA += " {} {} ".format(insert, std)
        command_fastqToCA += " -technology "
        command_fastqToCA += " illumina "
        command_fastqToCA += " -type "
        command_fastqToCA += " illumina "
        if orientation == "innie" or orientation == "none":
            command_fastqToCA += " -innie "
            if read2 is None:
                command_fastqToCA += " -reads "
                command_fastqToCA += " {} ".format(read1)
            else:
                command_fastqToCA += " -mates "
                command_fastqToCA += " {},{} ".format(read1, read2)
        elif orientation == "outtie":
            command_fastqToCA += " -outtie "
            command_fastqToCA += " -mates "
            command_fastqToCA += " {},{} ".format(read1, read2)
        # The shell redirection owns stdout and writes the .frg file.
        command_fastqToCA += " > "
        command_fastqToCA += " {}_{}.frg ".format(outputName, libraries)
        common.print_command(command_fastqToCA)
        if not common.check_dryrun(sample_config):
            # BUGFIX: a stdout log file was opened but never used (stdout
            # goes through the shell redirection above); only stderr is
            # captured, and its handle is now closed deterministically.
            with open("cabogfastqToCA.stdErr", "w") as cabog_stdErr:
                subprocess.call(command_fastqToCA, stderr=cabog_stdErr,
                                shell=True)
        libraries += 1

    command_runCA = os.path.join(programBIN, "runCA")
    command_runCA += " -d runCABOGfolder -p {} *frg".format(outputName)
    common.print_command(command_runCA)
    if common.check_dryrun(sample_config):
        # BUGFIX: restore the caller's working directory on dry runs too.
        os.chdir("..")
        return sample_config
    # BUGFIX: runCA log handles were never closed; a with-block guarantees it.
    with open("cabog_runCA.stdOut", "w") as cabog_stdOut, \
            open("cabog_runCA.stdErr", "w") as cabog_stdErr:
        returnValue = subprocess.call(command_runCA, stdout=cabog_stdOut,
                                      stderr=cabog_stdErr, shell=True)

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        # assembly succeeded: save the assembly and optionally clean up
        if os.path.exists(os.path.join("runCABOGfolder", "9-terminator",
                                       "{}.ctg.fasta".format(outputName))):
            subprocess.call(["cp", os.path.join("runCABOGfolder",
                "9-terminator", "{}.ctg.fasta".format(outputName)),
                "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp", os.path.join("runCABOGfolder",
                "9-terminator", "{}.scf.fasta".format(outputName)),
                "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runCABOGfolder"])
        else:
            print("something wrong with CABOG -> no contig file generated")
    else:
        print("CABOG terminated with an error. Please check running folder",
              "for more informations")
    os.chdir("..")
    return sample_config
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    """Run ABySS (abyss-pe) on the configured libraries.

    Groups the libraries into single-end (``se``), paired-end (``lib``,
    keyed by insert size) and mate-pair (``mp``) sets, builds the abyss-pe
    command line, runs it inside ``runABySS`` and copies the resulting
    contigs/scaffolds next to the assembly folder.  Returns the (possibly
    updated) sample_config.
    """
    # ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "abyss"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in abyss case there is no executable: abyss-pe lives in the bin dir
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure("abyss", assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config

    # HERE STARTS THE SPECIFIC ASSEMBLER PART
    program = os.path.join(programBIN, "abyss-pe")
    command = "{} ".format(program)
    command += "np={} ".format(sample_config.get("threads", 8))  # UPPMAX default
    command += "k={} ".format(sample_config.get("kmer", 54))
    libraries = {}
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        if orientation == "innie" or orientation == "none":
            if read2 is None:
                # BUGFIX: single-end files used to be concatenated with no
                # separator ("se='r1r2"); collect them and join with spaces.
                libraries.setdefault("se", []).append(read1)
            else:
                libName = insert  # lib name is the insert size
                libraries.setdefault("lib", {}).setdefault(libName, "")
                libraries["lib"][libName] += "{} {} ".format(read1, read2)
        else:
            libName = format(insert)
            libraries.setdefault("mp", {}).setdefault(libName, "")
            libraries["mp"][libName] += "{} {} ".format(read1, read2)

    # now create the command
    command += "name={} ".format(outputName)
    librariesSE = ""
    librariesPE = ""
    librariesMP = ""
    if "se" in libraries:
        librariesSE = "se='{}'".format(" ".join(libraries["se"]))
    if "lib" in libraries:
        lib = "lib='"
        for libPE, libPEreads in sorted(libraries["lib"].items()):
            lib += "lib{} ".format(libPE)
            librariesPE += " lib{}='{}' ".format(libPE, libPEreads)
        lib += "' "
        command += "{} ".format(lib)
    if "mp" in libraries:
        mp = "mp='"
        for libMP, libMPreads in sorted(libraries["mp"].items()):
            mp += "lib{} ".format(libMP)
            librariesMP += " lib{}='{}' ".format(libMP, libMPreads)
        mp += "' "
        command += "{} ".format(mp)
    command += "{} ".format(librariesSE)
    command += "{} ".format(librariesPE)
    command += "{} ".format(librariesMP)
    common.print_command(command)

    if common.check_dryrun(sample_config):
        os.chdir("..")
        return sample_config
    os.makedirs(os.path.join(assemblyDirectory, "runABySS"))
    # BUGFIX: the logs were opened before the dry-run check and leaked on
    # early return; open them only when actually running and close them
    # deterministically.  They stay in the assembly directory so they
    # survive the runABySS cleanup below.
    with open("abyss.stdOut", "a") as assembler_stdOut, \
            open("abyss.stdErr", "a") as assembler_stdErr:
        os.chdir("runABySS")
        returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                      stderr=assembler_stdErr, shell=True)
    os.chdir("..")

    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if os.path.exists(os.path.join(
                "runABySS", "{}-contigs.fa".format(outputName))):
            subprocess.call(["cp", os.path.join(
                "runABySS", "{}-contigs.fa".format(outputName)),
                "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp", os.path.join(
                "runABySS", "{}-scaffolds.fa".format(outputName)),
                "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runABySS"])
        else:
            # BUGFIX: this branch used to return without restoring the
            # caller's working directory; now it falls through to the
            # common chdir below.
            print("something wrong with ABySS -> no contig file generated")
    else:
        print("ABySS terminated with an error. Please check running folder",
              "for more informations")
    os.chdir("..")
    return sample_config
def _run_qc_report(global_config, sample_config):
    """Produce the QC PDF report and collect the important results.

    Builds a PDF (via the project-local ``pdf`` helper) summarising each
    QC tool that was run (trimmomatic, fastqc, abyss k-mer profile, align,
    kmergenie), and copies the per-tool result files into a ``results``
    folder next to the current directory.  Side effects: changes the
    working directory into ``results`` while running and restores it at
    the end; creates ``results/report`` and per-tool result sub-folders.
    """
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    ### retrive all info needed to write the report
    # Sample/project names fall back to placeholders when not configured.
    sampleName = "sample"
    if "output" in sample_config:
        sampleName = sample_config["output"]
    projectName = "anonymous_project"
    if "projectName" in sample_config:
        projectName = sample_config["projectName"]
    currentDir = os.getcwd()
    workingDir = os.path.join(currentDir, "results")
    if not os.path.exists(workingDir):
        os.makedirs(workingDir)
    # NOTE: all relative paths below ("fastqc", "alignments", ...) are
    # resolved against this results directory.
    os.chdir(workingDir)
    reportDir = os.path.join(workingDir, "report")
    if not os.path.exists(reportDir):
        os.makedirs(reportDir)
    PDFtitle = os.path.join(workingDir, "report",
        "{}.pdf".format(sample_config["output"]))
    # this you cannot do in rLab which is why I wrote the helper initially
    TABLE_WIDTH = 540

    # Page-layout theme passed to the pdf helper.
    class MyTheme(DefaultTheme):
        doc = {
            'leftMargin': 25,
            'rightMargin': 25,
            'topMargin': 20,
            'bottomMargin': 25,
            'allowSplitting': False
            }

    # let's create the doc and specify title and author
    doc = pdf.Pdf('{} {}'.format(projectName, sampleName),
        'NGI-Stockholm, Science for Life Laboratory')
    # now we apply our theme
    doc.set_theme(MyTheme)
    # give me some space
    doc.add_spacer()
    # this header defaults to H1; the logo is resolved relative to this
    # script's own location
    scriptDirectory = os.path.split(os.path.abspath(__file__))[0]
    logo_path = os.path.join(scriptDirectory, '../pictures/ngi_scilife.png')
    doc.add_image(logo_path, 540, 50, pdf.CENTER)
    # give me some space
    doc.add_spacer()
    doc.add_header('NGI-Stockholm -- Science For Life Laboratory')
    doc.add_header('Best-practice analysis for quality checking report')
    doc.add_header('{} -- {}'.format(projectName, sampleName))
    # give me some space
    doc.add_spacer()
    doc.add_paragraph("For sample {} belonging to the project {} "
        "NGI-Stockholm best-practice analysis for quality checking has "
        "been performed. For mate pair libraries produced with Nextera, "
        "best-practice analysis described at this address has been "
        "performed: http://res.illumina.com/documents/products/technotes/"
        "technote_nextera_matepair_data_processing.pdf".format(sampleName,
        projectName))
    doc.add_spacer()
    # Default tool list, overridden by the sample configuration when set.
    tools = ["trimmomatic", "fastqc", "abyss", "align", "kmergenie"]
    if "tools" in sample_config and len(sample_config["tools"]) > 0:
        tools = sample_config["tools"]
    doc.add_paragraph("The following tools have been employed "
        "(tools are listed in order of execution):")
    bollet_list = []
    for tool in tools:
        if tool != "align":
            program_path = global_config["Tools"][tool]["bin"]
            bollet_list.append("{} : {}".format(tool, program_path))
        else:
            # "align" is a composite step: list all three underlying tools.
            bollet_list.append("{} : {}".format(tool,
                global_config["Tools"]["bwa"]["bin"]))
            bollet_list.append("{} : {}".format(tool,
                global_config["Tools"]["samtools"]["bin"]))
            bollet_list.append("{} : {}".format(tool,
                global_config["Tools"]["picard"]["bin"]))
    doc.add_list(bollet_list)
    doc.add_spacer()
    doc.add_paragraph("The results from each tool is reported in the "
        "following sections. Moreover you will find all the results and "
        "commands that have been run in the delivery folder on Uppmax")

    # One report section (page) per tool.
    for tool in tools:
        doc.add_pagebreak()
        doc.add_header(tool.title() , pdf.H2)
        if tool == "trimmomatic":
            doc.add_paragraph("Reads (both paired and mate pairs) can "
                "contain parts of the adapter sequence or, in the case of "
                "mate pairs, part of the linker sequence. Illumina "
                "recommends to remove the adapter before use of the reads "
                "in any downstream analysis (this is mandatory for mate "
                "pairs).")
            doc.add_paragraph("Adapter sequences removed are:")
            adapter_file = sample_config["adapters"]
            adapters = []
            # The adapter FASTA alternates header/sequence lines; keep the
            # sequence lines only (odd indices).
            with open(adapter_file) as afile:
                lines = afile.readlines()
                for index in range(1, len(lines), 2):
                    adapters.append(lines[index].rstrip())
            doc.add_list(adapters)
            doc.add_spacer()
            trimmomatic_table_part1 = [[sampleName, "#orig_pairs",
                "#survived_pairs"]] # this is the header row
            trimmomatic_table_part2 = [[sampleName,"#survived_fw_only",
                "#survived_rv_only", "#discarded"]] # header row, part 2
            total_orig_pairs = 0
            total_survived_pairs = 0
            total_survived_fw_only = 0
            total_survived_rv_only = 0
            total_discarded = 0
            for library, libraryInfo in sorted_libraries_by_insert:
                # Run name is the stderr file name minus its fixed suffix.
                runName = os.path.basename(libraryInfo["trimmomatic"]).split(
                    "_1_trimmomatic.stdErr")[0]
                with open(libraryInfo["trimmomatic"]) as trimmomatic_output:
                    # The summary line is the second-to-last line of the
                    # trimmomatic stderr output.
                    lines = trimmomatic_output.readlines()
                    result_line = lines[-2].rstrip()
                    match_string = re.compile("Input Read Pairs: (\d+) Both "
                        "Surviving: (\d+) \(.+\) Forward Only Surviving: "
                        "(\d+) \(.+\) Reverse Only Surviving: (\d+) \(.+\) "
                        "Dropped: (\d+) \(.+\)")
                    read_pairs = int(match_string.match(
                        result_line).group(1))
                    survived_pairs = int(match_string.match(
                        result_line).group(2))
                    survived_fw_only = int(match_string.match(
                        result_line).group(3))
                    survived_rv_only = int(match_string.match(
                        result_line).group(4))
                    discarded = int(match_string.match(
                        result_line).group(5))
                    # Percentages are relative to the library's input pairs.
                    read_pairs_perc = "({0:.0f}%)".format(
                        (float(survived_pairs)/read_pairs) * 100)
                    survived_fw_only_perc = "({0:.0f}%)".format(
                        (float(survived_fw_only)/read_pairs) * 100)
                    survived_rv_only_perc = "({0:.0f}%)".format(
                        (float(survived_rv_only)/read_pairs) * 100)
                    survived_discarded_perc = "({0:.0f}%)".format(
                        (float(discarded)/read_pairs) * 100)
                    total_orig_pairs += read_pairs
                    total_survived_pairs += survived_pairs
                    total_survived_fw_only += survived_fw_only
                    total_survived_rv_only += survived_rv_only
                    total_discarded += discarded
                    # these are the other rows
                    trimmomatic_table_part1.append([runName,read_pairs,
                        "{} {}".format(survived_pairs, read_pairs_perc)])
                    trimmomatic_table_part2.append([runName,
                        "{} {}".format(survived_fw_only,
                            survived_fw_only_perc),
                        "{} {}".format(survived_rv_only,
                            survived_rv_only_perc),
                        "{} {}".format(discarded, survived_discarded_perc)])
            # Overall percentages across all libraries.
            survived_pairs_perc = "({0:.0f}%)".format(
                (float(total_survived_pairs)/total_orig_pairs) * 100)
            survived_survived_fw_only_perc = "({0:.0f}%)".format(
                (float(total_survived_fw_only)/total_orig_pairs) * 100)
            survived_survived_rv_only_perc = "({0:.0f}%)".format(
                (float(total_survived_rv_only)/total_orig_pairs) * 100)
            survived_discarded_perc = "({0:.0f}%)".format(
                (float(total_discarded)/total_orig_pairs) * 100)
            trimmomatic_table_part1.append(["total", total_orig_pairs,
                "{} {}".format(total_survived_pairs, survived_pairs_perc)])
            # last row is the sum
            # NOTE(review): this row reuses the LAST library's
            # survived_fw_only/survived_rv_only/discarded values instead of
            # the total_* accumulators (and ignores the
            # survived_survived_*_perc computed above) — looks like a bug;
            # confirm intended values before changing.
            trimmomatic_table_part2.append(["total", "{} {}".format(
                survived_fw_only, survived_fw_only_perc),
                "{} {}".format(survived_rv_only, survived_rv_only_perc),
                "{} {}".format(discarded, survived_discarded_perc)])
            doc.add_table(trimmomatic_table_part1, TABLE_WIDTH)
            doc.add_spacer()
            doc.add_table(trimmomatic_table_part2, TABLE_WIDTH)
            ##now save the trimmed reads
            # NOTE(review): libraryInfo leaks out of the loop above, so the
            # source directory is taken from the last library processed.
            trimmomaticDir = os.path.split(libraryInfo["trimmomatic"])[0]
            trimmomaticResultDir = os.path.join(workingDir, "fastq_trimmed")
            if not os.path.exists(trimmomaticResultDir):
                os.makedirs(trimmomaticResultDir)
            filesToCopy = [os.path.join(trimmomaticDir, f) for f in
                os.listdir(trimmomaticDir)
                if (os.path.isfile(os.path.join(trimmomaticDir,f))
                and re.search('.gz$',f))]
            for source in filesToCopy:
                dest = os.path.join("fastq_trimmed" ,
                    os.path.split(source)[1])
                if not os.path.isfile(dest):
                    shutil.copyfile(source, dest)
        if tool == "fastqc" and "fastqc" in sample_config:
            fastqc_dir = sample_config["fastqc"]
            # One pair of plots per fastqc run sub-directory.
            for fastqc_run in [dir for dir in os.listdir(fastqc_dir)
                    if os.path.isdir(os.path.join(fastqc_dir, dir))]:
                fastqc_run_dir = os.path.join(fastqc_dir, fastqc_run,
                    "Images")
                doc.add_image(os.path.join(fastqc_run_dir,
                    "per_base_quality.png"), 400, 180, pdf.CENTER,
                    "{} -- Per Base Quality".format(fastqc_run))
                fastqc_run_dir = os.path.join(fastqc_dir, fastqc_run,
                    "Images")
                doc.add_image(os.path.join(fastqc_run_dir,
                    "sequence_length_distribution.png"), 400, 180,
                    pdf.CENTER,
                    "{} -- Sequence Length Distribution".format(fastqc_run))
            #If I have not yet copied fastqc results do it
            if not os.path.exists("fastqc"):
                dirsToBeCopied = [os.path.join(fastqc_dir, f) for f in
                    os.listdir(fastqc_dir)
                    if os.path.isdir(os.path.join(fastqc_dir, f))]
                for source in dirsToBeCopied:
                    dest = os.path.join("fastqc",
                        os.path.split(source)[1])
                    if not os.path.exists(dest):
                        shutil.copytree(source, dest)
        if tool == "abyss" and "abyss" in sample_config:
            doc.add_paragraph("A possible way to assess the complexity of a "
                "library even in absence of a reference sequence is to "
                "look at the kmer profile of the reads. The idea is to "
                "count all the kmers (i.e., sequence of length k) that occur "
                "in the reads. In this way it is possible to know how many "
                "kmers occur 1,2,..., N times and represent this as a "
                "plot. This plot tell us for each x, how many k-mers "
                "(y-axis) are present in the dataset in exactly x-copies. "
                "In an ideal world (no errors in sequencing, no bias, no "
                "repeating regions) this plot should be as close as "
                "possible to a gaussian distribution. In reality we will "
                "always see a peak for x=1 (i.e., the errors) and another "
                "peak close to the expected coverage. If the genome is "
                "highly heterozygous a second peak at half of the coverage "
                "can be expected.")
            kmer_1_200 = os.path.join(sample_config["abyss"],
                "kmer_coverage.png")
            doc.add_image(kmer_1_200, 500, 300, pdf.CENTER,
                "kmer profile with k={}.".format(sample_config["kmer"]))
            #copy the results in resutls
            if not os.path.exists("kmer_analysis"):
                os.mkdir("kmer_analysis")
            kmerDir = sample_config["abyss"]
            # Keep the plots plus the raw histogram file.
            filesToCopy = [os.path.join(kmerDir, f) for f in
                os.listdir(kmerDir)
                if (os.path.isfile(os.path.join(kmerDir,f))
                and re.search('.png$',f))]
            filesToCopy.append(os.path.join(kmerDir, "histogram.hist"))
            for source in filesToCopy:
                dest = os.path.join("kmer_analysis",
                    os.path.split(source)[1])
                if not os.path.exists(dest):
                    shutil.copyfile(source, dest)
        if tool == "align" and "alignments" in sample_config:
            # NOTE(review): only the first alignment entry is reported;
            # entries appear to be (?, path, prefix) tuples — confirm
            # against the producer of sample_config["alignments"].
            alignments = sample_config["alignments"][0]
            alignment_path = alignments[1]
            alignment_prefix = alignments[2]
            align_dir = os.path.split(alignment_path)[0]
            doc.add_header("{} -- Collect Insert Size Metrics".format(
                sampleName) , pdf.H3)
            # Picard CollectInsertSizeMetrics: line 7 is the metrics header,
            # the following lines are the metric rows.
            with open(os.path.join(align_dir,
                    "{}.collectInsertSize.txt".format(alignment_prefix))) \
                    as collectInsertSize:
                lines = collectInsertSize.readlines()
                line = lines[6].rstrip().split("\t") # this is the header row
                insertSize_table = [[line[7], line[6], line[4], line[5]]]
                line = lines[7].rstrip().split("\t")
                insertSize_table.append([line[7], line[6], line[4],
                    line[5]])
                line = lines[8].rstrip().split("\t")
                insertSize_table.append([line[7], line[6], line[4],
                    line[5]])
                line = lines[9].rstrip().split("\t")
                insertSize_table.append([line[7], line[6], line[4],
                    line[5]])
                doc.add_table(insertSize_table, TABLE_WIDTH)
            doc.add_spacer()
            full_path_to_pdf = os.path.join(align_dir,
                "{}.collectInsertSize.pdf".format(alignment_prefix))
            doc.add_paragraph("Insert size plot can be found in the result "
                "directory: {}".format(os.path.join("alignments",
                "{}.collectInsertSize.pdf".format(alignment_prefix))))
            doc.add_spacer()
            doc.add_header("{} -- Duplicate Metrics".format(sampleName),
                pdf.H3)
            # Picard MarkDuplicates metrics: header row plus one data row,
            # split into three tables to fit the page width.
            with open(os.path.join(align_dir,
                    "{}.markDuplicates.txt".format(alignment_prefix))) as \
                    collectInsertSize:
                lines = collectInsertSize.readlines()
                line = lines[6].rstrip().split("\t") # this is the header row
                duplication_table_part1 = [line[0:3]]
                duplication_table_part2 = [line[4:6]]
                duplication_table_part3 = [line[7:9]]
                line = lines[7].rstrip().split("\t")
                duplication_table_part1.append(line[0:3])
                duplication_table_part2.append(line[4:6])
                duplication_table_part3.append(line[7:9])
                doc.add_table(duplication_table_part1, TABLE_WIDTH)
                doc.add_spacer()
                doc.add_table(duplication_table_part2, TABLE_WIDTH)
                doc.add_spacer()
                doc.add_table(duplication_table_part3, TABLE_WIDTH)
                doc.add_spacer()
            full_path_to_bam = os.path.join(align_dir,
                "{}_noDup.bam".format(alignment_prefix))
            doc.add_paragraph("Bam file with marked duplicate reads can be "
                "found at: {}".format(os.path.join("alignments",
                "{}_noDup.bam".format(alignment_prefix))))
            doc.add_spacer()
            #copy the results in resutls
            if not os.path.exists("alignments"):
                os.mkdir("alignments")
            filesToCopy = [os.path.join(align_dir, f) for f in
                os.listdir(align_dir)
                if (os.path.isfile(os.path.join(align_dir,f))
                and re.search('{}'.format(alignment_prefix),f))]
            for source in filesToCopy:
                dest = os.path.join("alignments",
                    os.path.split(source)[1])
                if not os.path.exists(dest):
                    shutil.copyfile(source, dest)
        if tool == "kmergenie" and "kmergenie" in sample_config:
            doc.add_paragraph("Assemblers using a de Bruijn graph strategy "
                "for contig construction (such as Velvet, ABySS and "
                "SOAPdenovo) fractures the reads into k-sized substrings "
                "(k-mers). The k-mer size is vital for the performance of "
                "these assemblers, and is usually selected considering "
                "several trade-offs between the size and accuracy of the "
                "produced contigs. Some assemblers choose the k-mer size "
                "automatically or builds several assemblies (using "
                "different k-mers) and / or relies on user input. "
                "Kmergenie is a lightweight program that suggests a best "
                "k-mer size based on their relative abundance in the "
                "genomic reads.")
            kmerdir = sample_config["kmergenie"]
            doc.add_image(os.path.join(kmerdir,"histograms.dat.png"),
                400, 300, pdf.CENTER,
                ("The plot should be roughly concave and have "
                "a clear global maximum, if not the predicted best "
                "k is likely to be inaccurate"))
            #copy everything to results
            dest = os.path.join(os.getcwd(), "kmergenie")
            if not os.path.exists(dest):
                shutil.copytree(kmerdir, dest)

    # Write the PDF to disk.
    doc.render(PDFtitle)
    # Copy the pipeline files and commands run to the report directory
    filesToCopy = glob.glob(currentDir+"/{}_QCcontrol.*".format(sampleName))
    for cfile in filesToCopy:
        shutil.copyfile(cfile, os.path.join(reportDir,
            os.path.basename(cfile)))
    with open(os.path.join(reportDir, "commands.txt"), "w") as f:
        f.write(sample_config.get("commands", ""))
    # Restore the caller's working directory.
    os.chdir(currentDir)
def _run_spades(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample libraries with SPAdes.

    Builds the spades.py command line from the sorted libraries (innie /
    single-end reads become --pe options, outtie mate pairs become --mp
    options), runs it inside the dedicated "spades" working folder and, on
    success, copies contigs and scaffolds to <output>.ctg.fasta and
    <output>.scf.fasta.

    :param global_config: tool paths/options under global_config["Tools"]
    :param sample_config: sample description; reads "output", "flags"
    :param sorted_libraries_by_insert: ignored, recomputed from sample_config
    :returns: the (possibly updated) sample_config
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "spades"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    ########### HERE STARTS THE SPECIFIC ASSEMBLER PART
    command = "{} ".format(programBIN)
    for option in program_options:
        command += "{} ".format(option)
    # create the library options on-the-fly
    peLibrary = 1
    mpLibrary = 1
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        if orientation == "innie" or orientation == "none":
            if read2 is None:
                command += "--pe{}-s {} ".format(peLibrary, read1)
            else:
                command += "--pe{}-1 {} --pe{}-2 {} ".format(
                    peLibrary, read1, peLibrary, read2)
            peLibrary += 1
        elif orientation == "outtie":
            command += "--mp{}-1 {} --mp{}-2 {} ".format(
                mpLibrary, read1, mpLibrary, read2)
            mpLibrary += 1
        else:
            # BUGFIX: format() used to be applied to the wrong string, so
            # the offending orientation never appeared in the message
            print("orientation {} not supported.... why the program did "
                  "not fail earlier?".format(orientation))
    command += "-o {} ".format(outputName)
    common.print_command(command)
    returnValue = 0
    if not common.check_dryrun(sample_config):
        # context managers guarantee that the log files are closed
        with open("spades.stdOut", "a") as assembler_stdOut, \
                open("spades.stdErr", "a") as assembler_stdErr:
            returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                          stderr=assembler_stdErr, shell=True)
    else:
        # BUGFIX: restore the working directory so a dry run does not leave
        # the process inside the assembly folder
        os.chdir("..")
        return sample_config
    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if os.path.exists(os.path.join(outputName, "contigs.fasta")):
            subprocess.call(["cp", os.path.join(outputName, "contigs.fasta"),
                             "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp", os.path.join(outputName, "scaffolds.fasta"),
                             "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", outputName])
        else:
            print("something wrong with SPADES -> no contig file generated")
    else:
        print("SPADES terminated with an error. Please check running folder",
              "for more informations")
    os.chdir("..")
    return sample_config
def _run_soapdenovo(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample libraries with SOAPdenovo.

    Writes the SOAPdenovo configuration file (one [LIB] section per
    library), runs "SOAPdenovo all" inside <cwd>/soapdenovo/runSOAP and, on
    success, copies the scaffold/contig output to <output>.scf.fasta and
    <output>.ctg.fasta.

    :param global_config: tool paths/options under global_config["Tools"]
    :param sample_config: reads "output", "kmer", "threads", "flags"
    :param sorted_libraries_by_insert: ignored, recomputed from sample_config
    :returns: the (possibly updated) sample_config
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "soapdenovo"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    ########### HERE STARTS THE SPECIFIC ASSEMBLER PART
    kmer = sample_config.get("kmer", 54)
    # default thread count is the UPPMAX node size
    threads = ["-p", "{}".format(sample_config.get("threads", 8))]
    with open("configuration.txt", "w") as soap_config_file:
        # TODO make the max read length a parameter in the options
        soap_config_file.write("max_rd_len=150\n")
        rank = 1
        for library, libraryInfo in sorted_libraries_by_insert:
            soap_config_file.write("[LIB]\n")
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            insert = libraryInfo["insert"]
            soap_config_file.write("avg_ins={}\n".format(insert))
            soap_config_file.write("rank={}\n".format(rank))
            rank += 1
            soap_config_file.write("map_len=30\n")
            if orientation == "innie" or orientation == "none":
                soap_config_file.write("asm_flags=3\n")
                soap_config_file.write("pair_num_cutoff=3\n")
                soap_config_file.write("reverse_seq=0\n")
                if read2 is None:
                    soap_config_file.write("q={}\n".format(read1))
                else:
                    soap_config_file.write("q1={}\n".format(read1))
                    soap_config_file.write("q2={}\n".format(read2))
            elif orientation == "outtie":
                soap_config_file.write("asm_flags=2\n")
                soap_config_file.write("pair_num_cutoff=5\n")
                soap_config_file.write("reverse_seq=1\n")
                soap_config_file.write("q1={}\n".format(read1))
                soap_config_file.write("q2={}\n".format(read2))
    os.makedirs(os.path.join(assemblyDirectory, "runSOAP"))
    os.chdir("runSOAP")
    # TODO : lots of missing options
    command = [programBIN, "all", "-s",
               "{}".format(os.path.join(assemblyDirectory,
                                        "configuration.txt")),
               "-K", "{}".format(kmer), "-L", "500", "-o", "soapAssembly",
               threads[0], threads[1]]
    common.print_command(command)
    returnValue = 0
    if not common.check_dryrun(sample_config):
        # logs live next to the configuration file, as before
        with open(os.path.join(assemblyDirectory, "soap.stdOut"),
                  "w") as assembler_stdOut, \
                open(os.path.join(assemblyDirectory, "soap.stdErr"),
                     "w") as assembler_stdErr:
            # BUGFIX: the exit status used to be discarded, so failed
            # SOAPdenovo runs were silently treated as successful
            returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                          stderr=assembler_stdErr)
    else:
        os.chdir("..")
        os.chdir("..")
        return sample_config
    os.chdir("..")
    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if os.path.exists(os.path.join("runSOAP", "soapAssembly.scafSeq")):
            subprocess.call(["cp",
                             os.path.join("runSOAP", "soapAssembly.scafSeq"),
                             "{}.scf.fasta".format(outputName)])
            subprocess.call(["cp",
                             os.path.join("runSOAP", "soapAssembly.contig"),
                             "{}.ctg.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runSOAP"])
        else:
            print(
                "something wrong with SOAPdenovo -> no contig file generated")
    else:
        print("SOAPdenovo terminated with an error. Please check running",
              "folder for more informations")
    os.chdir("..")
    return sample_config
def _run_masurca(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample libraries with MaSuRCA.

    Generates the MaSuRCA configuration file (DATA and PARAMETERS
    sections), lets MaSuRCA produce assemble.sh, executes it inside
    <cwd>/masurca/runMASURCA and, on success, copies the gap-closed
    contigs/scaffolds to <output>.ctg.fasta / <output>.scf.fasta.

    :param global_config: tool paths/options under global_config["Tools"]
    :param sample_config: reads "output", "genomeSize", "threads", "flags"
    :param sorted_libraries_by_insert: ignored, recomputed from sample_config
    :returns: the (possibly updated) sample_config
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "masurca"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    ########### HERE STARTS THE SPECIFIC ASSEMBLER PART
    with open("configuration.txt", "w") as masurca_config_file:
        masurca_config_file.write("DATA\n")
        # BUGFIX: string.lowercase is Python 2 only; ascii_lowercase works
        # on both Python 2 and 3
        allTheLetters = string.ascii_lowercase
        libraryPE = "p"
        libraryPEnum = 0
        libraryMP = "m"
        libraryMPnum = 0
        # TODO: single ended reads
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            read2 = libraryInfo["pair2"]
            orientation = libraryInfo["orientation"]
            insert = libraryInfo["insert"]
            std = libraryInfo["std"]
            if orientation == "innie":
                if read2 is not None:
                    configurationLine = "PE = {}{} {} {} {} {}".format(
                        libraryPE, allTheLetters[libraryPEnum], insert, std,
                        read1, read2)
                    masurca_config_file.write("{}\n".format(configurationLine))
                    # TODO: check when more than 21 PE libraries are specified
                    libraryPEnum += 1
            elif orientation == "outtie":
                configurationLine = "JUMP = {}{} {} {} {} {}".format(
                    libraryMP, allTheLetters[libraryMPnum], insert, std,
                    read1, read2)
                masurca_config_file.write("{}\n".format(configurationLine))
                # TODO: check when more than 21 MP libraries are specified
                libraryMPnum += 1
        masurca_config_file.write("END\n")
        masurca_config_file.write("\n")
        masurca_config_file.write("PARAMETERS\n")
        # k-mer size for the deBruijn graph: values between 25 and 101 are
        # supported; "auto" computes the optimal size from read data and GC
        masurca_config_file.write("GRAPH_KMER_SIZE=auto\n")
        # 1 for Illumina-only assemblies, 0 with 2x or more long
        # (Sanger, 454) reads
        masurca_config_file.write("USE_LINKING_MATES=1\n")
        # limit jumping-library coverage; see the MaSuRCA manual for the
        # genome-length based settings
        if sample_config["genomeSize"] > 10000000:
            masurca_config_file.write("LIMIT_JUMP_COVERAGE = 1000\n")
        else:
            masurca_config_file.write("LIMIT_JUMP_COVERAGE = 60\n")
        # additional Celera Assembler parameters; processors and batch
        # sizes are computed automatically.  For mammals do not set
        # cgwErrorRate above 0.15
        if sample_config["genomeSize"] > 1500000000:
            masurca_config_file.write("CA_PARAMETERS = ovlMerSize=30 "
                                      "cgwErrorRate=0.15 ovlMemory=4GB\n")
        else:
            masurca_config_file.write("CA_PARAMETERS = ovlMerSize=30 "
                                      "cgwErrorRate=0.25 ovlMemory=4GB\n")
        threads = sample_config.get("threads", 8)  # default for UPPMAX
        masurca_config_file.write("NUM_THREADS= {}\n".format(threads))
        # mandatory jellyfish hash size, about 10x the genome size
        JF_SIZE = sample_config["genomeSize"] * 11
        masurca_config_file.write("JF_SIZE={}\n".format(JF_SIZE))
        # 1 trims long 3' homopolymer runs (use for high GC genomes)
        masurca_config_file.write("DO_HOMOPOLYMER_TRIM=0\n")
        masurca_config_file.write("END\n")
        masurca_config_file.write("\n")
    if common.check_dryrun(sample_config):
        os.chdir("..")
        return sample_config
    # keep the logs in the masurca folder itself, next to configuration.txt
    with open(os.path.join(assemblyDirectory, "masurca.stdOut"),
              "w") as masurca_stdOut, \
            open(os.path.join(assemblyDirectory, "masurca.stdErr"),
                 "w") as masurca_stdErr:
        os.mkdir("runMASURCA")
        os.chdir("runMASURCA")
        command = [os.path.join(programBIN, "bin/masurca"),
                   "../configuration.txt"]
        common.print_command(command)
        subprocess.call(command, stdout=masurca_stdOut, stderr=masurca_stdErr)
        if not os.path.exists("assemble.sh"):
            print("MaSuRCA: assemble.sh not created. Unknown failure")
            # BUGFIX: this early return used to leave the process two
            # directories deep inside masurca/runMASURCA
            os.chdir(currentDirectory)
            return sample_config
        command = ["./assemble.sh"]
        common.print_command(command)
        returnValue = subprocess.call(command, stdout=masurca_stdOut,
                                      stderr=masurca_stdErr)
        os.chdir("..")
    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if os.path.exists(os.path.join(
                "runMASURCA", "CA/10-gapclose/genome.scf.fasta")):
            subprocess.call(["cp",
                             os.path.join("runMASURCA",
                                          "CA/10-gapclose/genome.ctg.fasta"),
                             "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp",
                             os.path.join("runMASURCA",
                                          "CA/10-gapclose/genome.scf.fasta"),
                             "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runMASURCA"])
        else:
            print("something wrong with MaSuRCA -> no contig file generated")
    else:
        # BUGFIX: the error path used to return without restoring the
        # working directory
        print("MaSuRCA terminated with an error. Please check running folder",
              "for more informations")
    os.chdir("..")
    return sample_config
def _run_cabog(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample libraries with CABOG (Celera Assembler).

    Converts each library to a .frg file with fastqToCA, runs runCA on all
    of them and, on success, copies the 9-terminator contigs/scaffolds to
    <output>.ctg.fasta / <output>.scf.fasta.

    :param global_config: tool paths/options under global_config["Tools"]
    :param sample_config: reads "output", "flags"
    :param sorted_libraries_by_insert: ignored, recomputed from sample_config
    :returns: the (possibly updated) sample_config
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "cabog"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in cabog case there is no single executable, only a bin directory
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure(assembler, assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    ########### HERE STARTS THE SPECIFIC ASSEMBLER PART
    sys.path.insert(0, programBIN)
    libraries = 1
    for library, libraryInfo in sorted_libraries_by_insert:
        command_fastqToCA = os.path.join(programBIN, "fastqToCA")
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        command_fastqToCA += " -libraryname "
        command_fastqToCA += " {}_{}".format(outputName, libraries)
        command_fastqToCA += " -insertsize "
        command_fastqToCA += " {} {} ".format(insert, std)
        command_fastqToCA += " -technology "
        command_fastqToCA += " illumina "
        command_fastqToCA += " -type "
        command_fastqToCA += " illumina "
        if orientation == "innie" or orientation == "none":
            command_fastqToCA += " -innie "
            if read2 is None:
                command_fastqToCA += " -reads "
                command_fastqToCA += " {} ".format(read1)
            else:
                command_fastqToCA += " -mates "
                command_fastqToCA += " {},{} ".format(read1, read2)
        elif orientation == "outtie":
            command_fastqToCA += " -outtie "
            command_fastqToCA += " -mates "
            command_fastqToCA += " {},{} ".format(read1, read2)
        command_fastqToCA += " > "
        command_fastqToCA += " {}_{}.frg ".format(outputName, libraries)
        common.print_command(command_fastqToCA)
        if not common.check_dryrun(sample_config):
            # stdout is redirected by the shell into the .frg file, so only
            # stderr is captured here
            with open("cabogfastqToCA.stdErr", "w") as cabog_stdErr:
                subprocess.call(command_fastqToCA, stderr=cabog_stdErr,
                                shell=True)
        libraries += 1
    command_runCA = os.path.join(programBIN, "runCA")
    command_runCA += " -d runCABOGfolder -p {} *frg".format(outputName)
    common.print_command(command_runCA)
    if common.check_dryrun(sample_config):
        # BUGFIX: restore the working directory on dry runs too
        os.chdir("..")
        return sample_config
    with open("cabog_runCA.stdOut", "w") as cabog_stdOut, \
            open("cabog_runCA.stdErr", "w") as cabog_stdErr:
        returnValue = subprocess.call(command_runCA, stdout=cabog_stdOut,
                                      stderr=cabog_stdErr, shell=True)
    flags = sample_config.get("flags", [])
    if returnValue == 0:
        # assembly succeeded, remove temporary files and save the assembly
        if os.path.exists(os.path.join(
                "runCABOGfolder", "9-terminator",
                "{}.ctg.fasta".format(outputName))):
            subprocess.call(["cp",
                             os.path.join("runCABOGfolder", "9-terminator",
                                          "{}.ctg.fasta".format(outputName)),
                             "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp",
                             os.path.join("runCABOGfolder", "9-terminator",
                                          "{}.scf.fasta".format(outputName)),
                             "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runCABOGfolder"])
        else:
            print("something wrong with CABOG -> no contig file generated")
    else:
        print("CABOG terminated with an error. Please check running folder",
              "for more informations")
    os.chdir("..")
    return sample_config
def _run_allpaths(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample libraries with ALLPATHS-LG.

    Writes in_groups.csv / in_libs.csv for the paired-end (innie) and mate
    pair (outtie) libraries, prepares the ALLPATHS inputs with
    PrepareAllPathsInputs.pl and runs RunAllPathsLG.  On success the final
    contigs/scaffolds are copied to <output>.ctg.fasta / <output>.scf.fasta.

    :param global_config: tool paths/options under global_config["Tools"]
    :param sample_config: reads "output", "flags"
    :param sorted_libraries_by_insert: ignored, recomputed from sample_config
    :returns: the (possibly updated) sample_config
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "allpaths"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure("allpaths", assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    inGroups_file = open("in_groups.csv", "w")
    inLibs_file = open("in_libs.csv", "w")
    inGroups_file.write("group_name, library_name, file_name\n")
    inLibs_file.write("library_name, project_name, organism_name, type, "
                      "paired, frag_size, frag_stddev, insert_size, "
                      "insert_stddev, read_orientation,genomic_start, "
                      "genomic_end\n")
    librariesForInLibs = []
    librariesForInLibsDict = {}
    group_name = 1
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        std = libraryInfo["std"]
        if orientation == "innie":
            path, fqfile = os.path.split(read1)
            # ALLPATHS wants a "?" wildcard covering both mates
            if "_1.fastq" in fqfile:
                fqfile = fqfile.replace("_1.fastq", "_?.fastq")
            elif "_R1_" in fqfile:
                fqfile = fqfile.replace("_R1_", "_R?_")
            else:
                print("error file format not supported {}".format(fqfile))
                # BUGFIX: restore the working directory before bailing out
                os.chdir("..")
                return sample_config
            inGroups_file.write("PE{}, lib{}, {}\n".format(
                group_name, insert, os.path.join(path, fqfile)))
            group_name += 1
            if insert not in librariesForInLibsDict:
                librariesForInLibsDict[insert] = insert
                librariesForInLibs.append(
                    "lib{}, genome, genome, fragment, 1, "
                    "{}, {}, , , inward, 0, 0\n".format(insert, insert, std))
        elif orientation == "outtie":
            path, fqfile = os.path.split(read1)
            if "_1.fastq" in fqfile:
                fqfile = fqfile.replace("_1.fastq", "_?.fastq")
            elif "_R1_" in fqfile:
                fqfile = fqfile.replace("_R1_", "_R?_")
            else:
                # BUGFIX: this message used to format the builtin "file"
                # instead of the offending file name
                print("error file format not supported {}".format(fqfile))
                os.chdir("..")
                return sample_config
            inGroups_file.write("MP{}, lib{}, {}\n".format(
                group_name, insert, os.path.join(path, fqfile)))
            group_name += 1
            if insert not in librariesForInLibsDict:
                librariesForInLibsDict[insert] = insert
                # NOTE(review): mate pairs are declared with type
                # "fragment" like the PE case -- confirm against the
                # ALLPATHS in_libs.csv specification
                librariesForInLibs.append(
                    "lib{}, genome, genome, fragment, 1, "
                    ", , {}, {}, outward, 0, 0\n".format(insert, insert, std))
        else:
            print("all paths support only innies and outties")
    inGroups_file.close()
    for lib in librariesForInLibs:
        inLibs_file.write(lib)
    inLibs_file.close()
    # NOW RUN ALLPATHS FOR REAL
    program = os.path.join(programBIN, "PrepareAllPathsInputs.pl")
    os.mkdir("data_dir")
    data_dir = os.path.join(assemblyDirectory, "data_dir")
    ploidy = "PLOIDY=1"
    if len(program_options) > 0:
        if len(program_options) > 1:
            print("Running ALlpaths only one parameter accepted as option",
                  "here: PLOIDY=2")
            # BUGFIX: restore the working directory before bailing out
            os.chdir("..")
            return sample_config
        if program_options[0] == "PLOIDY=2":
            ploidy = "PLOIDY=2"
        else:
            print("Running ALlpaths only one parameter accepted as option",
                  "here: PLOIDY=2")
            os.chdir("..")
            return sample_config
    command = [program, "DATA_DIR={}".format(data_dir), ploidy,
               "PICARD_TOOLS_DIR={}".format(
                   global_config["Tools"]["picard"]["bin"]),
               "FORCE_PHRED=True", "PHRED_64=False",
               "IN_GROUPS_CSV={}".format(
                   os.path.join(assemblyDirectory, "in_groups.csv")),
               "IN_LIBS_CSV={}".format(
                   os.path.join(assemblyDirectory, "in_libs.csv"))]
    if common.check_dryrun(sample_config):
        # only print both commands on a dry run
        common.print_command(command)
        program = os.path.join(programBIN, "RunAllPathsLG")
        command = [program, "PRE={}".format(assemblyDirectory),
                   "REFERENCE_NAME=.", "DATA_SUBDIR=data_dir",
                   "RUN=allpaths", "SUBDIR=run"]
        common.print_command(command)
        os.chdir("..")
        return sample_config
    common.print_command(command)
    with open("allpaths_PrepareAllPathsInputs.stdOut",
              "w") as assembler_stdOut, \
            open("allpaths_PrepareAllPathsInputs.stdErr",
                 "w") as assembler_stdErr:
        returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                      stderr=assembler_stdErr)
    flags = sample_config.get("flags", [])
    if returnValue == 0:
        program = os.path.join(programBIN, "RunAllPathsLG")
        command = [program, "PRE={}".format(assemblyDirectory),
                   "REFERENCE_NAME=.", "DATA_SUBDIR=data_dir",
                   "RUN=allpaths", "SUBDIR=run", "HAPLOIDIFY=True"]
        common.print_command(command)
        with open("allpaths_RunAllPathsLG.stdOut",
                  "w") as assembler_stdOut, \
                open("allpaths_RunAllPathsLG.stdErr",
                     "w") as assembler_stdErr:
            returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                          stderr=assembler_stdErr)
        if returnValue != 0:
            print("ALLPATHS RunAllPathsLG terminated with an error. Please",
                  "check running folder for more informations")
            os.chdir("..")
            return sample_config
        else:
            # save results
            assembly_dir = os.path.join("data_dir", "allpaths",
                                        "ASSEMBLIES", "run")
            if os.path.exists(os.path.join(assembly_dir,
                                           "final.assembly.fasta")):
                exit_code = subprocess.call(
                    ["cp", os.path.join(assembly_dir, "final.contigs.fasta"),
                     "{}.ctg.fasta".format(outputName)])
                exit_code += subprocess.call(
                    ["cp", os.path.join(assembly_dir, "final.assembly.fasta"),
                     "{}.scf.fasta".format(outputName)])
                if "keep_tmp_files" not in flags and exit_code == 0:
                    subprocess.call(["rm", "-r", "data_dir"])
            else:
                print("something wrong with Allpaths > no contig file "
                      "generated")
            os.chdir("..")
            return sample_config
    else:
        print("ALLPATHS PrepareAllPathInputs terminated with an error. "
              "Please check running folder for more informations")
        os.chdir("..")
        return sample_config
def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
    """Assemble the sample libraries with ABySS (abyss-pe).

    Groups the libraries into single-end (se=), paired-end (lib=) and mate
    pair (mp=) sets, builds one abyss-pe command line and runs it inside
    <cwd>/abyss/runABySS.  On success contigs/scaffolds are copied to
    <output>.ctg.fasta / <output>.scf.fasta.

    :param global_config: tool paths/options under global_config["Tools"]
    :param sample_config: reads "output", "threads", "kmer", "flags"
    :param sorted_libraries_by_insert: ignored, recomputed from sample_config
    :returns: the (possibly updated) sample_config
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "abyss"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    # in abyss case there is no single executable, only a bin directory
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure("abyss", assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config
    ########### HERE STARTS THE SPECIFIC ASSEMBLER PART
    program = os.path.join(programBIN, "abyss-pe")
    command = "{} ".format(program)
    threads = sample_config.get("threads", 8)  # default for UPPMAX
    command += "np={} ".format(threads)
    kmer = sample_config.get("kmer", 54)
    command += "k={} ".format(kmer)
    se_reads = []
    libraries = {}
    for library, libraryInfo in sorted_libraries_by_insert:
        read1 = libraryInfo["pair1"]
        read2 = libraryInfo["pair2"]
        orientation = libraryInfo["orientation"]
        insert = libraryInfo["insert"]
        if orientation == "innie" or orientation == "none":
            if read2 is None:
                se_reads.append(read1)
            else:
                libName = insert  # lib name is the insert size
                libraries.setdefault("lib", {}).setdefault(libName, "")
                libraries["lib"][libName] += "{} {} ".format(read1, read2)
        else:
            libName = format(insert)
            libraries.setdefault("mp", {}).setdefault(libName, "")
            libraries["mp"][libName] += "{} {} ".format(read1, read2)
    # now create the command
    command += "name={} ".format(outputName)
    librariesSE = ""
    librariesPE = ""
    librariesMP = ""
    if se_reads:
        # BUGFIX: multiple single-end files used to be concatenated without
        # a separating space, producing one unusable path
        librariesSE = "se='{}'".format(" ".join(se_reads))
    if "lib" in libraries:
        lib = "lib='"
        for libPE, libPEreads in sorted(libraries["lib"].items()):
            lib = lib + "lib{} ".format(libPE)
            librariesPE += " lib{}='{}' ".format(libPE, libPEreads)
        lib = lib + "' "
        command += "{} ".format(lib)
    if "mp" in libraries:
        mp = "mp='"
        for libMP, libMPreads in sorted(libraries["mp"].items()):
            mp = mp + "lib{} ".format(libMP)
            librariesMP += " lib{}='{}' ".format(libMP, libMPreads)
        mp = mp + "' "
        command += "{} ".format(mp)
    command += "{} ".format(librariesSE)
    command += "{} ".format(librariesPE)
    command += "{} ".format(librariesMP)
    common.print_command(command)
    if common.check_dryrun(sample_config):
        os.chdir("..")
        return sample_config
    os.makedirs(os.path.join(assemblyDirectory, "runABySS"))
    # open the logs only when the command really runs (they used to be
    # created, and leaked, on dry runs too)
    with open("abyss.stdOut", "a") as assembler_stdOut, \
            open("abyss.stdErr", "a") as assembler_stdErr:
        os.chdir("runABySS")
        returnValue = subprocess.call(command, stdout=assembler_stdOut,
                                      stderr=assembler_stdErr, shell=True)
        os.chdir("..")
    flags = sample_config.get("flags", [])
    if returnValue == 0:
        if os.path.exists(os.path.join(
                "runABySS", "{}-contigs.fa".format(outputName))):
            subprocess.call(["cp",
                             os.path.join("runABySS",
                                          "{}-contigs.fa".format(outputName)),
                             "{}.ctg.fasta".format(outputName)])
            subprocess.call(["cp",
                             os.path.join("runABySS",
                                          "{}-scaffolds.fa".format(
                                              outputName)),
                             "{}.scf.fasta".format(outputName)])
            if "keep_tmp_files" not in flags:
                subprocess.call(["rm", "-r", "runABySS"])
        else:
            print("something wrong with ABySS -> no contig file generated")
    else:
        print("ABySS terminated with an error. Please check running folder",
              "for more informations")
    # BUGFIX: every outcome now restores the original working directory
    # (the "no contig file" path used to return one level too deep)
    os.chdir("..")
    return sample_config
def _run_qc_report(global_config, sample_config, delivery_folder): """This function produces a pdf report and stores the important \ resutls in a single folder""" sorted_libraries_by_insert = common._sort_libraries_by_insert( sample_config) ### retrive all info needed to write the report sampleName = "sample" if "output" in sample_config: sampleName = sample_config["output"] projectName = "anonymous_project" if "projectName" in sample_config: projectName = sample_config["projectName"] currentDir = os.getcwd() workingDir = os.path.join(currentDir, sampleName) #create delivery dir for this sample sample_delivery_dir = os.path.join(delivery_folder, sampleName) if not os.path.exists(sample_delivery_dir): os.makedirs(sample_delivery_dir) reportDir = os.path.join(sample_delivery_dir, "report") if not os.path.exists(reportDir): os.makedirs(reportDir) PDFtitle = os.path.join(sample_delivery_dir, "report", "{}.pdf".format(sample_config["output"])) # this you cannot do in rLab which is why I wrote the helper initially TABLE_WIDTH = 540 class MyTheme(DefaultTheme): doc = { 'leftMargin': 25, 'rightMargin': 25, 'topMargin': 20, 'bottomMargin': 25, 'allowSplitting': False } # let's create the doc and specify title and author doc = pdf.Pdf('{} {}'.format(projectName, sampleName), 'NGI-Stockholm, Science for Life Laboratory') # now we apply our theme doc.set_theme(MyTheme) # give me some space doc.add_spacer() # this header defaults to H1 scriptDirectory = os.path.split(os.path.abspath(__file__))[0] logo_path = os.path.join(scriptDirectory, '../pictures/ngi_scilife.png') doc.add_image(logo_path, 540, 50, pdf.CENTER) # give me some space doc.add_spacer() doc.add_header('NGI-Stockholm -- Science For Life Laboratory') doc.add_header('Best-practice analysis for quality checking report') doc.add_header('{} -- {}'.format(projectName, sampleName)) # give me some space doc.add_spacer() doc.add_paragraph( "For sample {} belonging to the project {} " "NGI-Stockholm best-practice analysis 
for quality checking has " "been performed. For mate pair libraries produced with Nextera, " "best-practice analysis described at this address has been " "performed: http://res.illumina.com/documents/products/technotes/" "technote_nextera_matepair_data_processing.pdf".format( sampleName, projectName)) doc.add_spacer() tools = ["trimmomatic", "fastqc", "abyss", "align", "kmergenie"] if "tools" in sample_config and len(sample_config["tools"]) > 0: tools = sample_config["tools"] doc.add_paragraph("The following tools have been employed \ (tools are listed in order of execution):") bollet_list = [] for tool in tools: if tool != "align": program_path = global_config["Tools"][tool]["bin"] bollet_list.append("{} : {}".format(tool, program_path)) else: bollet_list.append("{} : {}".format(tool, \ global_config["Tools"]["bwa"]["bin"])) bollet_list.append("{} : {}".format(tool, \ global_config["Tools"]["samtools"]["bin"])) bollet_list.append("{} : {}".format(tool, \ global_config["Tools"]["picard"]["bin"])) doc.add_list(bollet_list) doc.add_spacer() doc.add_paragraph( "The results from each tool is reported in the " "following sections. Moreover you will find all the results and " "commands that have been run in the delivery folder on Uppmax") for tool in tools: doc.add_pagebreak() doc.add_header(tool.title(), pdf.H2) if tool == "trimmomatic": doc.add_paragraph( "Reads (both paired and mate pairs) can " "contain parts of the adapter sequence or, in the case of " "mate pairs, part of the linker sequence. 
Illumina " "recommends to remove the adapter before use of the reads " "in any downstream analysis (this is mandatory for mate " "pairs).") doc.add_paragraph("Adapter sequences removed are:") adapter_file = sample_config["adapters"] adapters = [] with open(adapter_file) as afile: lines = afile.readlines() for index in range(1, len(lines), 2): adapters.append(lines[index].rstrip()) doc.add_list(adapters) doc.add_spacer() trimmomatic_table_part1 = [[ sampleName, "#orig_pairs", "#survived_pairs" ]] # this is the header row trimmomatic_table_part2 = [[ sampleName, "#survived_fw_only", "#survived_rv_only", "#discarded" ]] total_orig_pairs = 0 total_survived_pairs = 0 total_survived_fw_only = 0 total_survived_rv_only = 0 total_discarded = 0 for library, libraryInfo in sorted_libraries_by_insert: runName = os.path.basename(libraryInfo["trimmomatic"]).split( "_1_trimmomatic.stdErr")[0] with open(libraryInfo["trimmomatic"]) as trimmomatic_output: lines = trimmomatic_output.readlines() result_line = lines[-2].rstrip() match_string = re.compile( "Input Read Pairs: (\d+) Both " "Surviving: (\d+) \(.+\) Forward Only Surviving: " "(\d+) \(.+\) Reverse Only Surviving: (\d+) \(.+\) " "Dropped: (\d+) \(.+\)") read_pairs = int(match_string.match(result_line).group(1)) survived_pairs = int( match_string.match(result_line).group(2)) survived_fw_only = int( match_string.match(result_line).group(3)) survived_rv_only = int( match_string.match(result_line).group(4)) discarded = int(match_string.match(result_line).group(5)) read_pairs_perc = "({0:.0f}%)".format( (float(survived_pairs) / read_pairs) * 100) survived_fw_only_perc = "({0:.0f}%)".format( (float(survived_fw_only) / read_pairs) * 100) survived_rv_only_perc = "({0:.0f}%)".format( (float(survived_rv_only) / read_pairs) * 100) survived_discarded_perc = "({0:.0f}%)".format( (float(discarded) / read_pairs) * 100) total_orig_pairs += read_pairs total_survived_pairs += survived_pairs total_survived_fw_only += survived_fw_only 
total_survived_rv_only += survived_rv_only total_discarded += discarded # these are the other rows trimmomatic_table_part1.append([ runName, read_pairs, "{} {}".format(survived_pairs, read_pairs_perc) ]) trimmomatic_table_part2.append([ runName, "{} {}".format(survived_fw_only, survived_fw_only_perc), "{} {}".format(survived_rv_only, survived_rv_only_perc), "{} {}".format(discarded, survived_discarded_perc) ]) survived_pairs_perc = "({0:.0f}%)".format( (float(total_survived_pairs) / total_orig_pairs) * 100) survived_survived_fw_only_perc = "({0:.0f}%)".format( (float(total_survived_fw_only) / total_orig_pairs) * 100) survived_survived_rv_only_perc = "({0:.0f}%)".format( (float(total_survived_rv_only) / total_orig_pairs) * 100) survived_discarded_perc = "({0:.0f}%)".format( (float(total_discarded) / total_orig_pairs) * 100) trimmomatic_table_part1.append([ "total", total_orig_pairs, "{} {}".format(total_survived_pairs, survived_pairs_perc) ]) # last row is the sum trimmomatic_table_part2.append([ "total", "{} {}".format(survived_fw_only, survived_fw_only_perc), "{} {}".format(survived_rv_only, survived_rv_only_perc), "{} {}".format(discarded, survived_discarded_perc) ]) doc.add_table(trimmomatic_table_part1, TABLE_WIDTH) doc.add_spacer() doc.add_table(trimmomatic_table_part2, TABLE_WIDTH) ##now save the trimmed reads trimmomaticDir = os.path.split(libraryInfo["trimmomatic"])[0] trimmomaticResultDir = os.path.join(sample_delivery_dir, "fastq_trimmed") if not os.path.exists(trimmomaticResultDir): os.makedirs(trimmomaticResultDir) filesToCopy = [os.path.join(trimmomaticDir, f) for f in \ os.listdir(trimmomaticDir) \ if (os.path.isfile(os.path.join(trimmomaticDir,f)) \ and re.search('.gz$',f))] for source in filesToCopy: dest = os.path.join(trimmomaticResultDir, os.path.split(source)[1]) if not os.path.isfile(dest): shutil.copyfile(source, dest) if tool == "fastqc" and "fastqc" in sample_config: fastqc_dir = sample_config["fastqc"] for fastqc_run in [dir for dir in 
os.listdir(fastqc_dir) \ if os.path.isdir(os.path.join(fastqc_dir, dir))]: fastqc_run_dir = os.path.join(fastqc_dir, fastqc_run, "Images") doc.add_image( os.path.join(fastqc_run_dir, "per_base_quality.png"), 400, 180, pdf.CENTER, "{} -- Per Base Quality".format(fastqc_run)) fastqc_run_dir = os.path.join(fastqc_dir, fastqc_run, "Images") doc.add_image( os.path.join(fastqc_run_dir, "sequence_length_distribution.png"), 400, 180, pdf.CENTER, "{} -- Sequence Length Distribution".format(fastqc_run)) #If I have not yet copied fastqc results do it fastqcResultDir = os.path.join(sample_delivery_dir, "fastqc") if not os.path.exists(fastqcResultDir): os.makedirs(fastqcResultDir) dirsToBeCopied = [os.path.join(fastqc_dir, f) for f in \ os.listdir(fastqc_dir) \ if os.path.isdir(os.path.join(fastqc_dir, f))] for source in dirsToBeCopied: dest = os.path.join(fastqcResultDir, os.path.split(source)[1]) if not os.path.exists(dest): shutil.copytree(source, dest) if tool == "abyss" and "abyss" in sample_config: doc.add_paragraph( "A possible way to assess the complexity of a " "library even in absence of a reference sequence is to " "look at the kmer profile of the reads. The idea is to " "count all the kmers (i.e., sequence of length k) that occur " "in the reads. In this way it is possible to know how many " "kmers occur 1,2,..., N times and represent this as a " "plot. This plot tell us for each x, how many k-mers " "(y-axis) are present in the dataset in exactly x-copies. " "In an ideal world (no errors in sequencing, no bias, no " "repeating regions) this plot should be as close as " "possible to a gaussian distribution. In reality we will " "always see a peak for x=1 (i.e., the errors) and another " "peak close to the expected coverage. 
If the genome is " "highly heterozygous a second peak at half of the coverage " "can be expected.") kmer_1_200 = os.path.join(sample_config["abyss"], "kmer_coverage.png") doc.add_image( kmer_1_200, 500, 300, pdf.CENTER, "kmer profile with k={}.".format(sample_config["kmer"])) #copy the results in resutls if not os.path.exists("kmer_analysis"): os.mkdir("kmer_analysis") kmerDir = sample_config["abyss"] filesToCopy = [os.path.join(kmerDir, f) for f in \ os.listdir(kmerDir) \ if (os.path.isfile(os.path.join(kmerDir,f)) \ and re.search('.png$',f))] filesToCopy.append(os.path.join(kmerDir, "histogram.hist")) abyssResultDir = os.path.join(sample_delivery_dir, "kmer_analysis") if not os.path.exists(abyssResultDir): os.makedirs(abyssResultDir) for source in filesToCopy: dest = os.path.join(abyssResultDir, os.path.split(source)[1]) if not os.path.exists(dest): shutil.copyfile(source, dest) if tool == "align" and "alignments" in sample_config: alignments = sample_config["alignments"][0] alignment_path = alignments[1] alignment_prefix = alignments[2] align_dir = os.path.split(alignment_path)[0] doc.add_header( "{} -- Collect Insert Size Metrics".format(sampleName), pdf.H3) with open(os.path.join(align_dir, "{}.collectInsertSize.txt".format(alignment_prefix))) \ as collectInsertSize: lines = collectInsertSize.readlines() line = lines[6].rstrip().split("\t") # this is the header row insertSize_table = [[line[7], line[6], line[4], line[5]]] line = lines[7].rstrip().split("\t") # this is the header row insertSize_table.append([line[7], line[6], line[4], line[5]]) line = lines[8].rstrip().split("\t") # this is the header row insertSize_table.append([line[7], line[6], line[4], line[5]]) line = lines[9].rstrip().split("\t") # this is the header row insertSize_table.append([line[7], line[6], line[4], line[5]]) doc.add_table(insertSize_table, TABLE_WIDTH) doc.add_spacer() full_path_to_pdf = os.path.join( align_dir, "{}.collectInsertSize.pdf".format(alignment_prefix)) 
doc.add_paragraph("Insert size plot can be found in the result \ directory: {}".format( os.path.join( "alignments", "{}.collectInsertSize.pdf".format(alignment_prefix)))) doc.add_spacer() doc.add_header("{} -- Duplicate Metrics".format(sampleName), pdf.H3) with open(os.path.join(align_dir, "{}.markDuplicates.txt".format(alignment_prefix))) as \ collectInsertSize: lines = collectInsertSize.readlines() line = lines[6].rstrip().split("\t") # this is the header row duplication_table_part1 = [line[0:3]] duplication_table_part2 = [line[4:6]] duplication_table_part3 = [line[7:9]] line = lines[7].rstrip().split("\t") duplication_table_part1.append(line[0:3]) duplication_table_part2.append(line[4:6]) duplication_table_part3.append(line[7:9]) doc.add_table(duplication_table_part1, TABLE_WIDTH) doc.add_spacer() doc.add_table(duplication_table_part2, TABLE_WIDTH) doc.add_spacer() doc.add_table(duplication_table_part3, TABLE_WIDTH) doc.add_spacer() full_path_to_bam = os.path.join( align_dir, "{}_noDup.bam".format(alignment_prefix)) doc.add_paragraph("Bam file with marked duplicate reads can be \ found at: {}".format( os.path.join("alignments", "{}_noDup.bam".format(alignment_prefix)))) doc.add_spacer() #copy the results in resutls if not os.path.exists("alignments"): os.mkdir("alignments") filesToCopy = [os.path.join(align_dir, f) for f in \ os.listdir(align_dir) \ if (os.path.isfile(os.path.join(align_dir,f)) \ and re.search('{}'.format(alignment_prefix),f))] alignmentResultDir = os.path.join(sample_delivery_dir, "alignments") if not os.path.exists(alignmentResultDir): os.makedirs(alignmentResultDir) for source in filesToCopy: dest = os.path.join(alignmentResultDir, os.path.split(source)[1]) if not os.path.exists(dest): shutil.copyfile(source, dest) if tool == "kmergenie" and "kmergenie" in sample_config: doc.add_paragraph( "Assemblers using a de Bruijn graph strategy " "for contig construction (such as Velvet, ABySS and " "SOAPdenovo) fractures the reads into k-sized 
substrings " "(k-mers). The k-mer size is vital for the performance of " "these assemblers, and is usually selected considering " "several trade-offs between the size and accuracy of the " "produced contigs. Some assemblers choose the k-mer size " "automatically or builds several assemblies (using " "different k-mers) and / or relies on user input. " "Kmergenie is a lightweight program that suggests a best " "k-mer size based on their relative abundance in the " "genomic reads.") kmerdir = sample_config["kmergenie"] doc.add_image(os.path.join(kmerdir, "histograms.dat.png"), 400, 300, pdf.CENTER, ("The plot should be roughly concave and have " "a clear global maximum, if not the predicted best " "k is likely to be inaccurate")) #copy everything to results kmergenieResultDir = os.path.join(sample_delivery_dir, "kmergenie") dest = kmergenieResultDir if not os.path.exists(dest): shutil.copytree(kmerdir, dest) doc.render(PDFtitle) # Copy the pipeline files and commands run to the report directory filesToCopy = glob.glob(currentDir + "/{}_QCcontrol.*".format(sampleName)) for cfile in filesToCopy: shutil.copyfile(cfile, os.path.join(reportDir, os.path.basename(cfile))) with open(os.path.join(reportDir, "commands.txt"), "w") as f: f.write(sample_config.get("commands", "")) os.chdir(currentDir)
def _allpaths_read_pattern(fastq_name):
    """Return *fastq_name* with the read number replaced by ALLPATHS' '?'
    wildcard (e.g. "x_1.fastq" -> "x_?.fastq", "x_R1_001..." -> "x_R?_001..."),
    or None when the naming scheme is not recognised."""
    if "_1.fastq" in fastq_name:
        return fastq_name.replace("_1.fastq", "_?.fastq")
    if "_R1_" in fastq_name:
        return fastq_name.replace("_R1_", "_R?_")
    return None


def _run_allpaths(global_config, sample_config, sorted_libraries_by_insert):
    """Run the ALLPATHS-LG assembler on the configured libraries.

    Writes the in_groups.csv / in_libs.csv library descriptions, runs
    PrepareAllPathsInputs.pl and then RunAllPathsLG, and copies the
    resulting contig/scaffold fasta files into the working directory as
    <output>.ctg.fasta / <output>.scf.fasta.

    Returns sample_config (unmodified); on any error a message is printed
    and the function returns early after restoring the working directory.
    """
    ########## ACQUIRE ALL THE INFO AND CREATE THE ASSEMBLY FOLDER
    assembler = "allpaths"
    outputName = sample_config["output"]
    currentDirectory = os.getcwd()
    assemblyDirectory = os.path.join(currentDirectory, assembler)
    programBIN = global_config["Tools"][assembler]["bin"]
    program_options = global_config["Tools"][assembler]["options"]
    sorted_libraries_by_insert = common._sort_libraries_by_insert(
        sample_config)
    if _prepare_folder_structure("allpaths", assemblyDirectory) == 0:
        os.chdir(assemblyDirectory)
    else:
        return sample_config

    # Build the two ALLPATHS csv input files describing the libraries.
    # "with" guarantees both handles are closed even on the early error
    # returns below (the original code leaked them on those paths).
    with open("in_groups.csv", "w") as inGroups_file, \
            open("in_libs.csv", "w") as inLibs_file:
        inGroups_file.write("group_name, library_name, file_name\n")
        inLibs_file.write(
            "library_name, project_name, organism_name, type, "
            "paired, frag_size, frag_stddev, insert_size, insert_stddev, "
            "read_orientation,genomic_start, genomic_end\n")
        librariesForInLibs = []
        librariesForInLibsDict = {}
        group_name = 1
        for library, libraryInfo in sorted_libraries_by_insert:
            read1 = libraryInfo["pair1"]
            orientation = libraryInfo["orientation"]
            insert = libraryInfo["insert"]
            std = libraryInfo["std"]
            if orientation not in ("innie", "outtie"):
                print("all paths support only innies and outties")
                continue
            path, fqfile = os.path.split(read1)
            pattern = _allpaths_read_pattern(fqfile)
            if pattern is None:
                # BUGFIX: the original "outtie" branch formatted the
                # builtin `file` instead of the offending file name.
                print("error file format not supported {}".format(fqfile))
                os.chdir("..")  # BUGFIX: restore cwd like other error paths
                return sample_config
            if orientation == "innie":
                inGroups_file.write("PE{}, lib{}, {}\n".format(
                    group_name, insert, os.path.join(path, pattern)))
                in_libs_row = (
                    "lib{}, genome, genome, fragment, 1, "
                    "{}, {}, , , inward, 0, 0\n".format(insert, insert, std))
            else:  # outtie (mate pair)
                inGroups_file.write("MP{}, lib{}, {}\n".format(
                    group_name, insert, os.path.join(path, pattern)))
                in_libs_row = (
                    "lib{}, genome, genome, fragment, 1, "
                    ", , {}, {}, outward, 0, 0\n".format(insert, insert, std))
            group_name += 1
            if insert not in librariesForInLibsDict:
                # one in_libs entry per distinct insert size
                librariesForInLibsDict[insert] = insert
                librariesForInLibs.append(in_libs_row)
        for lib in librariesForInLibs:
            inLibs_file.write(lib)

    #NOW RUN ALLPATHS FOR REAL
    program = os.path.join(programBIN, "PrepareAllPathsInputs.pl")
    os.mkdir("data_dir")
    data_dir = os.path.join(assemblyDirectory, "data_dir")
    ploidy = "PLOIDY=1"
    if len(program_options) > 0:
        # The only supported option is overriding the default ploidy.
        if len(program_options) > 1 or program_options[0] != "PLOIDY=2":
            print("Running ALlpaths only one parameter accepted as option",
                  "here: PLOIDY=2")
            os.chdir("..")  # BUGFIX: restore cwd like other error paths
            return sample_config
        ploidy = "PLOIDY=2"
    command = [
        program, "DATA_DIR={}".format(data_dir), ploidy,
        "PICARD_TOOLS_DIR={}".format(global_config["Tools"]["picard"]["bin"]),
        "FORCE_PHRED=True", "PHRED_64=False",
        "IN_GROUPS_CSV={}".format(
            os.path.join(assemblyDirectory, "in_groups.csv")),
        "IN_LIBS_CSV={}".format(
            os.path.join(assemblyDirectory, "in_libs.csv"))]

    if common.check_dryrun(sample_config):
        # Dry run: print both commands and bail out without executing.
        common.print_command(command)
        program = os.path.join(programBIN, "RunAllPathsLG")
        command = [program, "PRE={}".format(assemblyDirectory),
                   "REFERENCE_NAME=.", "DATA_SUBDIR=data_dir",
                   "RUN=allpaths", "SUBDIR=run"]
        common.print_command(command)
        os.chdir("..")
        return sample_config

    common.print_command(command)
    with open("allpaths_PrepareAllPathsInputs.stdOut", "w") as stdOut, \
            open("allpaths_PrepareAllPathsInputs.stdErr", "w") as stdErr:
        returnValue = subprocess.call(command, stdout=stdOut, stderr=stdErr)
    if returnValue != 0:
        print("ALLPATHS PrepareAllPathInputs terminated with an error. "
              "Please check running folder for more informations")
        os.chdir("..")
        return sample_config

    flags = sample_config.get("flags", [])
    program = os.path.join(programBIN, "RunAllPathsLG")
    command = [program, "PRE={}".format(assemblyDirectory),
               "REFERENCE_NAME=.", "DATA_SUBDIR=data_dir", "RUN=allpaths",
               "SUBDIR=run", "HAPLOIDIFY=True"]
    common.print_command(command)
    # BUGFIX: these two handles were never closed in the original.
    with open("allpaths_RunAllPathsLG.stdOut", "w") as stdOut, \
            open("allpaths_RunAllPathsLG.stdErr", "w") as stdErr:
        returnValue = subprocess.call(command, stdout=stdOut, stderr=stdErr)
    if returnValue != 0:
        print("ALLPATHS RunAllPathsLG terminated with an error. Please",
              "check running folder for more informations")
        os.chdir("..")
        return sample_config

    # save results
    assembly_dir = os.path.join("data_dir", "allpaths", "ASSEMBLIES", "run")
    if not os.path.exists(os.path.join(assembly_dir, "final.assembly.fasta")):
        print("something wrong with Allpaths > no contig file generated")
        os.chdir("..")
        return sample_config
    exit_code = subprocess.call([
        "cp", os.path.join(assembly_dir, "final.contigs.fasta"),
        "{}.ctg.fasta".format(outputName)])
    exit_code += subprocess.call([
        "cp", os.path.join(assembly_dir, "final.assembly.fasta"),
        "{}.scf.fasta".format(outputName)])
    if "keep_tmp_files" not in flags and exit_code == 0:
        # drop the bulky intermediate data unless the user asked to keep it
        subprocess.call(["rm", "-r", "data_dir"])
    os.chdir("..")
    return sample_config